Update search

This commit is contained in:
Parsa Nazer
2026-05-30 09:06:25 +03:30
parent e6965fe3b8
commit 6f2037309c
2 changed files with 288 additions and 19 deletions
@@ -0,0 +1,102 @@
"""
Add an IMMUTABLE ``normalize_persian(text)`` SQL function and GIN trigram
indexes that match it. Lets the search view filter and rank without doing a
per-row ``translate()`` on a sequential scan — drops query time from seconds
to tens of milliseconds.
The FROM/TO strings here MUST stay aligned with ``_SQL_NORM_FROM`` /
``_SQL_NORM_TO`` in ``product/views.py``. If you change one, change the other
and add a follow-up migration that recreates the function + indexes (Postgres
matches expression indexes by exact SQL form, so a stale function would
silently bypass the indexes).
"""
from django.db import migrations
# Mirror of product.views._SQL_NORM_FROM / _SQL_NORM_TO.
#
# PostgreSQL ``translate(t, FROM, TO)`` replaces the character at position i
# of FROM with the character at position i of TO, and deletes every FROM
# character that has no counterpart in TO. The two strings therefore have to
# line up exactly, group by group.
#
# Invisible characters (zero-width joiners, direction marks), combining marks
# and the replacement spaces are spelled as escapes on purpose: a literal
# zero-width character or a run of spaces is easily lost by editors, diff
# viewers or whitespace clean-up, and losing a single one shifts every mapping
# that follows it (digits would map to the wrong digit).
_SQL_NORM_FROM = (
    'يك'  # Arabic ya/kaf -> Persian
    'ﻱﻲﻳﻴ'  # Arabic ya presentation forms
    'ﻙﻚﻛﻜ'  # Arabic kaf presentation forms
    'آأإٱ'  # alef variants
    'ؤ'  # waw with hamza
    'ئ'  # ya with hamza
    'ةۀ'  # ta marbuta / he with hamza
    'ﻩﻪﻫﻬ'  # he presentation forms
    '\u200c\u200d'  # ZWNJ, ZWJ -> space
    '۰۱۲۳۴۵۶۷۸۹'  # Persian digits
    '٠١٢٣٤٥٦٧٨٩'  # Arabic-Indic digits
    # Deletions (no matching char in TO):
    '\u0640'  # tatweel
    '\u200e\u200f'  # LRM, RLM
    '\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'  # tashkeel
)
_SQL_NORM_TO = (
    'یک'
    'یییی'
    'کککک'
    'اااا'
    'و'
    'ی'
    'هه'
    'هههه'
    '\x20\x20'  # one space each for ZWNJ and ZWJ
    '0123456789'
    '0123456789'
)
def _pg_str(s: str) -> str:
    """Quote a Python string as a PostgreSQL string literal.

    Embedded single quotes are doubled and the result is wrapped in single
    quotes. Backslashes are passed through unchanged, which is only correct
    for standard-conforming string literals (no ``E''`` prefix is emitted).

    Args:
        s: Text to embed in a SQL statement.

    Returns:
        The quoted literal, including the surrounding single quotes.
    """
    escaped = s.replace("'", "''")
    return f"'{escaped}'"
# Forward SQL: (re)creates normalize_persian(text) as
# lower(translate(t, FROM, TO)), with the FROM/TO tables defined above inlined
# as quoted literals via _pg_str. The function is declared IMMUTABLE (the
# module docstring relies on this for the expression indexes) and STRICT.
CREATE_FUNCTION_SQL = f"""
CREATE OR REPLACE FUNCTION normalize_persian(t text) RETURNS text AS $$
SELECT lower(translate(t, {_pg_str(_SQL_NORM_FROM)}, {_pg_str(_SQL_NORM_TO)}));
$$ LANGUAGE SQL IMMUTABLE STRICT PARALLEL SAFE;
"""
# Reverse SQL: removes the function; IF EXISTS keeps the rollback re-runnable.
DROP_FUNCTION_SQL = "DROP FUNCTION IF EXISTS normalize_persian(text);"
class Migration(migrations.Migration):
    """Create ``normalize_persian`` and the GIN trigram indexes built on it.

    Operations run top to bottom when applying, so the function exists before
    any index expression references it. Every statement uses
    ``IF [NOT] EXISTS`` / ``OR REPLACE`` so both directions can be re-run.
    """

    dependencies = [
        ("product", "0075_productvariant_guarantee"),
    ]

    operations = [
        migrations.RunSQL(
            sql=CREATE_FUNCTION_SQL,
            reverse_sql=DROP_FUNCTION_SQL,
        ),
        # GIN trigram indexes on the normalized expression. PostgreSQL matches
        # queries that use exactly ``normalize_persian(<col>)`` against these
        # indexes, so the views.py wrapper must call the SQL function (not
        # inline translate/lower) for the index to be used.
        #
        # NOTE(review): ``gin_trgm_ops`` needs the pg_trgm extension; this
        # migration does not create it — presumably an earlier one does,
        # confirm before running on a fresh database.
        migrations.RunSQL(
            sql=(
                "CREATE INDEX IF NOT EXISTS product_norm_name_trgm_idx "
                "ON product_productmodel "
                "USING gin (normalize_persian(name) gin_trgm_ops);"
            ),
            reverse_sql="DROP INDEX IF EXISTS product_norm_name_trgm_idx;",
        ),
        migrations.RunSQL(
            sql=(
                "CREATE INDEX IF NOT EXISTS product_norm_keywords_trgm_idx "
                "ON product_productmodel "
                "USING gin (normalize_persian(meta_keywords) gin_trgm_ops);"
            ),
            reverse_sql="DROP INDEX IF EXISTS product_norm_keywords_trgm_idx;",
        ),
        migrations.RunSQL(
            sql=(
                "CREATE INDEX IF NOT EXISTS subcategory_norm_name_trgm_idx "
                "ON product_subcategorymodel "
                "USING gin (normalize_persian(name) gin_trgm_ops);"
            ),
            reverse_sql="DROP INDEX IF EXISTS subcategory_norm_name_trgm_idx;",
        ),
    ]
+186 -19
View File
@@ -1,3 +1,4 @@
import re
from .models import ProductModel
from rest_framework import serializers
from django.core.paginator import Paginator
@@ -6,8 +7,8 @@ from .models import *
from .serializers import *
from rest_framework import status
from rest_framework.response import Response
from django.db.models import Q, Value
from django.db.models.functions import Coalesce
from django.db.models import Q, Value, Case, When, FloatField, F, CharField, Func
from django.db.models.functions import Coalesce, Length
from django.contrib.postgres.search import TrigramSimilarity
from django.shortcuts import get_object_or_404
from rest_framework.permissions import IsAuthenticatedOrReadOnly
@@ -21,6 +22,179 @@ from home.models import ShowCaseSlider
from home.serializers import ShowCaseSliderSerialzier
from order.models import Cart, CartItem
from django.db.models import Min, Max, Value
# Translation table used to normalize user-entered search text in Python.
# Presentation forms, zero-width characters, direction marks and combining
# marks are written as ``\uXXXX`` escapes: they are invisible or ambiguous in
# most editors, and a key that silently turns into an empty string makes
# ``str.maketrans`` raise ValueError (keys must be exactly one character).
_PERSIAN_CHAR_MAP = str.maketrans({
    # Arabic letters -> Persian equivalents
    'ي': 'ی', 'ك': 'ک',
    # Arabic ya presentation forms (isolated, final, initial, medial) -> Persian ya
    '\ufef1': 'ی', '\ufef2': 'ی', '\ufef3': 'ی', '\ufef4': 'ی',
    # Arabic kaf presentation forms (isolated, final, initial, medial) -> Persian kaf
    '\ufed9': 'ک', '\ufeda': 'ک', '\ufedb': 'ک', '\ufedc': 'ک',
    # Alef variants -> bare alef (so "ایفون" matches "آیفون")
    'آ': 'ا', 'أ': 'ا', 'إ': 'ا', 'ٱ': 'ا',
    # Hamza on waw/ya -> bare letter
    'ؤ': 'و',
    'ئ': 'ی',
    # Ta marbuta / he variants -> he
    'ة': 'ه', 'ۀ': 'ه',
    # He presentation forms (isolated, final, initial, medial) -> he
    '\ufee9': 'ه', '\ufeea': 'ه', '\ufeeb': 'ه', '\ufeec': 'ه',
    # Tatweel - drop
    '\u0640': '',
    # Tashkeel (diacritics) - drop:
    # fathatan, dammatan, kasratan, fatha, damma, kasra, shadda, sukun
    '\u064b': '', '\u064c': '', '\u064d': '', '\u064e': '',
    '\u064f': '', '\u0650': '', '\u0651': '', '\u0652': '',
    # Zero-width joiners (ZWNJ, ZWJ) -> space, so joined words split into tokens
    '\u200c': ' ', '\u200d': ' ',
    # Direction marks (LRM, RLM) - drop
    '\u200e': '', '\u200f': '',
    # Arabic-Indic / Persian digits -> ASCII
    '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
    '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9',
    '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
    '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
})
def _normalize_search_text(text):
    """Normalize a search string to handle Persian/Arabic variants, ZWNJ, and case.

    Steps, in order: map characters through ``_PERSIAN_CHAR_MAP``, collapse
    every run of whitespace to a single space, strip both ends, lowercase.

    Args:
        text: Raw search input; ``None`` and ``''`` are both accepted.

    Returns:
        The normalized string, or ``''`` when ``text`` is falsy.
    """
    if not text:
        return ''
    mapped = text.translate(_PERSIAN_CHAR_MAP)
    # Collapse after translating: the map itself can introduce spaces.
    collapsed = re.sub(r'\s+', ' ', mapped)
    return collapsed.strip().lower()
# SQL-side equivalent of _PERSIAN_CHAR_MAP for PostgreSQL translate().
# Each char at position i in FROM is replaced by char at position i in TO;
# chars past len(TO) are deleted entirely. This must mirror the Python map so
# stored values and query strings normalize to the same form, and it must stay
# identical to the copy in the migration that creates normalize_persian().
#
# Invisible characters (zero-width joiners, direction marks), combining marks
# and the replacement spaces are spelled as escapes on purpose: a literal
# zero-width character or a run of spaces is easily lost by editors, diff
# viewers or whitespace clean-up, and losing a single one shifts every mapping
# that follows it (digits would map to the wrong digit).
_SQL_NORM_FROM = (
    'يك'  # Arabic ya/kaf -> Persian
    'ﻱﻲﻳﻴ'  # Arabic ya presentation forms
    'ﻙﻚﻛﻜ'  # Arabic kaf presentation forms
    'آأإٱ'  # alef variants
    'ؤ'  # waw with hamza
    'ئ'  # ya with hamza
    'ةۀ'  # ta marbuta / he with hamza
    'ﻩﻪﻫﻬ'  # he presentation forms
    '\u200c\u200d'  # ZWNJ, ZWJ -> space
    '۰۱۲۳۴۵۶۷۸۹'  # Persian digits
    '٠١٢٣٤٥٦٧٨٩'  # Arabic-Indic digits
    # Deletions (no matching char in TO):
    '\u0640'  # tatweel
    '\u200e\u200f'  # LRM, RLM
    '\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'  # tashkeel
)
_SQL_NORM_TO = (
    'یک'
    'یییی'
    'کککک'
    'اااا'
    'و'
    'ی'
    'هه'
    'هههه'
    '\x20\x20'  # one space each for ZWNJ and ZWJ
    '0123456789'
    '0123456789'
)
def NormalizePersian(expression):
    """Build a SQL expression calling the ``normalize_persian(text)`` Postgres function.

    The function (defined in migration 0076) computes
    ``lower(translate(t, FROM, TO))`` and is marked IMMUTABLE so GIN trigram
    indexes on ``normalize_persian(name)`` etc. can be matched by the planner.
    Calling the function (instead of inlining translate/lower) is what lets
    queries use those indexes — otherwise every search is a full sequential
    scan.

    Args:
        expression: A field name or ORM expression to normalize.

    Returns:
        A ``Func`` expression with a ``CharField`` output, usable in
        ``annotate()`` / ``filter()``.
    """
    return Func(
        expression,
        function='normalize_persian',
        output_field=CharField(),
    )
def _apply_product_search(queryset, search_query):
    """Filter and rank a Product queryset by a (possibly Persian) search query.

    Args:
        queryset: Product queryset to search. Returned untouched when the
            query normalizes to an empty string.
        search_query: Raw user input; may be ``None`` or empty.

    Returns:
        ``(queryset, normalized_query)``. When ``normalized_query`` is
        non-empty the queryset is annotated with ``norm_name`` and
        ``similarity`` (among others) so callers can
        ``order_by('-similarity', ...)``. When no product strictly matches,
        falls back to a looser similarity-based filter so the user sees
        suggestions instead of an empty page.
    """
    normalized_query = _normalize_search_text(search_query) if search_query else ''
    if not normalized_query:
        return queryset, ''

    # Single-character tokens are ignored for the AND filter below.
    tokens = [t for t in normalized_query.split(' ') if len(t) >= 2]

    annotated = queryset.annotate(
        norm_name=NormalizePersian('name'),
        # NOTE(review): Coalesce is applied *inside* the function call, so the
        # generated SQL is normalize_persian(COALESCE(col, '')), which is not
        # the normalize_persian(col) expression the migration indexes —
        # confirm with EXPLAIN whether the keyword/category indexes are used.
        norm_keywords=NormalizePersian(Coalesce('meta_keywords', Value(''))),
        norm_category=NormalizePersian(Coalesce('category__name', Value(''))),
        norm_desc=NormalizePersian(Coalesce('description', Value(''))),
    ).annotate(
        name_sim=TrigramSimilarity(F('norm_name'), normalized_query),
        keywords_sim=TrigramSimilarity(F('norm_keywords'), normalized_query),
        category_sim=TrigramSimilarity(F('norm_category'), normalized_query),
        desc_sim=TrigramSimilarity(F('norm_desc'), normalized_query),
    ).annotate(
        # Word-boundary aware bonuses. The space-padded variants are what make
        # "چای" rank above "چایساز" — the former matches "چای " (word boundary)
        # while the latter only matches the glued prefix.
        #
        # Uses case-sensitive lookups (__contains, not __icontains) because both
        # sides are already lowercased: __icontains would wrap the expression in
        # UPPER(...) and break the GIN trigram index match.
        #
        # Case picks the first matching When, so branches go from most to
        # least specific.
        match_bonus=Case(
            When(norm_name__exact=normalized_query, then=Value(10.0)),
            When(norm_name__startswith=normalized_query + ' ', then=Value(6.0)),
            When(norm_name__startswith=normalized_query, then=Value(3.5)),
            When(norm_name__contains=' ' + normalized_query + ' ', then=Value(3.0)),
            When(norm_name__contains=' ' + normalized_query, then=Value(2.5)),
            When(norm_name__contains=normalized_query + ' ', then=Value(2.5)),
            When(norm_name__contains=normalized_query, then=Value(1.5)),
            default=Value(0.0),
            output_field=FloatField(),
        )
    ).annotate(
        # Final ranking score: exact/prefix bonus plus weighted per-field
        # trigram similarities (name counts most, description least).
        similarity=(
            F('match_bonus')
            + F('name_sim') * Value(2.0)
            + F('keywords_sim') * Value(0.8)
            + F('category_sim') * Value(0.4)
            + F('desc_sim') * Value(0.15)
        )
    )

    if tokens:
        # Token AND filter: every token must appear in name, keywords or
        # category name. Description and slug are deliberately left out to
        # keep the filter on the fields the trigram indexes were created for;
        # description still contributes via ``desc_sim`` to ranking on the
        # already-narrowed result set.
        #
        # NOTE(review): the token filter is ORed with ``*_sim__gte``
        # conditions below; a comparison on similarity() is presumably not
        # index-assisted, so the planner may still scan — confirm with EXPLAIN.
        token_filter = Q()
        for token in tokens:
            token_filter &= (
                Q(norm_name__contains=token)
                | Q(norm_keywords__contains=token)
                | Q(norm_category__contains=token)
            )
        strict_filter = (
            token_filter
            | Q(name_sim__gte=0.45)
            | Q(keywords_sim__gte=0.5)
        )
    else:
        # Only very short tokens (e.g. a single character): rely on
        # similarity thresholds alone.
        strict_filter = Q(name_sim__gte=0.4) | Q(keywords_sim__gte=0.4)

    strict_products = annotated.filter(strict_filter).distinct()
    if strict_products.exists():
        return strict_products, normalized_query

    # No strict matches — relax thresholds so the user gets "similar"
    # suggestions instead of an empty result page.
    loose_filter = (
        Q(name_sim__gte=0.18)
        | Q(keywords_sim__gte=0.22)
        | Q(category_sim__gte=0.3)
        | Q(match_bonus__gt=0)
    )
    return annotated.filter(loose_filter).distinct(), normalized_query
# class APIView(APIView):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
@@ -324,18 +498,9 @@ class AllProductsView(APIView):
status=status.HTTP_400_BAD_REQUEST
)
# Search
# Search (Persian-aware, with typo tolerance + similar-results fallback)
search_query = request.query_params.get('search')
if search_query:
products = products.annotate(
similarity=(
TrigramSimilarity('name', search_query) +
TrigramSimilarity(
Coalesce('description', Value('')),
search_query
)
)
).filter(similarity__gt=0.1)
products, normalized_query = _apply_product_search(products, search_query)
# Price annotation (IMPORTANT for sorting)
products = products.annotate(
@@ -376,8 +541,10 @@ class AllProductsView(APIView):
elif sort_by in ['price', '-price']:
products = products.order_by('min_price' if sort_by == 'price' else '-min_price')
elif search_query:
products = products.order_by('-similarity', 'name')
elif normalized_query:
# Tie-break on shorter name: ensures "چای" outranks "چای ساز"
# when their bonus-adjusted similarities are close.
products = products.order_by('-similarity', Length('norm_name'), 'name')
else:
products = products.order_by('name')
@@ -522,11 +689,9 @@ class ShowCaseProductsView(APIView):
if has_discount:
products = products.filter(variants__discount__gt=0).distinct()
# Search filter
# Search filter (Persian-aware, with typo tolerance + similar-results fallback)
search_query = request.query_params.get('search', None)
if search_query:
products = products.filter(Q(name__icontains=search_query) | Q(
description__icontains=search_query))
products, normalized_query = _apply_product_search(products, search_query)
# Price filters
price_gte = request.query_params.get('price_gte', None)
@@ -543,6 +708,8 @@ class ShowCaseProductsView(APIView):
sort_by = request.query_params.get('sort', None)
if sort_by in ['name', '-name', 'created_at', '-created_at']:
products = products.order_by(sort_by)
elif normalized_query:
products = products.order_by('-similarity', Length('norm_name'), 'name')
else:
products = products.order_by('name')