diff --git a/backend/product/migrations/0076_normalize_persian_search.py b/backend/product/migrations/0076_normalize_persian_search.py
new file mode 100644
index 0000000..248164b
--- /dev/null
+++ b/backend/product/migrations/0076_normalize_persian_search.py
@@ -0,0 +1,102 @@
+"""
+Add an IMMUTABLE ``normalize_persian(text)`` SQL function and GIN trigram
+indexes that match it. Lets the search view filter and rank without doing a
+per-row ``translate()`` on a sequential scan — drops query time from seconds
+to tens of milliseconds.
+
+The FROM/TO strings here MUST stay aligned with ``_SQL_NORM_FROM`` /
+``_SQL_NORM_TO`` in ``product/views.py``. If you change one, change the other
+and add a follow-up migration that recreates the function + indexes (the
+indexes store values computed when rows were written, so changing the
+function without rebuilding them leaves stale entries behind).
+"""
+from django.db import migrations
+
+
+# Mirror of product.views._SQL_NORM_FROM / _SQL_NORM_TO (invisibles escaped).
+_SQL_NORM_FROM = (
+    'يك'  # Arabic ya/kaf -> Persian
+    'ﻱﻲﻳﻴ'  # Arabic ya presentation forms
+    'ﻙﻚﻛﻜ'  # Arabic kaf presentation forms
+    'آأإٱ'  # alef variants
+    'ؤ'  # waw with hamza
+    'ئ'  # ya with hamza
+    'ةۀ'  # ta marbuta / he with hamza
+    'ﻩﻪﻫﻬ'  # he presentation forms
+    '\u200c\u200d'  # ZWNJ, ZWJ -> space
+    '۰۱۲۳۴۵۶۷۸۹'  # Persian digits
+    '٠١٢٣٤٥٦٧٨٩'  # Arabic-Indic digits
+    # Deletions (no matching char in TO):
+    'ـ'  # tatweel
+    '\u200e\u200f'  # LRM, RLM
+    '\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'  # tashkeel
+)
+_SQL_NORM_TO = (
+    'یک'
+    'یییی'
+    'کککک'
+    'اااا'
+    'و'
+    'ی'
+    'هه'
+    'هههه'
+    '\x20\x20'  # two spaces (ZWNJ, ZWJ); escaped so reformatting cannot shrink it
+    '0123456789'
+    '0123456789'
+)
+
+
+def _pg_str(s):
+    """Quote a Python string as a PostgreSQL string literal."""
+    return "'" + s.replace("'", "''") + "'"
+
+
+CREATE_FUNCTION_SQL = f"""
+CREATE OR REPLACE FUNCTION normalize_persian(t text) RETURNS text AS $$
+    SELECT lower(translate(t, {_pg_str(_SQL_NORM_FROM)}, {_pg_str(_SQL_NORM_TO)}));
+$$ LANGUAGE SQL IMMUTABLE STRICT PARALLEL SAFE;
+"""
+
+DROP_FUNCTION_SQL = "DROP FUNCTION IF EXISTS normalize_persian(text);"
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("product", "0075_productvariant_guarantee"),
+    ]
+
+    operations = [
+        migrations.RunSQL(
+            sql=CREATE_FUNCTION_SQL,
+            reverse_sql=DROP_FUNCTION_SQL,
+        ),
+        # GIN trigram indexes on the normalized expression. The planner only
+        # considers them for predicates written as ``normalize_persian(<col>)``
+        # on the bare column, so views.py must call the SQL function directly
+        # (no COALESCE or inline translate/lower around the column).
+        migrations.RunSQL(
+            sql=(
+                "CREATE INDEX IF NOT EXISTS product_norm_name_trgm_idx "
+                "ON product_productmodel "
+                "USING gin (normalize_persian(name) gin_trgm_ops);"
+            ),
+            reverse_sql="DROP INDEX IF EXISTS product_norm_name_trgm_idx;",
+        ),
+        migrations.RunSQL(
+            sql=(
+                "CREATE INDEX IF NOT EXISTS product_norm_keywords_trgm_idx "
+                "ON product_productmodel "
+                "USING gin (normalize_persian(meta_keywords) gin_trgm_ops);"
+            ),
+            reverse_sql="DROP INDEX IF EXISTS product_norm_keywords_trgm_idx;",
+        ),
+        migrations.RunSQL(
+            sql=(
+                "CREATE INDEX IF NOT EXISTS subcategory_norm_name_trgm_idx "
+                "ON product_subcategorymodel "
+                "USING gin (normalize_persian(name) gin_trgm_ops);"
+            ),
+            reverse_sql="DROP INDEX IF EXISTS subcategory_norm_name_trgm_idx;",
+        ),
+    ]
diff --git a/backend/product/views.py b/backend/product/views.py
index 3bfe627..3802c72 100644
--- a/backend/product/views.py
+++ b/backend/product/views.py
@@ -1,3 +1,4 @@
+import re
 from .models import ProductModel
 from rest_framework import serializers
 from django.core.paginator import Paginator
@@ -6,8 +7,8 @@ from .models import *
 from .serializers import *
 from rest_framework import status
 from rest_framework.response import Response
-from django.db.models import Q, Value
-from django.db.models.functions import Coalesce
+from django.db.models import Q, Value, Case, When, FloatField, F, CharField, Func
+from django.db.models.functions import Coalesce, Length
 from django.contrib.postgres.search import TrigramSimilarity
 from django.shortcuts import get_object_or_404
 from rest_framework.permissions import IsAuthenticatedOrReadOnly
@@ -21,6 +22,179 @@ from home.models import ShowCaseSlider
 from home.serializers import ShowCaseSliderSerialzier
 from order.models import Cart, CartItem
 from django.db.models import Min, Max, Value
+
+
+_PERSIAN_CHAR_MAP = str.maketrans({
+    # Arabic letters -> Persian equivalents
+    'ي': 'ی', 'ك': 'ک',
+    # Arabic ya/kaf presentation forms -> Persian
+    'ﻱ': 'ی', 'ﻲ': 'ی', 'ﻳ': 'ی', 'ﻴ': 'ی',
+    'ﻙ': 'ک', 'ﻚ': 'ک', 'ﻛ': 'ک', 'ﻜ': 'ک',
+    # Alef variants -> bare alef (so "ایفون" matches "آیفون")
+    'آ': 'ا', 'أ': 'ا', 'إ': 'ا', 'ٱ': 'ا',
+    # Hamza on waw/ya -> bare letter
+    'ؤ': 'و',
+    'ئ': 'ی',
+    # Ta marbuta / he variants -> he
+    'ة': 'ه', 'ۀ': 'ه',
+    'ﻩ': 'ه', 'ﻪ': 'ه', 'ﻫ': 'ه', 'ﻬ': 'ه',
+    # Tatweel - drop
+    'ـ': '',
+    # Tashkeel (diacritics, U+064B..U+0652) - drop
+    **dict.fromkeys('\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652', ''),
+    # Zero-width / direction marks (escaped: they are invisible in editors)
+    '\u200c': ' ', '\u200d': ' ',  # ZWNJ, ZWJ -> space
+    '\u200e': '', '\u200f': '',  # LRM, RLM - drop
+    # Arabic-Indic / Persian digits -> ASCII
+    '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
+    '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9',
+    '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
+    '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
+})
+
+
+def _normalize_search_text(text):
+    """Normalize a search string to handle Persian/Arabic variants, ZWNJ, and case."""
+    if not text:
+        return ''
+    return re.sub(r'\s+', ' ', text.translate(_PERSIAN_CHAR_MAP)).strip().lower()
+
+
+# SQL-side equivalent of _PERSIAN_CHAR_MAP for PostgreSQL translate().
+# Each char at position i in FROM is replaced by char at position i in TO;
+# chars past len(TO) are deleted entirely. Must mirror the Python map; the
+# invisible/combining characters are escaped so they survive editing.
+_SQL_NORM_FROM = (
+    'يك'  # Arabic ya/kaf -> Persian
+    'ﻱﻲﻳﻴ'  # Arabic ya presentation forms
+    'ﻙﻚﻛﻜ'  # Arabic kaf presentation forms
+    'آأإٱ'  # alef variants
+    'ؤ'  # waw with hamza
+    'ئ'  # ya with hamza
+    'ةۀ'  # ta marbuta / he with hamza
+    'ﻩﻪﻫﻬ'  # he presentation forms
+    '\u200c\u200d'  # ZWNJ, ZWJ -> space
+    '۰۱۲۳۴۵۶۷۸۹'  # Persian digits
+    '٠١٢٣٤٥٦٧٨٩'  # Arabic-Indic digits
+    # Deletions (no matching char in TO):
+    'ـ'  # tatweel
+    '\u200e\u200f'  # LRM, RLM
+    '\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'  # tashkeel
+)
+_SQL_NORM_TO = (
+    'یک'
+    'یییی'
+    'کککک'
+    'اااا'
+    'و'
+    'ی'
+    'هه'
+    'هههه'
+    '\x20\x20'  # two spaces (ZWNJ, ZWJ); escaped so reformatting cannot shrink it
+    '0123456789'
+    '0123456789'
+)
+
+
+def NormalizePersian(expression):
+    """SQL expression that calls the ``normalize_persian(text)`` Postgres function.
+
+    The function (defined in migration 0076) computes ``lower(translate(t, FROM, TO))``
+    and is marked IMMUTABLE so GIN trigram indexes on ``normalize_persian(name)``
+    etc. can be matched by the planner. Pass the bare column (no ``Coalesce``
+    wrapper) when the index should be usable: the function is STRICT, so a
+    NULL column yields NULL and the caller must handle that on the outside.
+    """
+    return Func(expression, function='normalize_persian', output_field=CharField())
+
+
+def _apply_product_search(queryset, search_query):
+    """Filter and rank a Product queryset by a (possibly Persian) search query.
+
+    Returns (queryset, normalized_query); an empty query returns the queryset
+    untouched. The result is annotated with ``similarity`` for ordering; NULL
+    keywords/category score 0 so a missing value cannot turn the sum NULL. When
+    nothing matches strictly, a looser similarity filter supplies suggestions.
+    """
+    normalized_query = _normalize_search_text(search_query) if search_query else ''
+    if not normalized_query:
+        return queryset, ''
+
+    tokens = [t for t in normalized_query.split(' ') if len(t) >= 2]
+
+    annotated = queryset.annotate(
+        norm_name=NormalizePersian('name'),
+        norm_keywords=NormalizePersian('meta_keywords'),
+        norm_category=NormalizePersian('category__name'),
+        norm_desc=NormalizePersian(Coalesce('description', Value(''))),
+    ).annotate(
+        name_sim=TrigramSimilarity(F('norm_name'), normalized_query),
+        keywords_sim=Coalesce(TrigramSimilarity(F('norm_keywords'), normalized_query), Value(0.0)),
+        category_sim=Coalesce(TrigramSimilarity(F('norm_category'), normalized_query), Value(0.0)),
+        desc_sim=TrigramSimilarity(F('norm_desc'), normalized_query),
+    ).annotate(
+        # Word-boundary aware bonuses. The space-padded variants are what make
+        # "چای" rank above "چایساز" — the former matches "چای " (word boundary)
+        # while the latter only matches the glued prefix.
+        #
+        # Uses case-sensitive lookups (__contains, not __icontains) because both
+        # sides are already lowercased: __icontains would wrap the expression in
+        # UPPER(...) and break the GIN trigram index match.
+        match_bonus=Case(
+            When(norm_name__exact=normalized_query, then=Value(10.0)),
+            When(norm_name__startswith=normalized_query + ' ', then=Value(6.0)),
+            When(norm_name__startswith=normalized_query, then=Value(3.5)),
+            When(norm_name__contains=' ' + normalized_query + ' ', then=Value(3.0)),
+            When(norm_name__contains=' ' + normalized_query, then=Value(2.5)),
+            When(norm_name__contains=normalized_query + ' ', then=Value(2.5)),
+            When(norm_name__contains=normalized_query, then=Value(1.5)),
+            default=Value(0.0),
+            output_field=FloatField(),
+        )
+    ).annotate(
+        similarity=(
+            F('match_bonus')
+            + F('name_sim') * Value(2.0)
+            + F('keywords_sim') * Value(0.8)
+            + F('category_sim') * Value(0.4)
+            + F('desc_sim') * Value(0.15)
+        )
+    )
+
+    if tokens:
+        # Token AND filter. Limited to the fields migration 0076 indexes
+        # (name, keywords, category name); those are annotated on the bare
+        # column so the expression matches the index definition. NULL
+        # keywords/category normalize to NULL and simply never match.
+        # Description only contributes to ranking via ``desc_sim``.
+        token_filter = Q()
+        for token in tokens:
+            token_filter &= (
+                Q(norm_name__contains=token)
+                | Q(norm_keywords__contains=token)
+                | Q(norm_category__contains=token)
+            )
+        strict_filter = (
+            token_filter
+            | Q(name_sim__gte=0.45)
+            | Q(keywords_sim__gte=0.5)
+        )
+    else:
+        strict_filter = Q(name_sim__gte=0.4) | Q(keywords_sim__gte=0.4)
+
+    strict_products = annotated.filter(strict_filter).distinct()
+    if strict_products.exists():
+        return strict_products, normalized_query
+
+    # No strict matches — relax thresholds so the user gets "similar"
+    # suggestions instead of an empty result page.
+    loose_filter = (
+        Q(name_sim__gte=0.18)
+        | Q(keywords_sim__gte=0.22)
+        | Q(category_sim__gte=0.3)
+        | Q(match_bonus__gt=0)
+    )
+    return annotated.filter(loose_filter).distinct(), normalized_query
 # class APIView(APIView):
 #     def __init__(self, *args, **kwargs):
 #         super().__init__(*args, **kwargs)
@@ -324,18 +498,9 @@ class AllProductsView(APIView):
                 status=status.HTTP_400_BAD_REQUEST
             )
 
-        # Search
+        # Search (Persian-aware, with typo tolerance + similar-results fallback)
         search_query = request.query_params.get('search')
-        if search_query:
-            products = products.annotate(
-                similarity=(
-                    TrigramSimilarity('name', search_query) +
-                    TrigramSimilarity(
-                        Coalesce('description', Value('')),
-                        search_query
-                    )
-                )
-            ).filter(similarity__gt=0.1)
+        products, normalized_query = _apply_product_search(products, search_query)
 
         # Price annotation (IMPORTANT for sorting)
         products = products.annotate(
@@ -376,8 +541,10 @@ class AllProductsView(APIView):
         elif sort_by in ['price', '-price']:
             products = products.order_by('min_price' if sort_by == 'price' else '-min_price')
 
-        elif search_query:
-            products = products.order_by('-similarity', 'name')
+        elif normalized_query:
+            # Tie-break on shorter name: ensures "چای" outranks "چای ساز"
+            # when their bonus-adjusted similarity scores are equal.
+            products = products.order_by('-similarity', Length('norm_name'), 'name')
         else:
             products = products.order_by('name')
 
@@ -522,11 +689,9 @@ class ShowCaseProductsView(APIView):
         if has_discount:
             products = products.filter(variants__discount__gt=0).distinct()
 
-        # Search filter
+        # Search filter (Persian-aware, with typo tolerance + similar-results fallback)
         search_query = request.query_params.get('search', None)
-        if search_query:
-            products = products.filter(Q(name__icontains=search_query) | Q(
-                description__icontains=search_query))
+        products, normalized_query = _apply_product_search(products, search_query)
 
         # Price filters
         price_gte = request.query_params.get('price_gte', None)
@@ -543,6 +708,8 @@ class ShowCaseProductsView(APIView):
         sort_by = request.query_params.get('sort', None)
         if sort_by in ['name', '-name', 'created_at', '-created_at']:
             products = products.order_by(sort_by)
+        elif normalized_query:
+            products = products.order_by('-similarity', Length('norm_name'), 'name')
         else:
             products = products.order_by('name')
 