Update search

2026-05-30 09:06:25 +03:30
parent e6965fe3b8
commit 6f2037309c
2 changed files with 288 additions and 19 deletions
@@ -0,0 +1,102 @@
+"""
+Add an IMMUTABLE ``normalize_persian(text)`` SQL function and GIN trigram
+indexes that match it. Lets the search view filter and rank without doing a
+per-row ``translate()`` on a sequential scan — drops query time from seconds
+to tens of milliseconds.
+
+The FROM/TO strings here MUST stay aligned with ``_SQL_NORM_FROM`` /
+``_SQL_NORM_TO`` in ``product/views.py``. If you change one, change the other
+and add a follow-up migration that recreates the function + indexes (Postgres
+matches expression indexes by exact SQL form, so a stale function would
+silently bypass the indexes).
+"""
+from django.db import migrations
+
+
+# Mirror of product.views._SQL_NORM_FROM / _SQL_NORM_TO.
+_SQL_NORM_FROM = (
+    'يك'                  # Arabic ya/kaf -> Persian
+    'ﻱﻲﻳﻴ'                # Arabic ya presentation forms
+    'ﻙﻚﻛﻜ'                # Arabic kaf presentation forms
+    'آأإٱ'                # alef variants
+    'ؤ'                   # waw with hamza
+    'ئ'                   # ya with hamza
+    'ةۀ'                  # ta marbuta / he with hamza
+    'ﻩﻪﻫﻬ'                # he presentation forms
+    '‌‍'        # ZWNJ, ZWJ -> space
+    '۰۱۲۳۴۵۶۷۸۹'          # Persian digits
+    '٠١٢٣٤٥٦٧٨٩'          # Arabic-Indic digits
+    # Deletions (no matching char in TO):
+    'ـ'                   # tatweel
+    '‎‏'        # LRM, RLM
+    'ًٌٍَُِّْ'             # tashkeel
+)
+_SQL_NORM_TO = (
+    'یک'
+    'یییی'
+    'کککک'
+    'اااا'
+    'و'
+    'ی'
+    'هه'
+    'هههه'
+    '  '
+    '0123456789'
+    '0123456789'
+)
+
+
+def _pg_str(s):
+    """Quote a Python string as a PostgreSQL string literal.
+
+    Wraps ``s`` in single quotes and doubles every embedded single quote,
+    which is the SQL-standard way to escape a quote inside a literal. No
+    other character (backslash included) is altered.
+    """
+    return "'" + s.replace("'", "''") + "'"
+
+
+# DDL that (re)creates ``normalize_persian(text)``. The two mapping strings are
+# embedded as quoted literals when this module is imported, so the function
+# body is a plain ``lower(translate(t, '<from>', '<to>'))``.
+#   * IMMUTABLE is required for the function to be usable in index expressions.
+#   * STRICT makes the function return NULL when its argument is NULL.
+CREATE_FUNCTION_SQL = f"""
+CREATE OR REPLACE FUNCTION normalize_persian(t text) RETURNS text AS $$
+  SELECT lower(translate(t, {_pg_str(_SQL_NORM_FROM)}, {_pg_str(_SQL_NORM_TO)}));
+$$ LANGUAGE SQL IMMUTABLE STRICT PARALLEL SAFE;
+"""
+
+# Reverse of CREATE_FUNCTION_SQL; ``IF EXISTS`` keeps the rollback re-runnable.
+DROP_FUNCTION_SQL = "DROP FUNCTION IF EXISTS normalize_persian(text);"
+
+
+class Migration(migrations.Migration):
+    """Create ``normalize_persian(text)`` and three GIN trigram indexes on it.
+
+    Each step is a ``RunSQL`` with a matching ``reverse_sql`` so the migration
+    can be unapplied. The function is created first because the index
+    expressions below call it.
+    """
+
+    dependencies = [
+        ("product", "0075_productvariant_guarantee"),
+    ]
+
+    # NOTE(review): ``gin_trgm_ops`` is provided by the ``pg_trgm`` extension.
+    # This migration does not install it — presumably an earlier migration or
+    # the DBA already did; confirm before running on a fresh database.
+    operations = [
+        migrations.RunSQL(
+            sql=CREATE_FUNCTION_SQL,
+            reverse_sql=DROP_FUNCTION_SQL,
+        ),
+        # GIN trigram indexes on the normalized expression. PostgreSQL matches
+        # queries that use exactly ``normalize_persian(<col>)`` against these
+        # indexes, so the views.py wrapper must call the SQL function (not
+        # inline translate/lower) for the index to be used.
+        # The indexed expression takes the bare column: a query that wraps the
+        # column first (e.g. ``normalize_persian(COALESCE(col, ''))``) is a
+        # different expression and will not match.
+        migrations.RunSQL(
+            sql=(
+                "CREATE INDEX IF NOT EXISTS product_norm_name_trgm_idx "
+                "ON product_productmodel "
+                "USING gin (normalize_persian(name) gin_trgm_ops);"
+            ),
+            reverse_sql="DROP INDEX IF EXISTS product_norm_name_trgm_idx;",
+        ),
+        migrations.RunSQL(
+            sql=(
+                "CREATE INDEX IF NOT EXISTS product_norm_keywords_trgm_idx "
+                "ON product_productmodel "
+                "USING gin (normalize_persian(meta_keywords) gin_trgm_ops);"
+            ),
+            reverse_sql="DROP INDEX IF EXISTS product_norm_keywords_trgm_idx;",
+        ),
+        migrations.RunSQL(
+            sql=(
+                "CREATE INDEX IF NOT EXISTS subcategory_norm_name_trgm_idx "
+                "ON product_subcategorymodel "
+                "USING gin (normalize_persian(name) gin_trgm_ops);"
+            ),
+            reverse_sql="DROP INDEX IF EXISTS subcategory_norm_name_trgm_idx;",
+        ),
+    ]
@@ -1,3 +1,4 @@
+import re
 from .models import ProductModel
 from rest_framework import serializers
 from django.core.paginator import Paginator
@@ -6,8 +7,8 @@ from .models import *
 from .serializers import *
 from rest_framework import status
 from rest_framework.response import Response
-from django.db.models import Q, Value
-from django.db.models.functions import Coalesce
+from django.db.models import Q, Value, Case, When, FloatField, F, CharField, Func
+from django.db.models.functions import Coalesce, Length
 from django.contrib.postgres.search import TrigramSimilarity
 from django.shortcuts import get_object_or_404
 from rest_framework.permissions import IsAuthenticatedOrReadOnly
@@ -21,6 +22,179 @@ from home.models import ShowCaseSlider
 from home.serializers import ShowCaseSliderSerialzier
 from order.models import Cart, CartItem
 from django.db.models import Min, Max, Value
+
+
+_PERSIAN_CHAR_MAP = str.maketrans({
+    # Arabic letters -> Persian equivalents
+    'ي': 'ی', 'ك': 'ک',
+    # Arabic ya/kaf presentation forms -> Persian
+    'ﻱ': 'ی', 'ﻲ': 'ی', 'ﻳ': 'ی', 'ﻴ': 'ی',
+    'ﻙ': 'ک', 'ﻚ': 'ک', 'ﻛ': 'ک', 'ﻜ': 'ک',
+    # Alef variants -> bare alef (so "ایفون" matches "آیفون")
+    'آ': 'ا', 'أ': 'ا', 'إ': 'ا', 'ٱ': 'ا',
+    # Hamza on waw/ya -> bare letter
+    'ؤ': 'و',
+    'ئ': 'ی',
+    # Ta marbuta / he variants -> he
+    'ة': 'ه', 'ۀ': 'ه',
+    'ﻩ': 'ه', 'ﻪ': 'ه', 'ﻫ': 'ه', 'ﻬ': 'ه',
+    # Tatweel - drop
+    'ـ': '',
+    # Tashkeel (diacritics) - drop
+    'ً': '', 'ٌ': '', 'ٍ': '', 'َ': '', 'ُ': '', 'ِ': '', 'ّ': '', 'ْ': '',
+    # Zero-width / direction marks
+    '‌': ' ', '‍': ' ',
+    '‎': '', '‏': '',
+    # Arabic-Indic / Persian digits -> ASCII
+    '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
+    '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9',
+    '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
+    '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
+})
+
+
+def _normalize_search_text(text):
+    """Normalize a search string to handle Persian/Arabic variants, ZWNJ, and case.
+
+    Steps, in order: map characters through ``_PERSIAN_CHAR_MAP``, collapse
+    every run of whitespace to a single space, strip leading/trailing
+    whitespace, then lowercase.
+
+    Returns an empty string for ``None`` or any other falsy input.
+
+    Note: the SQL-side ``normalize_persian()`` only performs the character
+    translation and lowercasing; it does not collapse or trim whitespace, so
+    stored values may keep repeated or edge spaces that the query side drops.
+    """
+    if not text:
+        return ''
+    # Translate before collapsing so characters mapped to a space (ZWNJ/ZWJ)
+    # are folded into the surrounding whitespace run.
+    return re.sub(r'\s+', ' ', text.translate(_PERSIAN_CHAR_MAP)).strip().lower()
+
+
+# SQL-side equivalent of _PERSIAN_CHAR_MAP for PostgreSQL translate().
+# Each char at position i in FROM is replaced by char at position i in TO;
+# chars past len(TO) are deleted entirely. This must mirror the Python map so
+# stored values and query strings normalize to the same form.
+_SQL_NORM_FROM = (
+    'يك'                  # Arabic ya/kaf -> Persian
+    'ﻱﻲﻳﻴ'                # Arabic ya presentation forms
+    'ﻙﻚﻛﻜ'                # Arabic kaf presentation forms
+    'آأإٱ'                # alef variants
+    'ؤ'                   # waw with hamza
+    'ئ'                   # ya with hamza
+    'ةۀ'                  # ta marbuta / he with hamza
+    'ﻩﻪﻫﻬ'                # he presentation forms
+    '‌‍'        # ZWNJ, ZWJ -> space
+    '۰۱۲۳۴۵۶۷۸۹'          # Persian digits
+    '٠١٢٣٤٥٦٧٨٩'          # Arabic-Indic digits
+    # Deletions (no matching char in TO):
+    'ـ'                   # tatweel
+    '‎‏'        # LRM, RLM
+    'ًٌٍَُِّْ'             # tashkeel
+)
+_SQL_NORM_TO = (
+    'یک'
+    'یییی'
+    'کککک'
+    'اااا'
+    'و'
+    'ی'
+    'هه'
+    'هههه'
+    '  '
+    '0123456789'
+    '0123456789'
+)
+
+
+def NormalizePersian(expression):
+    """SQL expression that calls the ``normalize_persian(text)`` Postgres function.
+
+    The function (defined in migration 0076) computes ``lower(translate(t, FROM, TO))``
+    and is marked IMMUTABLE so GIN trigram indexes on ``normalize_persian(name)``
+    etc. can be matched by the planner. Calling the function (instead of inlining
+    translate/lower) is what lets queries use those indexes — otherwise every
+    search is a full sequential scan.
+
+    Args:
+        expression: a field name or ORM expression passed as the single
+            argument of the SQL function. For an index to be matched, the
+            argument must be the bare indexed column; wrapping it (for
+            example in ``Coalesce``) yields a different SQL expression.
+
+    Returns:
+        A ``Func`` expression with ``CharField`` output, usable in
+        ``annotate()``/``filter()``.
+    """
+    return Func(expression, function='normalize_persian', output_field=CharField())
+
+
+def _apply_product_search(queryset, search_query):
+    """Filter and rank a Product queryset by a (possibly Persian) search query.
+
+    Returns (queryset, normalized_query). The queryset is annotated with
+    ``similarity`` so callers can ``order_by('-similarity', ...)``. When no
+    product strictly matches, falls back to a looser similarity-based filter
+    so the user sees suggestions instead of an empty page.
+
+    Args:
+        queryset: Product queryset to search within.
+        search_query: raw user input; ``None`` or an empty string disables
+            the search.
+
+    Returns:
+        ``(queryset, normalized_query)``. When the normalized query is empty
+        the input queryset is returned untouched (no annotations) together
+        with ``''``. Otherwise the queryset carries the annotations
+        ``norm_name``, ``norm_keywords``, ``norm_category``, ``norm_desc``,
+        ``name_sim``, ``keywords_sim``, ``category_sim``, ``desc_sim``,
+        ``match_bonus`` and ``similarity``. ``norm_keywords`` and
+        ``norm_category`` are NULL when the underlying column is NULL.
+    """
+    normalized_query = _normalize_search_text(search_query) if search_query else ''
+    if not normalized_query:
+        return queryset, ''
+
+    tokens = [t for t in normalized_query.split(' ') if len(t) >= 2]
+
+    annotated = queryset.annotate(
+        # The indexes from migration 0076 are built on
+        # ``normalize_persian(<bare column>)``. PostgreSQL only matches an
+        # expression index when the query uses the identical expression, so
+        # the indexed columns must NOT be wrapped in Coalesce here. The SQL
+        # function is STRICT, so a NULL column simply yields NULL.
+        norm_name=NormalizePersian('name'),
+        norm_keywords=NormalizePersian('meta_keywords'),
+        norm_category=NormalizePersian('category__name'),
+        # description has no index, so NULL-guarding the input is harmless.
+        norm_desc=NormalizePersian(Coalesce('description', Value(''))),
+    ).annotate(
+        name_sim=TrigramSimilarity(F('norm_name'), normalized_query),
+        # similarity(NULL, q) is NULL; coalesce the *result* to 0.0 so the
+        # weighted sum below never becomes NULL and the thresholds behave
+        # exactly as they would for an empty string.
+        keywords_sim=Coalesce(
+            TrigramSimilarity(F('norm_keywords'), normalized_query),
+            Value(0.0),
+            output_field=FloatField(),
+        ),
+        category_sim=Coalesce(
+            TrigramSimilarity(F('norm_category'), normalized_query),
+            Value(0.0),
+            output_field=FloatField(),
+        ),
+        desc_sim=TrigramSimilarity(F('norm_desc'), normalized_query),
+    ).annotate(
+        # Word-boundary aware bonuses. The space-padded variants are what make
+        # "چای" rank above "چایساز" — the former matches "چای " (word boundary)
+        # while the latter only matches the glued prefix.
+        #
+        # Uses case-sensitive lookups (__contains, not __icontains) because both
+        # sides are already lowercased: __icontains would wrap the expression in
+        # UPPER(...) and break the GIN trigram index match.
+        match_bonus=Case(
+            When(norm_name__exact=normalized_query, then=Value(10.0)),
+            When(norm_name__startswith=normalized_query + ' ', then=Value(6.0)),
+            When(norm_name__startswith=normalized_query, then=Value(3.5)),
+            When(norm_name__contains=' ' + normalized_query + ' ', then=Value(3.0)),
+            When(norm_name__contains=' ' + normalized_query, then=Value(2.5)),
+            When(norm_name__contains=normalized_query + ' ', then=Value(2.5)),
+            When(norm_name__contains=normalized_query, then=Value(1.5)),
+            default=Value(0.0),
+            output_field=FloatField(),
+        )
+    ).annotate(
+        similarity=(
+            F('match_bonus')
+            + F('name_sim') * Value(2.0)
+            + F('keywords_sim') * Value(0.8)
+            + F('category_sim') * Value(0.4)
+            + F('desc_sim') * Value(0.15)
+        )
+    )
+
+    if tokens:
+        # Token AND filter. Limited to fields we have GIN trigram indexes for
+        # (name, keywords, category.name in migration 0076) — including
+        # description or slug here would force a sequential scan on the OR
+        # branch and undo the index speedup. Description still contributes via
+        # ``desc_sim`` to ranking on the already-narrowed result set.
+        #
+        # A NULL norm_keywords / norm_category makes its LIKE test NULL, which
+        # the surrounding OR treats as "no match" — same outcome as testing
+        # against an empty string.
+        token_filter = Q()
+        for token in tokens:
+            token_filter &= (
+                Q(norm_name__contains=token)
+                | Q(norm_keywords__contains=token)
+                | Q(norm_category__contains=token)
+            )
+        # NOTE(review): the ``*_sim__gte`` branches compare the result of
+        # similarity(), which a trigram index presumably cannot serve; OR-ing
+        # them with the token filter may still push the planner to a
+        # sequential scan. Confirm with EXPLAIN ANALYZE on production data.
+        strict_filter = (
+            token_filter
+            | Q(name_sim__gte=0.45)
+            | Q(keywords_sim__gte=0.5)
+        )
+    else:
+        strict_filter = Q(name_sim__gte=0.4) | Q(keywords_sim__gte=0.4)
+
+    strict_products = annotated.filter(strict_filter).distinct()
+    if strict_products.exists():
+        return strict_products, normalized_query
+
+    # No strict matches — relax thresholds so the user gets "similar"
+    # suggestions instead of an empty result page.
+    loose_filter = (
+        Q(name_sim__gte=0.18)
+        | Q(keywords_sim__gte=0.22)
+        | Q(category_sim__gte=0.3)
+        | Q(match_bonus__gt=0)
+    )
+    return annotated.filter(loose_filter).distinct(), normalized_query
 # class APIView(APIView):
 #     def __init__(self, *args, **kwargs):
 #         super().__init__(*args, **kwargs)
@@ -324,18 +498,9 @@ class AllProductsView(APIView):
                        status=status.HTTP_400_BAD_REQUEST
                    )

-            # Search
+            # Search (Persian-aware, with typo tolerance + similar-results fallback)
            search_query = request.query_params.get('search')
-            if search_query:
-                products = products.annotate(
-                    similarity=(
-                        TrigramSimilarity('name', search_query) +
-                        TrigramSimilarity(
-                            Coalesce('description', Value('')),
-                            search_query
-                        )
-                    )
-                ).filter(similarity__gt=0.1)
+            products, normalized_query = _apply_product_search(products, search_query)

            # Price annotation (IMPORTANT for sorting)
            products = products.annotate(
@@ -376,8 +541,10 @@ class AllProductsView(APIView):

            elif sort_by in ['price', '-price']:
                products = products.order_by('min_price' if sort_by == 'price' else '-min_price')
-            elif search_query:
-                products = products.order_by('-similarity', 'name')
+            elif normalized_query:
+                # Tie-break on shorter name: ensures "چای" outranks "چای ساز"
+                # when their bonus-adjusted similarities are close.
+                products = products.order_by('-similarity', Length('norm_name'), 'name')
            else:
                products = products.order_by('name')

@@ -522,11 +689,9 @@ class ShowCaseProductsView(APIView):
            if has_discount:
                products = products.filter(variants__discount__gt=0).distinct()

-            # Search filter
+            # Search filter (Persian-aware, with typo tolerance + similar-results fallback)
            search_query = request.query_params.get('search', None)
-            if search_query:
-                products = products.filter(Q(name__icontains=search_query) | Q(
-                    description__icontains=search_query))
+            products, normalized_query = _apply_product_search(products, search_query)

            # Price filters
            price_gte = request.query_params.get('price_gte', None)
@@ -543,6 +708,8 @@ class ShowCaseProductsView(APIView):
            sort_by = request.query_params.get('sort', None)
            if sort_by in ['name', '-name', 'created_at', '-created_at']:
                products = products.order_by(sort_by)
+            elif normalized_query:
+                products = products.order_by('-similarity', Length('norm_name'), 'name')
            else:
                products = products.order_by('name')