update search

2026-05-30 09:06:25 +03:30
parent e6965fe3b8
commit 6f2037309c
2 changed files with 288 additions and 19 deletions
@@ -0,0 +1,102 @@
 """
 Add an IMMUTABLE ``normalize_persian(text)`` SQL function and GIN trigram
 indexes that match it. Lets the search view filter and rank without doing a
 per-row ``translate()`` on a sequential scan — drops query time from seconds
 to tens of milliseconds.
 The FROM/TO strings here MUST stay aligned with ``_SQL_NORM_FROM`` /
 ``_SQL_NORM_TO`` in ``product/views.py``. If you change one, change the other
 and add a follow-up migration that recreates the function + indexes (Postgres
 matches expression indexes by exact SQL form, so a stale function would
 silently bypass the indexes).
 """
 from django.db import migrations
 # Mirror of product.views._SQL_NORM_FROM / _SQL_NORM_TO.
 _SQL_NORM_FROM = (
    'يك'                  # Arabic ya/kaf -> Persian
    'ﻱﻲﻳﻴ'                # Arabic ya presentation forms
    'ﻙﻚﻛﻜ'                # Arabic kaf presentation forms
    'آأإٱ'                # alef variants
    'ؤ'                   # waw with hamza
    'ئ'                   # ya with hamza
    'ةۀ'                  # ta marbuta / he with hamza
    'ﻩﻪﻫﻬ'                # he presentation forms
    '‌‍'        # ZWNJ, ZWJ -> space
    '۰۱۲۳۴۵۶۷۸۹'          # Persian digits
    '٠١٢٣٤٥٦٧٨٩'          # Arabic-Indic digits
    # Deletions (no matching char in TO):
    'ـ'                   # tatweel
    '‎‏'        # LRM, RLM
    'ًٌٍَُِّْ'             # tashkeel
 )
 _SQL_NORM_TO = (
    'یک'
    'یییی'
    'کککک'
    'اااا'
    'و'
    'ی'
    'هه'
    'هههه'
    '  '
    '0123456789'
    '0123456789'
 )
 def _pg_str(s):
    """Quote a Python string as a PostgreSQL string literal."""
    return "'" + s.replace("'", "''") + "'"
 # SQL that (re)creates ``normalize_persian(text)``: it applies translate()
 # with the FROM/TO tables defined in this module and lower-cases the result.
 # The function is declared IMMUTABLE (PostgreSQL only accepts immutable
 # functions inside index expressions) and STRICT (a NULL argument yields NULL
 # without evaluating the body).
 CREATE_FUNCTION_SQL = f"""
 CREATE OR REPLACE FUNCTION normalize_persian(t text) RETURNS text AS $$
  SELECT lower(translate(t, {_pg_str(_SQL_NORM_FROM)}, {_pg_str(_SQL_NORM_TO)}));
 $$ LANGUAGE SQL IMMUTABLE STRICT PARALLEL SAFE;
 """
 # Reverse statement: removes the function if it exists.
 DROP_FUNCTION_SQL = "DROP FUNCTION IF EXISTS normalize_persian(text);"
 class Migration(migrations.Migration):
    """Create ``normalize_persian`` and the GIN trigram indexes built on it."""

    dependencies = [
        ("product", "0075_productvariant_guarantee"),
    ]
    operations = [
        # The function has to exist before an index expression can call it.
        migrations.RunSQL(sql=CREATE_FUNCTION_SQL, reverse_sql=DROP_FUNCTION_SQL),
        # GIN trigram indexes on the normalized expression. PostgreSQL only
        # matches queries that use exactly ``normalize_persian(<col>)``
        # against these indexes, so the views.py wrapper must call the SQL
        # function (not inline translate/lower) for an index to be usable.
        # One RunSQL per (index, table, column) row, each reversible on its own.
        *(
            migrations.RunSQL(
                sql=(
                    f"CREATE INDEX IF NOT EXISTS {index_name} "
                    f"ON {table_name} "
                    f"USING gin (normalize_persian({column_name}) gin_trgm_ops);"
                ),
                reverse_sql=f"DROP INDEX IF EXISTS {index_name};",
            )
            for index_name, table_name, column_name in (
                ("product_norm_name_trgm_idx", "product_productmodel", "name"),
                ("product_norm_keywords_trgm_idx", "product_productmodel", "meta_keywords"),
                ("subcategory_norm_name_trgm_idx", "product_subcategorymodel", "name"),
            )
        ),
    ]
@@ -1,3 +1,4 @@
 import re
 from .models import ProductModel
 from rest_framework import serializers
 from django.core.paginator import Paginator
@@ -6,8 +7,8 @@ from .models import *
 from .serializers import *
 from rest_framework import status
 from rest_framework.response import Response
-from django.db.models import Q, Value
+from django.db.models import Q, Value, Case, When, FloatField, F, CharField, Func
-from django.db.models.functions import Coalesce
+from django.db.models.functions import Coalesce, Length
 from django.contrib.postgres.search import TrigramSimilarity
 from django.shortcuts import get_object_or_404
 from rest_framework.permissions import IsAuthenticatedOrReadOnly
@@ -21,6 +22,179 @@ from home.models import ShowCaseSlider
 from home.serializers import ShowCaseSliderSerialzier
 from order.models import Cart, CartItem
 from django.db.models import Min, Max, Value
 _PERSIAN_CHAR_MAP = str.maketrans({
    # Arabic letters -> Persian equivalents
    'ي': 'ی', 'ك': 'ک',
    # Arabic ya/kaf presentation forms -> Persian
    'ﻱ': 'ی', 'ﻲ': 'ی', 'ﻳ': 'ی', 'ﻴ': 'ی',
    'ﻙ': 'ک', 'ﻚ': 'ک', 'ﻛ': 'ک', 'ﻜ': 'ک',
    # Alef variants -> bare alef (so "ایفون" matches "آیفون")
    'آ': 'ا', 'أ': 'ا', 'إ': 'ا', 'ٱ': 'ا',
    # Hamza on waw/ya -> bare letter
    'ؤ': 'و',
    'ئ': 'ی',
    # Ta marbuta / he variants -> he
    'ة': 'ه', 'ۀ': 'ه',
    'ﻩ': 'ه', 'ﻪ': 'ه', 'ﻫ': 'ه', 'ﻬ': 'ه',
    # Tatweel - drop
    'ـ': '',
    # Tashkeel (diacritics) - drop
    'ً': '', 'ٌ': '', 'ٍ': '', 'َ': '', 'ُ': '', 'ِ': '', 'ّ': '', 'ْ': '',
    # Zero-width / direction marks
    '‌': ' ', '‍': ' ',
    '‎': '', '‏': '',
    # Arabic-Indic / Persian digits -> ASCII
    '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
    '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9',
    '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
    '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
 })
 def _normalize_search_text(text):
    """Return ``text`` in canonical search form.

    Persian/Arabic variants are folded through ``_PERSIAN_CHAR_MAP``, every
    run of whitespace becomes a single space, surrounding whitespace is
    stripped and the result is lower-cased. Falsy input yields ``''``.
    """
    if text:
        folded = text.translate(_PERSIAN_CHAR_MAP)
        single_spaced = re.sub(r'\s+', ' ', folded)
        return single_spaced.strip().lower()
    return ''
 # SQL-side equivalent of _PERSIAN_CHAR_MAP for PostgreSQL translate().
 # Each char at position i in FROM is replaced by char at position i in TO;
 # chars past len(TO) are deleted entirely. This must mirror the Python map so
 # stored values and query strings normalize to the same form.
 _SQL_NORM_FROM = (
    'يك'                  # Arabic ya/kaf -> Persian
    'ﻱﻲﻳﻴ'                # Arabic ya presentation forms
    'ﻙﻚﻛﻜ'                # Arabic kaf presentation forms
    'آأإٱ'                # alef variants
    'ؤ'                   # waw with hamza
    'ئ'                   # ya with hamza
    'ةۀ'                  # ta marbuta / he with hamza
    'ﻩﻪﻫﻬ'                # he presentation forms
    '‌‍'        # ZWNJ, ZWJ -> space
    '۰۱۲۳۴۵۶۷۸۹'          # Persian digits
    '٠١٢٣٤٥٦٧٨٩'          # Arabic-Indic digits
    # Deletions (no matching char in TO):
    'ـ'                   # tatweel
    '‎‏'        # LRM, RLM
    'ًٌٍَُِّْ'             # tashkeel
 )
 _SQL_NORM_TO = (
    'یک'
    'یییی'
    'کککک'
    'اااا'
    'و'
    'ی'
    'هه'
    'هههه'
    '  '
    '0123456789'
    '0123456789'
 )
 def NormalizePersian(expression):
    """Wrap ``expression`` in a call to the ``normalize_persian(text)`` SQL function.

    Migration 0076 defines that function as ``lower(translate(t, FROM, TO))``
    and marks it IMMUTABLE, so GIN trigram indexes can be declared on
    expressions such as ``normalize_persian(name)``. Emitting the function
    call itself (rather than inlining translate/lower) keeps the query
    expression in the same form as the indexed one, which the planner needs
    in order to consider those indexes.

    Returns a ``Func`` expression whose output field is a ``CharField``.
    """
    normalized = Func(
        expression,
        function='normalize_persian',
        output_field=CharField(),
    )
    return normalized
 def _apply_product_search(queryset, search_query):
    """Filter and rank a Product queryset by a (possibly Persian) search query.

    Args:
        queryset: Product queryset to search in.
        search_query: Raw user input; may be ``None`` or empty.

    Returns:
        ``(queryset, normalized_query)``. When the normalized query is empty
        the queryset is returned untouched together with ``''``. Otherwise
        the queryset is annotated with ``norm_name`` and ``similarity`` (among
        other helper annotations) so callers can
        ``order_by('-similarity', ...)``. When no product strictly matches,
        a looser similarity-based filter is used instead so the user sees
        suggestions rather than an empty page.
    """
    # _normalize_search_text already maps falsy input to ''.
    normalized_query = _normalize_search_text(search_query)
    if not normalized_query:
        return queryset, ''
    tokens = [t for t in normalized_query.split(' ') if len(t) >= 2]

    def _similarity_or_zero(annotation):
        # normalize_persian() is STRICT, so a NULL column normalizes to NULL
        # and similarity() of NULL is NULL; score such rows as 0.0 so the
        # weighted sum below never becomes NULL.
        return Coalesce(
            TrigramSimilarity(F(annotation), normalized_query),
            Value(0.0),
            output_field=FloatField(),
        )

    annotated = queryset.annotate(
        norm_name=NormalizePersian('name'),
        # The nullable columns are passed to normalize_persian() bare, NOT
        # wrapped in COALESCE: the expression indexes are declared on
        # ``normalize_persian(<col>)`` and PostgreSQL only matches an index
        # when the query expression has that same form.
        # ``normalize_persian(COALESCE(col, ''))`` would never match.
        norm_keywords=NormalizePersian('meta_keywords'),
        norm_category=NormalizePersian('category__name'),
        # No index exists for description, so NULL is folded to '' up front.
        norm_desc=NormalizePersian(Coalesce('description', Value(''))),
    ).annotate(
        name_sim=TrigramSimilarity(F('norm_name'), normalized_query),
        keywords_sim=_similarity_or_zero('norm_keywords'),
        category_sim=_similarity_or_zero('norm_category'),
        desc_sim=TrigramSimilarity(F('norm_desc'), normalized_query),
    ).annotate(
        # Word-boundary aware bonuses. The space-padded variants are what make
        # "چای" rank above "چایساز" — the former matches "چای " (word boundary)
        # while the latter only matches the glued prefix.
        #
        # Uses case-sensitive lookups (__contains, not __icontains) because both
        # sides are already lowercased: __icontains would wrap the expression in
        # UPPER(...) and break the GIN trigram index match.
        match_bonus=Case(
            When(norm_name__exact=normalized_query, then=Value(10.0)),
            When(norm_name__startswith=normalized_query + ' ', then=Value(6.0)),
            When(norm_name__startswith=normalized_query, then=Value(3.5)),
            When(norm_name__contains=' ' + normalized_query + ' ', then=Value(3.0)),
            When(norm_name__contains=' ' + normalized_query, then=Value(2.5)),
            When(norm_name__contains=normalized_query + ' ', then=Value(2.5)),
            When(norm_name__contains=normalized_query, then=Value(1.5)),
            default=Value(0.0),
            output_field=FloatField(),
        )
    ).annotate(
        # Weighted total: name dominates, description barely contributes.
        similarity=(
            F('match_bonus')
            + F('name_sim') * Value(2.0)
            + F('keywords_sim') * Value(0.8)
            + F('category_sim') * Value(0.4)
            + F('desc_sim') * Value(0.15)
        )
    )
    if tokens:
        # Token AND filter: every token must appear in at least one of the
        # normalized name / keywords / category name. Description and slug
        # are deliberately left out (no trigram index on them); description
        # still contributes to ranking through ``desc_sim``. A NULL
        # norm_keywords / norm_category makes its LIKE evaluate to NULL,
        # which behaves as "no match" inside the OR — same result as the
        # empty string did before.
        # NOTE(review): the ``*_sim__gte`` arms compare similarity() with a
        # constant, which pg_trgm indexes do not accelerate — confirm with
        # EXPLAIN that the planner actually uses the indexes for this filter.
        token_filter = Q()
        for token in tokens:
            token_filter &= (
                Q(norm_name__contains=token)
                | Q(norm_keywords__contains=token)
                | Q(norm_category__contains=token)
            )
        strict_filter = (
            token_filter
            | Q(name_sim__gte=0.45)
            | Q(keywords_sim__gte=0.5)
        )
    else:
        # Only one-character tokens: rely on fuzzy similarity alone.
        strict_filter = Q(name_sim__gte=0.4) | Q(keywords_sim__gte=0.4)
    strict_products = annotated.filter(strict_filter).distinct()
    if strict_products.exists():
        return strict_products, normalized_query
    # No strict matches — relax thresholds so the user gets "similar"
    # suggestions instead of an empty result page.
    loose_filter = (
        Q(name_sim__gte=0.18)
        | Q(keywords_sim__gte=0.22)
        | Q(category_sim__gte=0.3)
        | Q(match_bonus__gt=0)
    )
    return annotated.filter(loose_filter).distinct(), normalized_query
 # class APIView(APIView):
 #     def __init__(self, *args, **kwargs):
 #         super().__init__(*args, **kwargs)
@@ -324,18 +498,9 @@ class AllProductsView(APIView):
                        status=status.HTTP_400_BAD_REQUEST
                    )
-            # Search
+            # Search (Persian-aware, with typo tolerance + similar-results fallback)
            search_query = request.query_params.get('search')
-            if search_query:
+            products, normalized_query = _apply_product_search(products, search_query)
                products = products.annotate(
                    similarity=(
                        TrigramSimilarity('name', search_query) +
                        TrigramSimilarity(
                            Coalesce('description', Value('')),
                            search_query
                        )
                    )
                ).filter(similarity__gt=0.1)
            # Price annotation (IMPORTANT for sorting)
            products = products.annotate(
@@ -376,8 +541,10 @@ class AllProductsView(APIView):
            elif sort_by in ['price', '-price']:
                products = products.order_by('min_price' if sort_by == 'price' else '-min_price')
-            elif search_query:
+            elif normalized_query:
-                products = products.order_by('-similarity', 'name')
+                # Tie-break on shorter name: ensures "چای" outranks "چای ساز"
                # when their bonus-adjusted similarities are close.
                products = products.order_by('-similarity', Length('norm_name'), 'name')
            else:
                products = products.order_by('name')
@@ -522,11 +689,9 @@ class ShowCaseProductsView(APIView):
            if has_discount:
                products = products.filter(variants__discount__gt=0).distinct()
-            # Search filter
+            # Search filter (Persian-aware, with typo tolerance + similar-results fallback)
            search_query = request.query_params.get('search', None)
-            if search_query:
+            products, normalized_query = _apply_product_search(products, search_query)
                products = products.filter(Q(name__icontains=search_query) | Q(
                    description__icontains=search_query))
            # Price filters
            price_gte = request.query_params.get('price_gte', None)
@@ -543,6 +708,8 @@ class ShowCaseProductsView(APIView):
            sort_by = request.query_params.get('sort', None)
            if sort_by in ['name', '-name', 'created_at', '-created_at']:
                products = products.order_by(sort_by)
            elif normalized_query:
                products = products.order_by('-similarity', Length('norm_name'), 'name')
            else:
                products = products.order_by('name')