dict-server/app/api/search_dict/service.py

209 lines
6.8 KiB
Python

import asyncio
import re
from typing import List, Tuple, Dict, Literal
from fastapi import HTTPException
from tortoise import Tortoise
from tortoise.expressions import Q
from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest
from app.models import WordlistFr, WordlistJp
from app.models.fr import ProverbFr
from app.utils.all_kana import all_in_kana
from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM
def contains_chinese(text: str) -> bool:
"""判断字符串中是否包含至少一个中文字符"""
return bool(re.search(r'[\u4e00-\u9fff]', text))
async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
proverb = await ProverbFr.get_or_none(id=proverb_id)
if not proverb:
raise HTTPException(status_code=404, detail="Proverb not found")
return ProverbSearchResponse(
proverb_text=proverb.proverb,
chi_exp=proverb.chi_exp,
)
async def suggest_proverb(query: ProverbSearchRequest, lang: Literal['fr', 'zh']) -> List[Dict[str, str]]:
"""
对法语谚语表进行搜索建议。
参数:
query.query: 搜索关键词
lang: 'fr''zh'
逻辑:
1. 若 lang='fr',按谚语字段 (proverb) 搜索;
2. 若 lang='zh',按中文释义字段 (chi_exp) 搜索;
3. 优先以输入开头的匹配;
4. 其次为包含输入但不以其开头的匹配(按 freq 排序)。
:return: [{'id': 1, 'proverb': 'xxx'}, ...]
"""
keyword = query.query.strip()
results: List[Dict[str, str]] = []
if not keyword:
return results
# ✅ 根据语言决定搜索字段
if lang == "zh":
startswith_field = "chi_exp__istartswith"
contains_field = "chi_exp__icontains"
else: # 默认法语
startswith_field = "proverb__istartswith"
contains_field = "proverb__icontains"
# ✅ 1. 开头匹配
start_matches = await (
ProverbFr.filter(**{startswith_field: keyword})
.order_by("-freq")
.limit(10)
.values("id", "proverb", "chi_exp")
)
# ✅ 2. 包含匹配(但不是开头)
contain_matches = await (
ProverbFr.filter(
Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword})
)
.order_by("-freq")
.limit(10)
.values("id", "proverb", "chi_exp")
)
# ✅ 合并结果(去重并保持顺序)
seen_ids = set()
for row in start_matches + contain_matches:
if row["id"] not in seen_ids:
seen_ids.add(row["id"])
results.append({
"id": row["id"],
"proverb": row["proverb"],
"chi_exp": row["chi_exp"]
})
return results
async def suggest_autocomplete(query: SearchRequest, limit: int = 10):
"""
:param query: 当前用户输入的内容
:param limit: 返回列表限制长度
:return: 联想的单词列表(非完整信息,单纯单词)
"""
if query.language == 'fr':
query_word = normalize_text(query.query)
exact = await (
WordlistFr
.get_or_none(search_text=query.query)
.values("text", "freq")
)
if exact:
exact_word = [(exact.get("text"), exact.get("freq"))]
else:
exact_word = []
qs_prefix = (
WordlistFr
.filter(Q(search_text__startswith=query_word) | Q(text__startswith=query.query))
.exclude(search_text=query.query)
.only("text", "freq")
)
prefix_objs = await qs_prefix[:limit]
prefix: List[Tuple[str, int]] = [(o.text, o.freq) for o in prefix_objs]
need = max(0, limit - len(prefix))
contains: List[Tuple[str, int]] = []
if need > 0:
qs_contain = (
WordlistFr
.filter(Q(search_text__icontains=query_word) | Q(text__icontains=query.query))
.exclude(Q(search_text__startswith=query_word) | Q(text__startswith=query.query) | Q(text=query.query))
.only("text", "freq")
.only("text", "freq")
)
contains_objs = await qs_contain[: need * 2]
contains = [(o.text, o.freq) for o in contains_objs]
seen_text, out = set(), []
for text, freq in list(exact_word) + list(prefix) + list(contains):
key = text
if key not in seen_text:
seen_text.add(key)
out.append((text, freq))
if len(out) >= limit:
break
out = sorted(out, key=lambda w: (-w[2], len(w[0]), w[0]))
return [text for text, _ in out]
else:
query_word = all_in_kana(query.query)
exact = await (
WordlistJp
.get_or_none(
text=query.query
)
.only("text", "hiragana", "freq")
)
if exact:
exact_word = [(exact.text, exact.hiragana, exact.freq)]
else:
exact_word = []
qs_prefix = (
WordlistJp
.filter(Q(hiragana__startswith=query_word) | Q(text__startswith=query.query))
.exclude(text=query.query)
.only("text", "hiragana", "freq")
)
prefix_objs = await qs_prefix[:limit]
prefix: List[Tuple[str, str, int]] = [(o.text, o.hiragana, o.freq) for o in prefix_objs]
need = max(0, limit - len(prefix))
contains: List[Tuple[str, str, int]] = []
if need > 0:
qs_contain = await (
WordlistJp
.filter(Q(hiragana__icontains=query_word) | Q(text__icontains=query.query))
.exclude(Q(hiragana__startswith=query_word) | Q(text__startswith=query.query) | Q(text=query.query))
.only("text", "hiragana", "freq")
)
contains_objs = qs_contain[:need * 2]
contains: List[Tuple[str, str, int]] = [(o.text, o.hiragana, o.freq) for o in contains_objs]
seen_text, out = set(), []
for text, hiragana, freq in list(exact_word) + list(prefix) + list(contains):
key = (text, hiragana)
if key not in seen_text:
seen_text.add(key)
out.append((text, hiragana, freq))
if len(out) >= limit:
break
out = sorted(out, key=lambda w: (-w[2], len(w[0]), w[0]))
return [(text, hiragana) for text, hiragana, _ in out]
async def __test():
query_word: str = '棋逢'
return await (
suggest_proverb(
query=ProverbSearchRequest(query=query_word),
lang='zh'
)
)
async def __main():
await Tortoise.init(config=TORTOISE_ORM)
print(await __test())
if __name__ == '__main__':
asyncio.run(__main())