From af254cecfbeac63b32cb68a5a3735ddff2e474cc Mon Sep 17 00:00:00 2001
From: Miyamizu-MitsuhaSang <2510681107@qq.com>
Date: Mon, 1 Sep 2025 15:35:39 +0800
Subject: [PATCH] =?UTF-8?q?autocomplete.py:=20=E6=B7=BB=E5=8A=A0=E6=90=9C?=
 =?UTF-8?q?=E7=B4=A2=E8=81=94=E6=83=B3=E5=8A=9F=E8=83=BD=E6=A8=A1=E5=9D=97?=
 =?UTF-8?q?=EF=BC=8C=E4=BD=86=E6=9C=AA=E5=8A=A0=E5=85=A5search=E6=8E=A5?=
 =?UTF-8?q?=E5=8F=A3=20all=5Fkana.py:=20=E5=B0=86=E6=97=A5=E8=AF=AD?=
 =?UTF-8?q?=E5=8D=95=E8=AF=8D=E8=BD=AC=E6=8D=A2=E5=8A=9F=E8=83=BD=E5=8D=95?=
 =?UTF-8?q?=E7=8B=AC=E8=AE=BE=E7=AB=8B=E4=B8=BA=E5=8A=9F=E8=83=BD=E6=A8=A1?=
 =?UTF-8?q?=E5=9D=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/api/search.py         | 42 ++++--------------
 app/utils/all_kana.py     | 35 +++++++++++++++
 app/utils/autocomplete.py | 91 +++++++++++++++++++++++++++++++++++++++
 requirements.txt          |  1 +
 4 files changed, 136 insertions(+), 33 deletions(-)
 create mode 100644 app/utils/all_kana.py
 create mode 100644 app/utils/autocomplete.py

diff --git a/app/api/search.py b/app/api/search.py
index 3780fe5..05e4895 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -7,45 +7,13 @@ from fastapi import APIRouter, Depends, HTTPException, Request
 from app.models import DefinitionJp
 from app.models.fr import DefinitionFr
 from app.schemas.search_schemas import SearchRequest, SearchResponse, SearchItemFr, SearchItemJp
+from app.utils.all_kana import all_in_kana
 from app.utils.security import get_current_user
 from app.utils.textnorm import normalize_text
 from scripts.update_jp import normalize_jp_text
 
 dict_search = APIRouter()
 
-kks = pykakasi.kakasi()
-kks.setMode("H", "a") # 平假名 -> ascii (罗马字)
-kks.setMode("K", "a") # 片假名 -> ascii
-kks.setMode("J", "a") # 汉字 -> ascii
-kks.setMode("r", "Hepburn") # 转换成 Hepburn 罗马字
-conv = kks.getConverter()
-
-
-def all_in_kana(text: str) -> str:
-    """
-    将输入统一转换为平假名,支持:
-    - 平假名
-    - 片假名
-    - 罗马字 (Hepburn 转写)
-
-    返回:平假名字符串
-    """
-    if not text:
-        return ""
-
-    # 1. 片假名 → 平假名
-    normalized = jaconv.kata2hira(text)
-
-    # 2. 如果里面含有罗马字字符,就先转成假名
-    if any("a" <= ch.lower() <= "z" for ch in normalized):
-        hira = conv.do(normalized) # 罗马字 -> 平假名
-        normalized = jaconv.kata2hira(hira)
-
-    # 3. 再次片假名 -> 平假名保险
-    normalized = jaconv.kata2hira(normalized)
-
-    return normalized
-
 
 @dict_search.post("/search", response_model=SearchResponse)
 async def search(request: Request, body: SearchRequest, user=Depends(get_current_user)):
@@ -107,5 +75,13 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current
         contents=contents,
     )
 
+
 # TODO 相关度排序(转换为模糊匹配)
 # TODO 输入搜索框时反馈内容
+
+# @dict_search.post("search/list")
+# async def search_list(body: SearchRequest, user=Depends(get_current_user)):
+#     query = body.query
+#     if body.language == 'fr':
+#         query = normalize_text(query)
+#     prefix = await DefinitionFr.filter(word__text__icontains=query)
diff --git a/app/utils/all_kana.py b/app/utils/all_kana.py
new file mode 100644
index 0000000..293f0e4
--- /dev/null
+++ b/app/utils/all_kana.py
@@ -0,0 +1,35 @@
+import jaconv
+import pykakasi
+
+kks = pykakasi.kakasi()
+kks.setMode("H", "a") # 平假名 -> ascii (罗马字)
+kks.setMode("K", "a") # 片假名 -> ascii
+kks.setMode("J", "a") # 汉字 -> ascii
+kks.setMode("r", "Hepburn") # 转换成 Hepburn 罗马字
+conv = kks.getConverter()
+
+
+def all_in_kana(text: str) -> str:
+    """
+    将输入统一转换为平假名,支持:
+    - 平假名
+    - 片假名
+    - 罗马字 (Hepburn 转写)
+
+    返回:平假名字符串
+    """
+    if not text:
+        return ""
+
+    # 1. 片假名 → 平假名
+    normalized = jaconv.kata2hira(text)
+
+    # 2. 如果里面含有罗马字字符,就先转成假名
+    if any("a" <= ch.lower() <= "z" for ch in normalized):
+        hira = conv.do(normalized) # 罗马字 -> 平假名
+        normalized = jaconv.kata2hira(hira)
+
+    # 3. 再次片假名 -> 平假名保险
+    normalized = jaconv.kata2hira(normalized)
+
+    return normalized
diff --git a/app/utils/autocomplete.py b/app/utils/autocomplete.py
new file mode 100644
index 0000000..b6fb6ea
--- /dev/null
+++ b/app/utils/autocomplete.py
@@ -0,0 +1,91 @@
+import asyncio
+
+from tortoise import Tortoise
+from tortoise.expressions import Q
+from typing import List, Literal, Tuple
+
+from app.models import WordlistFr, WordlistJp
+from app.schemas.search_schemas import SearchRequest
+from app.utils.all_kana import all_in_kana
+from app.utils.textnorm import normalize_text
+from settings import TORTOISE_ORM
+
+
+async def suggest_autocomplete(query: SearchRequest, limit: int = 10) -> List[str]:
+    """Suggest autocomplete candidates for the current query.
+
+    :param query: 当前用户输入的内容
+    :param limit: 返回列表限制长度
+    :return: 联想的单词列表(非完整信息,单纯单词)
+    """
+    if query.language == 'fr':
+        query_word = normalize_text(query.query)
+        qs_prefix = await (
+            WordlistFr
+            .filter(search_text__startswith=query_word)
+            .only("text", "freq")
+        )
+        prefix_objs = qs_prefix[:limit]
+        prefix: List[Tuple[str, int]] = [(o.text, o.freq) for o in prefix_objs]
+
+        need = max(0, limit - len(prefix))
+        contains: List[Tuple[str, int]] = []
+
+        if need > 0:
+            qs_contain = await (
+                WordlistFr
+                .filter(search_text__icontains=query_word)
+                .exclude(search_text__startswith=query_word)
+                .only("text", "freq")
+            )
+            contains_objs = qs_contain[:need * 2]
+            contains: List[Tuple[str, int]] = [(o.text, o.freq) for o in contains_objs]
+
+    else:
+        query_word = all_in_kana(query.query)
+
+        qs_prefix = await (
+            WordlistJp
+            .filter(hiragana__startswith=query_word)
+            .only("text", "freq")
+        )
+
+        prefix_objs = qs_prefix[:limit]
+        prefix: List[Tuple[str, int]] = [(o.text, o.freq) for o in prefix_objs]
+
+        need = max(0, limit - len(prefix))
+        contains = []
+        if need > 0:
+            qs_contain = await (
+                WordlistJp
+                .filter(Q(hiragana__icontains=query_word) | Q(text__icontains=query_word))
+                .exclude(Q(hiragana__startswith=query_word) | Q(text__startswith=query_word))
+                .only("text", "freq")
+            )
+            contains_objs = qs_contain[:need * 2]
+            contains: List[Tuple[str, int]] = [(o.text, o.freq) for o in contains_objs]
+
+    seen_text, out = set(), []
+    for text, freq in prefix + contains:
+        if text not in seen_text:
+            seen_text.add(text)
+            out.append((text, freq))
+            if len(out) >= limit:
+                break
+    out = sorted(out, key=lambda w: (w[1], len(w[0]), w[0]))
+    return [text for text, _ in out]
+
+
+async def __test():
+    query_word: str = 'あい'
+    language: Literal['fr', 'jp'] = 'jp'
+    return await suggest_autocomplete(SearchRequest(query=query_word, language=language))
+
+
+async def __main():
+    await Tortoise.init(config=TORTOISE_ORM)
+    print(await __test())
+
+
+if __name__ == '__main__':
+    asyncio.run(__main())
diff --git a/requirements.txt b/requirements.txt
index de7e289..dac9bda 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+jaconv
 fugashi
 pykakasi
 aerich==0.9.1