autocomplete.py:

添加搜索联想功能模块,但未加入search接口
all_kana.py:
将日语单词转换功能单独设立为功能模块
This commit is contained in:
Miyamizu-MitsuhaSang 2025-09-01 15:35:39 +08:00
parent f675e5ce95
commit dac3926978
4 changed files with 152 additions and 33 deletions

View File

@ -7,45 +7,13 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from app.models import DefinitionJp
from app.models.fr import DefinitionFr
from app.schemas.search_schemas import SearchRequest, SearchResponse, SearchItemFr, SearchItemJp
from app.utils.all_kana import all_in_kana
from app.utils.security import get_current_user
from app.utils.textnorm import normalize_text
from scripts.update_jp import normalize_jp_text
dict_search = APIRouter()
# Shared pykakasi converter, configured once at import time.
kks = pykakasi.kakasi()
kks.setMode("H", "a")  # hiragana -> ascii (romaji)
kks.setMode("K", "a")  # katakana -> ascii
kks.setMode("J", "a")  # kanji -> ascii
kks.setMode("r", "Hepburn")  # use Hepburn romanization rules
conv = kks.getConverter()
def all_in_kana(text: str) -> str:
    """
    Normalize Japanese input to hiragana.

    Accepts hiragana, katakana, or romaji (Hepburn-style) input and
    returns the equivalent hiragana string.

    :param text: raw user input (may be empty)
    :return: hiragana string, or "" for empty/falsy input
    """
    if not text:
        return ""
    # 1. katakana -> hiragana
    normalized = jaconv.kata2hira(text)
    # 2. If ascii letters remain, treat them as romaji and convert to kana.
    #    Bug fix: the module-level pykakasi converter is configured for
    #    kana/kanji -> romaji (setMode("H"/"K"/"J", "a")) — the OPPOSITE
    #    direction — so conv.do() could never yield kana here.
    #    jaconv.alphabet2kana performs the intended romaji -> hiragana step.
    if any("a" <= ch.lower() <= "z" for ch in normalized):
        normalized = jaconv.alphabet2kana(normalized)
    # 3. Defensive second pass in case any katakana slipped through.
    normalized = jaconv.kata2hira(normalized)
    return normalized
@dict_search.post("/search", response_model=SearchResponse)
async def search(request: Request, body: SearchRequest, user=Depends(get_current_user)):
@ -107,5 +75,13 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current
contents=contents,
)
# TODO 相关度排序(转换为模糊匹配)
# TODO 输入搜索框时反馈内容
# @dict_search.post("search/list")
# async def search_list(body: SearchRequest, user=Depends(get_current_user)):
# query = body.query
# if body.language == 'fr':
# query = normalize_text(query)
# prefix = await DefinitionFr.filter(word__text__icontains=query)

35
app/utils/all_kana.py Normal file
View File

@ -0,0 +1,35 @@
import jaconv
import pykakasi
# Shared pykakasi converter, configured once at import time.
kks = pykakasi.kakasi()
kks.setMode("H", "a")  # hiragana -> ascii (romaji)
kks.setMode("K", "a")  # katakana -> ascii
kks.setMode("J", "a")  # kanji -> ascii
kks.setMode("r", "Hepburn")  # use Hepburn romanization rules
conv = kks.getConverter()
def all_in_kana(text: str) -> str:
    """
    Normalize Japanese input to hiragana.

    Accepts hiragana, katakana, or romaji (Hepburn-style) input and
    returns the equivalent hiragana string.

    :param text: raw user input (may be empty)
    :return: hiragana string, or "" for empty/falsy input
    """
    if not text:
        return ""
    # 1. katakana -> hiragana
    normalized = jaconv.kata2hira(text)
    # 2. If ascii letters remain, treat them as romaji and convert to kana.
    #    Bug fix: the module-level pykakasi converter is configured for
    #    kana/kanji -> romaji (setMode("H"/"K"/"J", "a")) — the OPPOSITE
    #    direction — so conv.do() could never yield kana here.
    #    jaconv.alphabet2kana performs the intended romaji -> hiragana step.
    if any("a" <= ch.lower() <= "z" for ch in normalized):
        normalized = jaconv.alphabet2kana(normalized)
    # 3. Defensive second pass in case any katakana slipped through.
    normalized = jaconv.kata2hira(normalized)
    return normalized

107
app/utils/autocomplete.py Normal file
View File

@ -0,0 +1,107 @@
import asyncio
from tortoise import Tortoise
from tortoise.expressions import Q
from typing import List, Literal, Tuple
from app.models import WordlistFr, WordlistJp
from app.schemas.search_schemas import SearchRequest
from app.utils.all_kana import all_in_kana
from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM
async def suggest_autocomplete(query: SearchRequest, limit: int = 10) -> List[str]:
    """
    Suggest completions for a partially typed search query.

    Candidates are gathered in priority order — exact match, then
    prefix matches, then substring matches — deduplicated, truncated
    to *limit*, and finally sorted.

    :param query: the user's current input (text plus 'fr'/'jp' language flag)
    :param limit: maximum number of suggestions to return
    :return: list of suggested words (bare words, no definitions)
    """
    # Build per-language pieces once, then run one shared pipeline.
    if query.language == 'fr':
        model = WordlistFr
        query_word = normalize_text(query.query)
        prefix_q = Q(search_text__startswith=query_word) | Q(text__startswith=query.query)
        contain_q = Q(search_text__icontains=query_word) | Q(text__icontains=query.query)
    else:
        model = WordlistJp
        query_word = all_in_kana(query.query)
        prefix_q = Q(hiragana__startswith=query_word) | Q(text__startswith=query.query)
        contain_q = Q(hiragana__icontains=query_word) | Q(text__icontains=query.query)

    # Bug fix: get_or_none may return None for an unknown word; the
    # original dereferenced the result unconditionally (exact.get(...) /
    # exact.text) and raised AttributeError whenever there was no exact hit.
    exact_obj = await model.get_or_none(text=query.query)
    exact_word: List[Tuple[str, int]] = (
        [(exact_obj.text, exact_obj.freq)] if exact_obj is not None else []
    )

    prefix_objs = await (
        model
        .filter(prefix_q)
        .exclude(text=query.query)
        .only("text", "freq")[:limit]
    )
    prefix: List[Tuple[str, int]] = [(o.text, o.freq) for o in prefix_objs]

    need = max(0, limit - len(prefix))
    contains: List[Tuple[str, int]] = []
    if need > 0:
        # Over-fetch (need * 2) so deduplication below still leaves
        # enough candidates to fill the limit. The queryset is sliced
        # BEFORE awaiting so the LIMIT runs in the database (the original
        # jp branch awaited first and sliced the full result list).
        contains_objs = await (
            model
            .filter(contain_q)
            .exclude(prefix_q | Q(text=query.query))
            .only("text", "freq")[: need * 2]
        )
        contains = [(o.text, o.freq) for o in contains_objs]

    # Deduplicate while preserving priority: exact > prefix > contains.
    seen_text, out = set(), []
    for text, freq in exact_word + prefix + contains:
        if text not in seen_text:
            seen_text.add(text)
            out.append((text, freq))
            if len(out) >= limit:
                break
    # NOTE(review): sorts by ASCENDING freq (rarest first), exactly as the
    # original did — confirm this is intended; descending frequency is the
    # more common autocomplete ranking.
    out.sort(key=lambda w: (w[1], len(w[0]), w[0]))
    return [text for text, _ in out]
async def __test():
    """Ad-hoc smoke test: fetch suggestions for an empty Japanese query."""
    sample_text: str = ''
    sample_lang: Literal['fr', 'jp'] = 'jp'
    request = SearchRequest(query=sample_text, language=sample_lang)
    return await suggest_autocomplete(request)
async def __main():
    # Initialise the ORM connection so the ad-hoc query in __test can hit the DB.
    await Tortoise.init(config=TORTOISE_ORM)
    print(await __test())
if __name__ == '__main__':
    asyncio.run(__main())

View File

@ -1,3 +1,4 @@
jaconv
fugashi
pykakasi
aerich==0.9.1