autocomplete.py: 添加搜索联想功能模块,但未加入 search 接口
all_kana.py: 将日语单词转换功能单独设立为功能模块
This commit is contained in:
parent
f675e5ce95
commit
af254cecfb
|
|
@ -7,45 +7,13 @@ from fastapi import APIRouter, Depends, HTTPException, Request
|
|||
from app.models import DefinitionJp
|
||||
from app.models.fr import DefinitionFr
|
||||
from app.schemas.search_schemas import SearchRequest, SearchResponse, SearchItemFr, SearchItemJp
|
||||
from app.utils.all_kana import all_in_kana
|
||||
from app.utils.security import get_current_user
|
||||
from app.utils.textnorm import normalize_text
|
||||
from scripts.update_jp import normalize_jp_text
|
||||
|
||||
dict_search = APIRouter()
|
||||
|
||||
def _build_romaji_converter():
    """Create the pykakasi instance configured to romanize every Japanese script."""
    kakasi = pykakasi.kakasi()
    # "H" = hiragana, "K" = katakana, "J" = kanji; each is mapped to ASCII ("a").
    for script in ("H", "K", "J"):
        kakasi.setMode(script, "a")
    # Use the Hepburn romanization table.
    kakasi.setMode("r", "Hepburn")
    return kakasi


kks = _build_romaji_converter()
conv = kks.getConverter()
|
||||
|
||||
|
||||
def all_in_kana(text: str) -> str:
    """Normalize a Japanese search term to hiragana.

    Supported input:
      - hiragana (returned unchanged)
      - katakana (folded to hiragana)
      - romaji (transliterated to hiragana)

    :param text: raw user input; may mix the scripts listed above
    :return: the input rewritten in hiragana, or "" when the input is empty
    """
    if not text:
        return ""

    # 1. Katakana -> hiragana, so only Latin letters are left to handle.
    normalized = jaconv.kata2hira(text)

    # 2. Romaji -> hiragana. The module-level pykakasi converter is configured
    #    for the opposite direction (kana/kanji -> ASCII), so it cannot do this
    #    step; jaconv.alphabet2kana is the romaji -> hiragana transliterator.
    #    Lower-case first because the romaji table is lower-case.
    if any("a" <= ch.lower() <= "z" for ch in normalized):
        normalized = jaconv.alphabet2kana(normalized.lower())

    # 3. Final katakana -> hiragana pass as a safety net.
    return jaconv.kata2hira(normalized)
|
||||
|
||||
|
||||
@dict_search.post("/search", response_model=SearchResponse)
|
||||
async def search(request: Request, body: SearchRequest, user=Depends(get_current_user)):
|
||||
|
|
@ -107,5 +75,13 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current
|
|||
contents=contents,
|
||||
)
|
||||
|
||||
|
||||
# TODO 相关度排序(转换为模糊匹配)
|
||||
# TODO 输入搜索框时反馈内容
|
||||
|
||||
# @dict_search.post("search/list")
|
||||
# async def search_list(body: SearchRequest, user=Depends(get_current_user)):
|
||||
# query = body.query
|
||||
# if body.language == 'fr':
|
||||
# query = normalize_text(query)
|
||||
# prefix = await DefinitionFr.filter(word__text__icontains=query)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,35 @@
|
|||
import jaconv
|
||||
import pykakasi
|
||||
|
||||
def _build_romaji_converter():
    """Create the pykakasi instance configured to romanize every Japanese script."""
    kakasi = pykakasi.kakasi()
    # "H" = hiragana, "K" = katakana, "J" = kanji; each is mapped to ASCII ("a").
    for script in ("H", "K", "J"):
        kakasi.setMode(script, "a")
    # Use the Hepburn romanization table.
    kakasi.setMode("r", "Hepburn")
    return kakasi


kks = _build_romaji_converter()
conv = kks.getConverter()
|
||||
|
||||
|
||||
def all_in_kana(text: str) -> str:
    """Normalize a Japanese search term to hiragana.

    Supported input:
      - hiragana (returned unchanged)
      - katakana (folded to hiragana)
      - romaji (transliterated to hiragana)

    :param text: raw user input; may mix the scripts listed above
    :return: the input rewritten in hiragana, or "" when the input is empty
    """
    if not text:
        return ""

    # 1. Katakana -> hiragana, so only Latin letters are left to handle.
    normalized = jaconv.kata2hira(text)

    # 2. Romaji -> hiragana. The module-level pykakasi converter is configured
    #    for the opposite direction (kana/kanji -> ASCII), so it cannot do this
    #    step; jaconv.alphabet2kana is the romaji -> hiragana transliterator.
    #    Lower-case first because the romaji table is lower-case.
    if any("a" <= ch.lower() <= "z" for ch in normalized):
        normalized = jaconv.alphabet2kana(normalized.lower())

    # 3. Final katakana -> hiragana pass as a safety net.
    return jaconv.kata2hira(normalized)
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
import asyncio
|
||||
|
||||
from tortoise import Tortoise
|
||||
from tortoise.expressions import Q
|
||||
from typing import List, Literal, Tuple
|
||||
|
||||
from app.models import WordlistFr, WordlistJp
|
||||
from app.schemas.search_schemas import SearchRequest
|
||||
from app.utils.all_kana import all_in_kana
|
||||
from app.utils.textnorm import normalize_text
|
||||
from settings import TORTOISE_ORM
|
||||
|
||||
|
||||
async def suggest_autocomplete(query: SearchRequest, limit: int = 10) -> List[str]:
    """Return autocomplete suggestions for the text the user has typed so far.

    Prefix matches are collected first; if they do not fill ``limit``,
    substring matches that are not prefix matches are added. Duplicated
    words are dropped and at most ``limit`` words are returned.

    :param query: the current user input (``query``) and its ``language``;
        'fr' searches WordlistFr, any other value searches WordlistJp
    :param limit: maximum length of the returned list
    :return: suggested words only (plain text, not full entries)
    """
    if limit <= 0:
        return []

    # Pick the model and lookup expressions per language so the fetch/merge
    # logic below exists only once.
    if query.language == 'fr':
        query_word = normalize_text(query.query)
        model = WordlistFr
        is_prefix = Q(search_text__startswith=query_word)
        is_substring = Q(search_text__icontains=query_word)
        already_prefix = Q(search_text__startswith=query_word)
    else:
        query_word = all_in_kana(query.query)
        model = WordlistJp
        is_prefix = Q(hiragana__startswith=query_word)
        is_substring = Q(hiragana__icontains=query_word) | Q(text__icontains=query_word)
        already_prefix = Q(hiragana__startswith=query_word) | Q(text__startswith=query_word)

    # An empty prefix matches every row; there is nothing meaningful to suggest.
    if not query_word:
        return []

    # Cap the result set in SQL rather than loading every match and slicing.
    prefix_rows = await model.filter(is_prefix).limit(limit).only("text", "freq")
    candidates: List[Tuple[str, int]] = [(row.text, row.freq) for row in prefix_rows]

    need = limit - len(candidates)
    if need > 0:
        # Over-fetch (2x) because some substring hits may repeat a word that
        # is already present and will be removed by de-duplication.
        substring_rows = await (
            model
            .filter(is_substring)
            .exclude(already_prefix)
            .limit(need * 2)
            .only("text", "freq")
        )
        candidates.extend((row.text, row.freq) for row in substring_rows)

    # De-duplicate by word, keeping prefix matches ahead of substring matches
    # when deciding which entries make the cut.
    seen_text = set()
    out: List[Tuple[str, int]] = []
    for text, freq in candidates:
        if text in seen_text:
            continue
        seen_text.add(text)
        out.append((text, freq))
        if len(out) >= limit:
            break

    # NOTE(review): sorts by ascending freq, then shorter words, then text.
    # Confirm whether a smaller freq means "more common" (rank) or not (count).
    out.sort(key=lambda item: (item[1], len(item[0]), item[0]))
    return [text for text, _ in out]
|
||||
|
||||
|
||||
async def __test():
    """Run one sample Japanese query through suggest_autocomplete and return its result."""
    sample_request = SearchRequest(query='あい', language='jp')
    return await suggest_autocomplete(sample_request)
|
||||
|
||||
|
||||
async def __main():
    """Initialise Tortoise ORM, print the sample suggestions, then release the DB connections."""
    await Tortoise.init(config=TORTOISE_ORM)
    try:
        print(await __test())
    finally:
        # Connections opened by Tortoise.init must be closed explicitly;
        # otherwise they are left open when the script exits.
        await Tortoise.close_connections()


if __name__ == '__main__':
    asyncio.run(__main())
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
jaconv
|
||||
fugashi
|
||||
pykakasi
|
||||
aerich==0.9.1
|
||||
|
|
|
|||
Loading…
Reference in New Issue