search.py:

提供autocomplete接口进行联想和提示功能
all_kana.py:
将日语单词转换功能单独设立为功能模块
This commit is contained in:
Miyamizu-MitsuhaSang 2025-09-02 14:20:27 +08:00
parent da14488d4f
commit a18bd82654
2 changed files with 42 additions and 28 deletions

View File

@ -8,6 +8,7 @@ from app.models import DefinitionJp
from app.models.fr import DefinitionFr
from app.schemas.search_schemas import SearchRequest, SearchResponse, SearchItemFr, SearchItemJp
from app.utils.all_kana import all_in_kana
from app.utils.autocomplete import suggest_autocomplete
from app.utils.security import get_current_user
from app.utils.textnorm import normalize_text
from scripts.update_jp import normalize_jp_text
@ -79,9 +80,13 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current
# TODO 相关度排序(转换为模糊匹配)
# TODO 输入搜索框时反馈内容
# @dict_search.post("search/list")
# async def search_list(body: SearchRequest, user=Depends(get_current_user)):
# query = body.query
# if body.language == 'fr':
# query = normalize_text(query)
# prefix = await DefinitionFr.filter(word__text__icontains=query)
@dict_search.post("search/list")
async def search_list(query_word: SearchRequest, user=Depends(get_current_user)):
    """
    Suggestion endpoint used while the user is typing a search query.

    :param query_word: request body carrying the text typed so far
    :param user: resolved by the ``get_current_user`` dependency; not read in
        the body, it only gates access to the route
    :return: the candidate list produced by ``suggest_autocomplete``
    """
    # NOTE(review): the route path has no leading slash — confirm it combines
    # with the router prefix into the intended URL.
    return await suggest_autocomplete(query=query_word)

View File

@ -1,35 +1,44 @@
import unicodedata
import jaconv
import pykakasi
from pykakasi import kakasi
# Converter that maps every script (hiragana, katakana, kanji) to ASCII romaji.
# NOTE(review): setMode()/getConverter() appear to be the legacy pykakasi
# interface — confirm they are still supported by the installed version.
kks = pykakasi.kakasi()
kks.setMode("H", "a") # hiragana -> ascii (romaji)
kks.setMode("K", "a") # katakana -> ascii
kks.setMode("J", "a") # kanji -> ascii
kks.setMode("r", "Hepburn") # use Hepburn romanization
conv = kks.getConverter()
# ---- Global initialization (performed once, at import time) ----
_kakasi = kakasi()
_kakasi.setMode("J", "H") # kanji -> hiragana (approximate reading taken from the dictionary)
_kakasi.setMode("K", "H") # katakana -> hiragana
_kakasi.setMode("H", "H") # hiragana -> hiragana (unchanged)
# Optional: spaces/punctuation of the original text are kept; strip them separately if needed
_converter = _kakasi.getConverter()
def all_in_kana(text: str) -> str:
    """Convert mixed Japanese input to normalized hiragana.

    Accepts kanji, hiragana, katakana and half-width kana in any mix.

    :param text: raw input string; a falsy value (``""`` or ``None``) is
        accepted and short-circuits
    :return: the hiragana rendering of ``text``, or ``""`` for falsy input.
        Kanji readings come from pykakasi's built-in dictionary, so some
        proper nouns may be approximate.
    """
    if not text:
        return ""

    # NFKC folds full-width/half-width and compatibility forms (half-width
    # katakana included) so visually identical inputs normalize the same way.
    s = unicodedata.normalize("NFKC", text).strip()

    # Kana-only input is already fully handled by this step.
    s = jaconv.kata2hira(s)

    # Kanji (and any katakana that slipped through) -> hiragana reading.
    hira = _converter.do(s)

    # Safety net: guarantee the result carries no katakana. Inner whitespace
    # is preserved on purpose; callers that do not want it can remove it.
    return jaconv.kata2hira(hira)
if __name__ == '__main__':
    # Manual smoke check: print the conversion of a kanji-only word.
    sample = '能力'
    print(all_in_kana(sample))