更新谚语搜索,调整搜索函数

This commit is contained in:
Miyamizu-MitsuhaSang 2025-11-03 13:01:25 +08:00
parent fe6c84e310
commit 2edd3e7a56
9 changed files with 197 additions and 14 deletions

View File

@ -5,10 +5,11 @@ from fastapi import APIRouter, Depends, HTTPException, Request, Form
from app.api.search_dict import service
from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse, SearchItemFr, SearchItemJp, \
ProverbSearchRequest
from app.api.search_dict.service import suggest_autocomplete
from app.api.search_dict.service import suggest_autocomplete, accurate_proverb
from app.api.word_comment.word_comment_schemas import CommentSet
from app.models import DefinitionJp, CommentFr, CommentJp
from app.models.fr import DefinitionFr, ProverbFr
from app.models.jp import IdiomJp
from app.utils.all_kana import all_in_kana
from app.utils.security import get_current_user
from app.utils.textnorm import normalize_text
@ -165,8 +166,8 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u
@dict_search.post("/search/proverb/list")
async def search_proverb_list(query_word: ProverbSearchRequest):
lang = service.detect_language(text=query_word.query)
async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
lang = service.detect_language(text=query_word.query)[1]
query = normalize_text(query_word.query) if lang == "fr" else query_word.query
suggest_proverbs = await service.suggest_proverb(
query=query_word.query,
@ -174,10 +175,40 @@ async def search_proverb_list(query_word: ProverbSearchRequest):
model=ProverbFr,
search_field="search_text",
)
# TODO 使用法语词典时是否存在用英语输入的情况
return {"list": suggest_proverbs}
@dict_search.post("/search/proverb")
async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)):
    """Return the exact proverb identified by *proverb_id* (login required)."""
    lookup = await service.accurate_proverb(proverb_id=proverb_id)
    return {"result": lookup}
@dict_search.post("/search/idiom/list")
async def search_idiom_list(query_idiom: ProverbSearchRequest):
    """Suggest Japanese idioms matching the query.

    Rejects requests made against the French dictionary, then searches
    ``IdiomJp`` with the (kana-normalised, for Japanese input) query.  For
    simplified-Chinese input, ``detect_language`` also returns the
    traditional-character form, which is used for a supplementary search
    whose hits are prepended to the result list.

    Raises:
        HTTPException: 400 when ``dict_language`` is "fr".
    """
    if query_idiom.dict_language == "fr":
        raise HTTPException(status_code=400, detail="Dict language Error")
    trad_query, lang = service.detect_language(text=query_idiom.query)
    # Japanese input is normalised to kana before matching against search_text.
    query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query
    result = await service.suggest_proverb(
        query=query,
        lang=lang,
        model=IdiomJp,
        search_field="search_text",
        target_field="text",
    )
    if lang == "zh":
        # Bug fix: detect_language already returned the traditional-character
        # form of a simplified-Chinese query specifically for this
        # supplementary search; previously it was overwritten here with
        # all_in_kana(query), discarding that conversion.
        # search_field/target_field are passed explicitly to match the
        # primary IdiomJp query above.
        search_idioms_from_chi = await service.suggest_proverb(
            query=trad_query,
            lang="jp",
            model=IdiomJp,
            search_field="search_text",
            target_field="text",
        )
        result[:0] = search_idioms_from_chi
    return {"list": result}
@dict_search.post("/search/idiom")
async def search_idiom(query_id: int):
    """Look up a single idiom by id and return its detail payload.

    NOTE(review): this delegates to accurate_proverb(), which reads the
    French proverb table, although an accurate_idiom() helper exists in the
    service layer — confirm which table this endpoint is meant to query.
    """
    found = await accurate_proverb(proverb_id=query_id)
    return {"result": found}

View File

@ -2,6 +2,7 @@ import re
from typing import List, Tuple, Dict, Literal, Type
from fastapi import HTTPException
from opencc import OpenCC
from tortoise import Tortoise, Model
from tortoise.expressions import Q
@ -13,18 +14,50 @@ from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM
def detect_language(text: str) -> Tuple[str, Literal["fr", "zh", "jp", "other"]]:
    """Detect the language of *text*.

    Returns a ``(query, lang)`` tuple where ``lang`` is one of:
      - ``"jp"``: contains kana, or Han characters in traditional/Japanese form
      - ``"zh"``: simplified Chinese; ``query`` is then the *traditional*
        conversion, intended for a supplementary (Japanese) search
      - ``"fr"``: Latin letters (including the accented À-ÿ range)
      - ``"other"``: anything else, or empty input

    For every case except ``"zh"``, ``query`` is the stripped input itself.
    """
    JAPANESE_HIRAGANA = r"[\u3040-\u309F]"
    JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]"

    text = text.strip()
    if not text:
        return "", "other"

    # Step 1: any kana means Japanese, regardless of accompanying kanji.
    if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text):
        return text, "jp"

    # Step 2: Han characters — distinguish simplified vs. traditional by
    # round-tripping through OpenCC.  The converters are built lazily here
    # (instead of unconditionally at function entry) because only this
    # branch needs them.
    if re.search(r"[\u4e00-\u9fff]", text):
        to_trad = OpenCC('s2t').convert(text)  # simplified -> traditional
        to_simp = OpenCC('t2s').convert(text)  # traditional -> simplified
        if text == to_trad and text != to_simp:
            # Already in traditional form: treat as Japanese-style kanji.
            return text, "jp"
        elif text == to_simp and text != to_trad:
            # Simplified Chinese; return the traditional form so callers
            # can run a supplementary search with it.
            return to_trad, "zh"
        else:
            # Mixed simplified/traditional (or conversion-stable) text:
            # default to Japanese.
            return to_trad, "jp"

    # Step 3: Latin letters (French and other Latin-script input).
    if re.search(r"[a-zA-ZÀ-ÿ]", text):
        return text, "fr"

    return text, "other"
async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
    """Fetch one French proverb by id, bump its hit counter, return a DTO.

    Raises:
        HTTPException: 404 if no proverb has this id.
    """
    proverb = await ProverbFr.get_or_none(id=proverb_id)
    if not proverb:
        raise HTTPException(status_code=404, detail="Proverb not found")
    # Persist only the counter so this read-modify-write does not clobber
    # concurrent edits to other columns.
    proverb.freq += 1
    await proverb.save(update_fields=["freq"])
    return ProverbSearchResponse(
        proverb_text=proverb.text,
        chi_exp=proverb.chi_exp,
    )
async def accurate_idiom(idiom_id: int):
    """Fetch one Japanese idiom by id, bump its hit counter, return the model.

    Raises:
        HTTPException: 404 if no idiom has this id.
    """
    # Local import keeps this fix self-contained even if the module header
    # does not import IdiomJp.
    from app.models.jp import IdiomJp

    # Bug fix: this helper previously queried ProverbFr, so idiom ids were
    # looked up in the French proverb table instead of idiom_jp.
    idiom = await IdiomJp.get_or_none(id=idiom_id)
    if not idiom:
        raise HTTPException(status_code=404, detail="Idiom not found")
    idiom.freq += 1
    await idiom.save(update_fields=["freq"])
    return idiom
async def suggest_proverb(
query: str,

View File

@ -2,4 +2,4 @@ from . import signals
from .base import User
from .comments import CommentFr, CommentJp
from .fr import WordlistFr, DefinitionFr, AttachmentFr, PronunciationTestFr
from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp
from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp, IdiomJp, KangjiMapping

View File

@ -94,7 +94,18 @@ class IdiomJp(Model):
chi_exp = fields.TextField(null=False)
example = fields.TextField(null=False)
search_text = fields.TextField(null=False)
freq = fields.IntField(defualt=0, null=False)
created_at = fields.DatetimeField(auto_now_add=True)
class Meta:
table = "idiom_jp"
class KangjiMapping(Model):
    """One Chinese-hanzi ↔ Japanese-kanji correspondence row."""

    id = fields.IntField(pk=True)
    hanzi = fields.TextField(null=False)   # Chinese character form
    kangji = fields.TextField(null=False)  # Japanese kanji form
    note = fields.TextField(null=False)
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "kangji_mapping_zh_jp"

0
intro/__init__.py Normal file
View File

View File

@ -0,0 +1,37 @@
import asyncio
from pathlib import Path
import pandas as pd
from tortoise import Tortoise
from app.models import KangjiMapping
from settings import TORTOISE_ORM
class JapaneseIntro:
    """Importer for the Chinese↔Japanese kanji mapping spreadsheet."""

    # Source spreadsheet, resolved relative to the working directory.
    kangji_mapping: Path = Path("./中日汉字映射表_自动扩充_约3000条.xlsx")

    @classmethod
    async def kangji_mapping_intro(cls):
        """Load every spreadsheet row into the KangjiMapping table.

        Columns are read positionally: col 1 = hanzi, col 2 = kangji,
        col 4 = note (col 3 is skipped) — TODO confirm against the file's
        actual column layout.
        """
        df = pd.read_excel(cls.kangji_mapping)
        df.columns = [col.strip() for col in df.columns]
        for row in df.itertuples():
            # The created object was previously bound to an unused local;
            # the binding is dropped here.
            await KangjiMapping.create(
                hanzi=row[1],
                kangji=row[2],
                note=row[4],
            )
        print("导入完成")
async def main():
    """Initialise the ORM, wipe the mapping table, and re-import it."""
    await Tortoise.init(config=TORTOISE_ORM)
    try:
        await KangjiMapping.all().delete()
        await JapaneseIntro.kangji_mapping_intro()
    finally:
        # Close DB connections so the script exits cleanly even on error;
        # previously they were left open.
        await Tortoise.close_connections()


if __name__ == '__main__':
    asyncio.run(main())

View File

@ -33,6 +33,7 @@ jaconv==0.4.0
jiter==0.11.1
numpy==2.3.2
openai==2.6.1
opencc-python-reimplemented==0.1.7
openpyxl==3.1.5
pandas==2.3.1
pandas-stubs==2.3.2.250827

0
scripts/jp/__init__.py Normal file
View File

60
scripts/jp/chi2kangji.py Normal file
View File

@ -0,0 +1,60 @@
"""Build a simplified/traditional/Japanese kanji mapping spreadsheet.

Downloads the Unicode Unihan database, cross-references
kSimplifiedVariant, kZVariant, kJapaneseOn and kJapaneseKun, and writes
the derived six-column mapping to an Excel file.
"""
import io
import re
import zipfile

import pandas as pd
import requests

url = "https://www.unicode.org/Public/15.1.0/ucd/Unihan.zip"

print("📦 正在下载 Unihan 数据包...")
# Timeout added so a stalled download cannot hang the script forever.
r = requests.get(url, timeout=120)
r.raise_for_status()

with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    txt = z.read("Unihan_Variants.txt").decode("utf-8") + \
          "\n" + z.read("Unihan_Readings.txt").decode("utf-8")
print("✅ 数据加载成功")

# --- Extract the required Unihan fields ---
re_simpl = re.compile(r"U\+([0-9A-F]+)\tkSimplifiedVariant\t(U\+[0-9A-F]+)")
re_zvar = re.compile(r"U\+([0-9A-F]+)\tkZVariant\t(U\+[0-9A-F]+)")
re_jp_on = re.compile(r"U\+([0-9A-F]+)\tkJapaneseOn\t(.+)")
re_jp_kun = re.compile(r"U\+([0-9A-F]+)\tkJapaneseKun\t(.+)")

simpl_map, zvar_map, jp_on, jp_kun = {}, {}, {}, {}

# traditional -> simplified
for m in re_simpl.finditer(txt):
    trad_hex, simp_hex = m.groups()
    simpl_map[chr(int(trad_hex, 16))] = chr(int(simp_hex, 16))

# character -> Z-variant (used below as traditional -> Japanese shinjitai)
for m in re_zvar.finditer(txt):
    base_hex, var_hex = m.groups()
    zvar_map[chr(int(base_hex, 16))] = chr(int(var_hex, 16))

# on-reading, with internal spaces stripped
for m in re_jp_on.finditer(txt):
    code_hex, reading = m.groups()
    jp_on[chr(int(code_hex, 16))] = reading.replace(" ", "")

# kun-reading, with internal spaces stripped
for m in re_jp_kun.finditer(txt):
    code_hex, reading = m.groups()
    jp_kun[chr(int(code_hex, 16))] = reading.replace(" ", "")

rows = []
for trad, simp in simpl_map.items():
    # Key step: find the traditional -> Japanese-shinjitai variant relation.
    if trad in zvar_map:
        jp_char = zvar_map[trad]
        if jp_char in jp_on or jp_char in jp_kun:
            # The original expression concatenated the on- and kun-readings
            # with an empty separator; that reduces to plain concatenation.
            kana = jp_on.get(jp_char, "") + jp_kun.get(jp_char, "")
            rows.append([simp, trad, jp_char, kana, "", "由繁体→简体+异体→日语新字体推导"])

df = pd.DataFrame(rows, columns=["简体汉字", "繁体汉字", "日语汉字", "假名读音", "是否异体", "备注"])
df.to_excel("中日汉字映射表_六列综合版.xlsx", index=False)
print(f"✅ 已生成文件,共 {len(df)} 条记录。")