Update proverb search; adjust the search functions
parent fe6c84e310
commit 2edd3e7a56
@@ -5,10 +5,11 @@ from fastapi import APIRouter, Depends, HTTPException, Request, Form
 from app.api.search_dict import service
 from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse, SearchItemFr, SearchItemJp, \
     ProverbSearchRequest
-from app.api.search_dict.service import suggest_autocomplete
+from app.api.search_dict.service import suggest_autocomplete, accurate_proverb
 from app.api.word_comment.word_comment_schemas import CommentSet
 from app.models import DefinitionJp, CommentFr, CommentJp
 from app.models.fr import DefinitionFr, ProverbFr
+from app.models.jp import IdiomJp
 from app.utils.all_kana import all_in_kana
 from app.utils.security import get_current_user
 from app.utils.textnorm import normalize_text
@@ -165,8 +166,8 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_user)):
 
 
 @dict_search.post("/search/proverb/list")
-async def search_proverb_list(query_word: ProverbSearchRequest):
-    lang = service.detect_language(text=query_word.query)
+async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
+    lang = service.detect_language(text=query_word.query)[1]
     query = normalize_text(query_word.query) if lang == "fr" else query_word.query
     suggest_proverbs = await service.suggest_proverb(
         query=query_word.query,
@@ -174,10 +175,40 @@
         model=ProverbFr,
         search_field="search_text",
     )
+    # TODO: when the French dictionary is in use, can the query still be typed in English?
     return {"list": suggest_proverbs}
 
 
 @dict_search.post("/search/proverb")
-async def search_proverb(proverb_id:int = Form(...), user=Depends(get_current_user)):
+async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)):
     result = await service.accurate_proverb(proverb_id=proverb_id)
 
     return {"result": result}
+
+
+@dict_search.post("/search/idiom/list")
+async def search_idiom_list(query_idiom: ProverbSearchRequest):
+    if query_idiom.dict_language == "fr":
+        raise HTTPException(status_code=400, detail="Dict language Error")
+    trad_query, lang = service.detect_language(text=query_idiom.query)
+    query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query
+    result = await service.suggest_proverb(
+        query=query,
+        lang=lang,
+        model=IdiomJp,
+        search_field="search_text",
+        target_field="text",
+    )
+    if lang == "zh":
+        trad_query = all_in_kana(text=query_idiom.query)
+        search_idioms_from_chi = await service.suggest_proverb(
+            query=trad_query,
+            lang="jp",
+            model=IdiomJp,
+        )
+        result[:0] = search_idioms_from_chi
+    return {"list": result}
+
+
+@dict_search.post("/search/idiom")
+async def search_idiom(query_id: int):
+    result = await accurate_proverb(proverb_id=query_id)
+    return {"result": result}
@@ -2,6 +2,7 @@ import re
 from typing import List, Tuple, Dict, Literal, Type
 
 from fastapi import HTTPException
+from opencc import OpenCC
 from tortoise import Tortoise, Model
 from tortoise.expressions import Q
 
@@ -13,18 +14,50 @@ from app.utils.textnorm import normalize_text
 from settings import TORTOISE_ORM
 
 
-def detect_language(text: str) -> Literal["fr", "zh", "jp", "other"]:
+def detect_language(text: str) -> Tuple[str, Literal["fr", "zh", "jp", "other"]]:
     """
     Auto-detect the language of the input.
     Returns 'zh' / 'jp' / 'fr' / 'other':
+    - zh: Simplified Chinese
+    - jp: Japanese (kana, or traditional/kyujitai kanji)
+    - fr: Latin script (French, etc.)
+    - other: anything else
     """
+    cc_s2t = OpenCC('s2t')  # Simplified → Traditional
+    cc_t2s = OpenCC('t2s')  # Traditional → Simplified
+
+    JAPANESE_HIRAGANA = r"[\u3040-\u309F]"
+    JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]"
+
+    text = text.strip()
+    if not text:
+        return "", "other"
+
+    # ✅ Step 1: kana detection
+    if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text):
+        return text, "jp"
+
+    # ✅ Step 2: Han character detection
     if re.search(r"[\u4e00-\u9fff]", text):
-        return "zh"
-    elif re.search(r"[\u3040-\u30ff\u31f0-\u31ff]", text):  # Japanese kana ranges
-        return "jp"
-    elif re.search(r"[a-zA-ZÀ-ÿ]", text):
-        return "fr"
-    return "other"
+        # Compare Simplified↔Traditional round-trips
+        to_trad = cc_s2t.convert(text)
+        to_simp = cc_t2s.convert(text)
+
+        # Input equals its Traditional conversion → Traditional Chinese or Japanese kanji
+        if text == to_trad and text != to_simp:
+            return text, "jp"
+        # Input equals its Simplified conversion → Simplified Chinese
+        elif text == to_simp and text != to_trad:
+            return to_trad, "zh"  # note: the Traditional form is returned for the supplementary search
+        # Otherwise mixed (both Simplified and Traditional present)
+        else:
+            # When mixed, prefer Traditional (treat as Japanese)
+            return to_trad, "jp"
+
+    # ✅ Step 3: Latin script detection
+    if re.search(r"[a-zA-ZÀ-ÿ]", text):
+        return text, "fr"
+
+    return text, "other"
 
 
 async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
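`detect_language` now returns a `(text, lang)` tuple: kana short-circuits to `jp`, Han-only input is classified by comparing OpenCC Simplified↔Traditional round-trips, and for Simplified Chinese the Traditional form comes back so the caller can run the supplementary idiom search. An illustrative sketch of the expected return values (outputs assume opencc-python-reimplemented's default dictionaries):

```python
from app.api.search_dict.service import detect_language

# Kana anywhere in the input wins immediately.
print(detect_language("ことわざ"))   # ("ことわざ", "jp")
# Simplified input: lang is "zh", but the Traditional form is returned.
print(detect_language("汉字"))       # ("漢字", "zh")
# Input already in Traditional form is treated as Japanese kanji.
print(detect_language("漢字"))       # ("漢字", "jp")
print(detect_language("proverbe"))  # ("proverbe", "fr")
print(detect_language("   "))       # ("", "other")
```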
@@ -32,11 +65,21 @@ async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
     proverb = await ProverbFr.get_or_none(id=proverb_id)
     if not proverb:
         raise HTTPException(status_code=404, detail="Proverb not found")
+    proverb.freq = proverb.freq + 1
+    await proverb.save()
     return ProverbSearchResponse(
         proverb_text=proverb.text,
         chi_exp=proverb.chi_exp,
     )
 
+
+async def accurate_idiom(idiom_id: int):
+    idiom = await IdiomJp.get_or_none(id=idiom_id)
+    if not idiom:
+        raise HTTPException(status_code=404, detail="Idiom not found")
+    idiom.freq = idiom.freq + 1
+    await idiom.save()
+    return idiom
 
 async def suggest_proverb(
     query: str,
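Both lookup helpers bump `freq` with a read-modify-write (`get_or_none`, increment, `save()`), which can lose counts under concurrent requests. A sketch of an atomic alternative using `tortoise.expressions.F`, a sibling of the `Q` already imported in this module; the helper name `bump_proverb_freq` is hypothetical:

```python
from tortoise.expressions import F

async def bump_proverb_freq(proverb_id: int) -> None:
    # Runs UPDATE ... SET freq = freq + 1 as a single statement, so
    # concurrent hits cannot overwrite each other's increment.
    await ProverbFr.filter(id=proverb_id).update(freq=F("freq") + 1)
```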
@@ -2,4 +2,4 @@ from . import signals
 from .base import User
 from .comments import CommentFr, CommentJp
 from .fr import WordlistFr, DefinitionFr, AttachmentFr, PronunciationTestFr
-from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp
+from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp, IdiomJp, KangjiMapping
@@ -94,7 +94,18 @@ class IdiomJp(Model):
     chi_exp = fields.TextField(null=False)
     example = fields.TextField(null=False)
     search_text = fields.TextField(null=False)
+    freq = fields.IntField(default=0, null=False)
     created_at = fields.DatetimeField(auto_now_add=True)
 
     class Meta:
         table = "idiom_jp"
 
+
+class KangjiMapping(Model):
+    id = fields.IntField(pk=True)
+    hanzi = fields.TextField(null=False)
+    kangji = fields.TextField(null=False)
+    note = fields.TextField(null=False)
+    created_at = fields.DatetimeField(auto_now_add=True)
+
+    class Meta:
+        table = "kangji_mapping_zh_jp"
@@ -0,0 +1,37 @@
+import asyncio
+from pathlib import Path
+
+import pandas as pd
+from tortoise import Tortoise
+
+from app.models import KangjiMapping
+from settings import TORTOISE_ORM
+
+
+class JapaneseIntro:
+    kangji_mapping: Path = Path("./中日汉字映射表_自动扩充_约3000条.xlsx")
+
+    @classmethod
+    async def kangji_mapping_intro(cls):
+        df = pd.read_excel(cls.kangji_mapping)
+        df.columns = [col.strip() for col in df.columns]
+
+        for row in df.itertuples():
+            hanzi = row[1]
+            kangji = row[2]
+            note = row[4]
+
+            await KangjiMapping.create(
+                hanzi=hanzi,
+                kangji=kangji,
+                note=note,
+            )
+        print("Import complete")
+
+
+async def main():
+    await Tortoise.init(config=TORTOISE_ORM)
+    await KangjiMapping.all().delete()
+    await JapaneseIntro.kangji_mapping_intro()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
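The import loop issues one INSERT per spreadsheet row. For a file of roughly 3,000 rows, Tortoise's `bulk_create` batches this into far fewer statements; a drop-in sketch for the loop inside `kangji_mapping_intro`:

```python
# Sketch: build unsaved model instances, then insert them in one batch.
mappings = [
    KangjiMapping(hanzi=row[1], kangji=row[2], note=row[4])
    for row in df.itertuples()
]
await KangjiMapping.bulk_create(mappings)
```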
@@ -33,6 +33,7 @@ jaconv==0.4.0
 jiter==0.11.1
 numpy==2.3.2
 openai==2.6.1
+opencc-python-reimplemented==0.1.7
 openpyxl==3.1.5
 pandas==2.3.1
 pandas-stubs==2.3.2.250827
@@ -0,0 +1,60 @@
+import io
+import pandas as pd
+import re
+import requests
+import zipfile
+
+url = "https://www.unicode.org/Public/15.1.0/ucd/Unihan.zip"
+print("📦 Downloading the Unihan data package...")
+r = requests.get(url)
+r.raise_for_status()
+
+with zipfile.ZipFile(io.BytesIO(r.content)) as z:
+    txt = z.read("Unihan_Variants.txt").decode("utf-8") + \
+          "\n" + z.read("Unihan_Readings.txt").decode("utf-8")
+
+print("✅ Data loaded")
+
+# --- Extract the fields we need ---
+re_simpl = re.compile(r"U\+([0-9A-F]+)\tkSimplifiedVariant\tU\+([0-9A-F]+)")
+re_zvar = re.compile(r"U\+([0-9A-F]+)\tkZVariant\tU\+([0-9A-F]+)")
+re_jp_on = re.compile(r"U\+([0-9A-F]+)\tkJapaneseOn\t(.+)")
+re_jp_kun = re.compile(r"U\+([0-9A-F]+)\tkJapaneseKun\t(.+)")
+
+simpl_map, zvar_map, jp_on, jp_kun = {}, {}, {}, {}
+
+for m in re_simpl.finditer(txt):
+    trad_hex, simp_hex = m.groups()
+    trad, simp = chr(int(trad_hex, 16)), chr(int(simp_hex, 16))
+    simpl_map[trad] = simp
+
+for m in re_zvar.finditer(txt):
+    base_hex, var_hex = m.groups()
+    base, var = chr(int(base_hex, 16)), chr(int(var_hex, 16))
+    zvar_map[base] = var
+
+for m in re_jp_on.finditer(txt):
+    code_hex, reading = m.groups()
+    char = chr(int(code_hex, 16))
+    jp_on[char] = reading.replace(" ", "、")
+
+for m in re_jp_kun.finditer(txt):
+    code_hex, reading = m.groups()
+    char = chr(int(code_hex, 16))
+    jp_kun[char] = reading.replace(" ", "、")
+
+rows = []
+for trad, simp in simpl_map.items():
+    # Key step: follow the Traditional → Japanese shinjitai variant relation
+    if trad in zvar_map:
+        jp_char = zvar_map[trad]
+        if jp_char in jp_on or jp_char in jp_kun:
+            kana_on = jp_on.get(jp_char, "")
+            kana_kun = jp_kun.get(jp_char, "")
+            kana = kana_on + ("、" + kana_kun if kana_on and kana_kun else kana_kun)
+            rows.append([simp, trad, jp_char, kana, "是", "由繁体→简体+异体→日语新字体推导"])
+
+df = pd.DataFrame(rows, columns=["简体汉字", "繁体汉字", "日语汉字", "假名读音", "是否异体", "备注"])
+df.to_excel("中日汉字映射表_六列综合版.xlsx", index=False)
+
+print(f"✅ File generated with {len(df)} records.")