diff --git a/app/api/search_dict/routes.py b/app/api/search_dict/routes.py
index 7e57937..7ebc442 100644
--- a/app/api/search_dict/routes.py
+++ b/app/api/search_dict/routes.py
@@ -5,10 +5,11 @@ from fastapi import APIRouter, Depends, HTTPException, Request, Form
 from app.api.search_dict import service
 from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse, SearchItemFr, SearchItemJp, \
     ProverbSearchRequest
-from app.api.search_dict.service import suggest_autocomplete
+from app.api.search_dict.service import suggest_autocomplete, accurate_idiom
 from app.api.word_comment.word_comment_schemas import CommentSet
 from app.models import DefinitionJp, CommentFr, CommentJp
 from app.models.fr import DefinitionFr, ProverbFr
+from app.models.jp import IdiomJp
 from app.utils.all_kana import all_in_kana
 from app.utils.security import get_current_user
 from app.utils.textnorm import normalize_text
@@ -165,8 +166,8 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u
 
 
 @dict_search.post("/search/proverb/list")
-async def search_proverb_list(query_word: ProverbSearchRequest):
-    lang = service.detect_language(text=query_word.query)
+async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
+    # detect_language now returns (converted_text, lang); only lang is needed here
+    lang = service.detect_language(text=query_word.query)[1]
     query = normalize_text(query_word.query) if lang == "fr" else query_word.query
     suggest_proverbs = await service.suggest_proverb(
-        query=query_word.query,
+        query=query,
@@ -174,10 +175,40 @@
         lang=lang,
         model=ProverbFr,
         search_field="search_text",
     )
-    # TODO when using the French dictionary, could the input be in English?
     return {"list": suggest_proverbs}
 
+
 @dict_search.post("/search/proverb")
-async def search_proverb(proverb_id:int = Form(...), user=Depends(get_current_user)):
+async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)):
     result = await service.accurate_proverb(proverb_id=proverb_id)
+
     return {"result": result}
+
+
+@dict_search.post("/search/idiom/list")
+async def search_idiom_list(query_idiom: ProverbSearchRequest, user=Depends(get_current_user)):
+    if query_idiom.dict_language != "jp":
+        raise HTTPException(status_code=400, detail="Idiom search is only available for the Japanese dictionary")
+    trad_query, lang = service.detect_language(text=query_idiom.query)
+    query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query
+    result = await service.suggest_proverb(
+        query=query,
+        lang=lang,
+        model=IdiomJp,
+        search_field="search_text",
+        target_field="text",
+    )
+    if lang == "zh":
+        # detect_language already returned the Traditional form of a Simplified
+        # query, so run a supplementary Japanese-side search with it.
+        search_idioms_from_chi = await service.suggest_proverb(
+            query=trad_query,
+            lang="jp",
+            model=IdiomJp,
+            search_field="search_text",
+            target_field="text",
+        )
+        result[:0] = search_idioms_from_chi
+    return {"list": result}
+
+
+@dict_search.post("/search/idiom")
+async def search_idiom(query_id: int = Form(...), user=Depends(get_current_user)):
+    result = await accurate_idiom(idiom_id=query_id)
+    return {"result": result}
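For reference while reviewing, here is a minimal smoke test for the two new idiom endpoints. It is a sketch, not project code: the base URL, the bearer-token header, the router prefix, and the shape of the items in "list" (dicts carrying an "id") are assumptions, since none of them appear in this diff.

import requests

BASE = "http://localhost:8000"  # assumed dev server address
HEADERS = {"Authorization": "Bearer <token>"}  # get_current_user guards both routes

# Fuzzy lookup: detect_language routes the query by script (kana -> jp,
# Simplified hanzi -> zh with a supplementary Traditional-form search).
resp = requests.post(
    f"{BASE}/search/idiom/list",
    json={"query": "猫に小判", "dict_language": "jp"},
    headers=HEADERS,
)
resp.raise_for_status()
candidates = resp.json()["list"]

# Exact fetch by id; this also bumps the idiom's freq counter server-side.
if candidates:
    detail = requests.post(
        f"{BASE}/search/idiom",
        data={"query_id": candidates[0]["id"]},  # form-encoded, matching Form(...)
        headers=HEADERS,
    )
    print(detail.json()["result"])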
"jp", "other"]]: """ 自动检测输入语言: - 返回 'zh' / 'jp' / 'fr' / 'other' + - zh: 简体中文 + - jp: 日语(含假名或繁体/旧体字) + - fr: 拉丁字母(法语等) + - other: 其他 """ + cc_s2t = OpenCC('s2t') # 简体 → 繁体 + cc_t2s = OpenCC('t2s') # 繁体 → 简体 + + JAPANESE_HIRAGANA = r"[\u3040-\u309F]" + JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]" + + text = text.strip() + if not text: + return "", "other" + + # ✅ Step 1: 假名检测 + if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text): + return text, "jp" + + # ✅ Step 2: 汉字检测 if re.search(r"[\u4e00-\u9fff]", text): - return "zh" - elif re.search(r"[\u3040-\u30ff\u31f0-\u31ff]", text): # 日文假名范围 - return "jp" - elif re.search(r"[a-zA-ZÀ-ÿ]", text): - return "fr" - return "other" + # 简繁互转对比 + to_trad = cc_s2t.convert(text) + to_simp = cc_t2s.convert(text) + + # 如果输入等于繁体转换结果 → 繁体或日文汉字 + if text == to_trad and text != to_simp: + return text, "jp" + # 如果输入等于简体转换结果 → 简体中文 + elif text == to_simp and text != to_trad: + return to_trad, "zh" # 注意返回的是繁体形式用于补充搜索 + # 否则混合(既有简体又有繁体) + else: + # 混合时可优先认定为繁体(日语) + return to_trad, "jp" + + # ✅ Step 3: 拉丁字母检测 + if re.search(r"[a-zA-ZÀ-ÿ]", text): + return text, "fr" + + return text, "other" async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse: @@ -32,11 +65,21 @@ async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse: proverb = await ProverbFr.get_or_none(id=proverb_id) if not proverb: raise HTTPException(status_code=404, detail="Proverb not found") + proverb.freq = proverb.freq + 1 + await proverb.save() return ProverbSearchResponse( proverb_text=proverb.text, chi_exp=proverb.chi_exp, ) +async def accurate_idiom(idiom_id: int): + proverb = await ProverbFr.get_or_none(id=idiom_id) + if not proverb: + raise HTTPException(status_code=404, detail="Proverb not found") + proverb.freq = proverb.freq + 1 + await proverb.save() + return proverb + async def suggest_proverb( query: str, diff --git a/app/models/__init__.py b/app/models/__init__.py index 310b788..721db0e 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -2,4 +2,4 @@ from . 
import signals
 from .base import User
 from .comments import CommentFr, CommentJp
 from .fr import WordlistFr, DefinitionFr, AttachmentFr, PronunciationTestFr
-from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp
+from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp, IdiomJp, KangjiMapping
diff --git a/app/models/jp.py b/app/models/jp.py
index 40d1b99..ee166a2 100644
--- a/app/models/jp.py
+++ b/app/models/jp.py
@@ -94,7 +94,18 @@ class IdiomJp(Model):
     chi_exp = fields.TextField(null=False)
     example = fields.TextField(null=False)
     search_text = fields.TextField(null=False)
+    freq = fields.IntField(default=0, null=False)  # lookup counter, bumped by accurate_idiom
     created_at = fields.DatetimeField(auto_now_add=True)
 
     class Meta:
         table = "idiom_jp"
+
+
+class KangjiMapping(Model):
+    """Mapping from a Simplified Chinese character to its Japanese kanji form."""
+    id = fields.IntField(pk=True)
+    hanzi = fields.TextField(null=False)
+    kangji = fields.TextField(null=False)
+    note = fields.TextField(null=False)
+    created_at = fields.DatetimeField(auto_now_add=True)
+
+    class Meta:
+        table = "kangji_mapping_zh_jp"
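A side note on the new freq counters (ProverbFr and IdiomJp): the read-modify-write in accurate_proverb/accurate_idiom can lose increments under concurrent requests. If that ever matters, Tortoise's F expressions make the bump atomic; a sketch against the IdiomJp model above (bump_idiom_freq is an illustrative helper, not part of this diff):

from tortoise.expressions import F

from app.models.jp import IdiomJp

async def bump_idiom_freq(idiom_id: int) -> None:
    # Issues a single UPDATE idiom_jp SET freq = freq + 1 WHERE id = ...
    await IdiomJp.filter(id=idiom_id).update(freq=F("freq") + 1)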
diff --git a/intro/__init__.py b/intro/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/intro/chi2kangji_table_intro.py b/intro/chi2kangji_table_intro.py
new file mode 100644
index 0000000..d496037
--- /dev/null
+++ b/intro/chi2kangji_table_intro.py
@@ -0,0 +1,37 @@
+import asyncio
+from pathlib import Path
+
+import pandas as pd
+from tortoise import Tortoise
+
+from app.models import KangjiMapping
+from settings import TORTOISE_ORM
+
+
+class JapaneseIntro:
+    kangji_mapping: Path = Path("./中日汉字映射表_自动扩充_约3000条.xlsx")
+
+    @classmethod
+    async def kangji_mapping_intro(cls):
+        df = pd.read_excel(cls.kangji_mapping)
+        df.columns = [col.strip() for col in df.columns]
+
+        # Columns are read by position from the source spreadsheet:
+        # 1 = hanzi, 2 = kanji, 4 = note (0 is the itertuples index).
+        for row in df.itertuples():
+            hanzi = row[1]
+            kangji = row[2]
+            note = row[4]
+
+            await KangjiMapping.create(
+                hanzi=hanzi,
+                kangji=kangji,
+                note=note,
+            )
+        print("Import complete")
+
+
+async def main():
+    await Tortoise.init(config=TORTOISE_ORM)
+    # Start from a clean table on each run
+    await KangjiMapping.all().delete()
+    await JapaneseIntro.kangji_mapping_intro()
+    await Tortoise.close_connections()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 125ee28..62ff0e4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,6 +33,7 @@ jaconv==0.4.0
 jiter==0.11.1
 numpy==2.3.2
 openai==2.6.1
+opencc-python-reimplemented==0.1.7
 openpyxl==3.1.5
 pandas==2.3.1
 pandas-stubs==2.3.2.250827
diff --git a/scripts/jp/__init__.py b/scripts/jp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/jp/chi2kangji.py b/scripts/jp/chi2kangji.py
new file mode 100644
index 0000000..2c12341
--- /dev/null
+++ b/scripts/jp/chi2kangji.py
@@ -0,0 +1,60 @@
+import io
+import re
+import zipfile
+
+import pandas as pd
+import requests
+
+url = "https://www.unicode.org/Public/15.1.0/ucd/Unihan.zip"
+print("📦 Downloading the Unihan data package...")
+r = requests.get(url)
+r.raise_for_status()
+
+with zipfile.ZipFile(io.BytesIO(r.content)) as z:
+    txt = z.read("Unihan_Variants.txt").decode("utf-8") + \
+          "\n" + z.read("Unihan_Readings.txt").decode("utf-8")
+
+print("✅ Data loaded")
+
+# --- Extract the fields we need ---
+# The second capture group must exclude the "U+" prefix, otherwise
+# int(..., 16) below fails; only the first variant listed is taken.
+re_simpl = re.compile(r"U\+([0-9A-F]+)\tkSimplifiedVariant\tU\+([0-9A-F]+)")
+re_zvar = re.compile(r"U\+([0-9A-F]+)\tkZVariant\tU\+([0-9A-F]+)")
+re_jp_on = re.compile(r"U\+([0-9A-F]+)\tkJapaneseOn\t(.+)")
+re_jp_kun = re.compile(r"U\+([0-9A-F]+)\tkJapaneseKun\t(.+)")
+
+simpl_map, zvar_map, jp_on, jp_kun = {}, {}, {}, {}
+
+for m in re_simpl.finditer(txt):
+    trad_hex, simp_hex = m.groups()
+    trad, simp = chr(int(trad_hex, 16)), chr(int(simp_hex, 16))
+    simpl_map[trad] = simp
+
+for m in re_zvar.finditer(txt):
+    base_hex, var_hex = m.groups()
+    base, var = chr(int(base_hex, 16)), chr(int(var_hex, 16))
+    zvar_map[base] = var
+
+# NB: kJapaneseOn/kJapaneseKun values are uppercase romaji, not kana
+for m in re_jp_on.finditer(txt):
+    code_hex, reading = m.groups()
+    char = chr(int(code_hex, 16))
+    jp_on[char] = reading.replace(" ", "、")
+
+for m in re_jp_kun.finditer(txt):
+    code_hex, reading = m.groups()
+    char = chr(int(code_hex, 16))
+    jp_kun[char] = reading.replace(" ", "、")
+
+rows = []
+for trad, simp in simpl_map.items():
+    # Key step: follow the Traditional -> Japanese shinjitai variant relation
+    if trad in zvar_map:
+        jp_char = zvar_map[trad]
+        if jp_char in jp_on or jp_char in jp_kun:
+            kana_on = jp_on.get(jp_char, "")
+            kana_kun = jp_kun.get(jp_char, "")
+            kana = kana_on + ("、" + kana_kun if kana_on and kana_kun else kana_kun)
+            rows.append([simp, trad, jp_char, kana, "是", "由繁体→简体+异体→日语新字体推导"])
+
+df = pd.DataFrame(rows, columns=["简体汉字", "繁体汉字", "日语汉字", "假名读音", "是否异体", "备注"])
+df.to_excel("中日汉字映射表_六列综合版.xlsx", index=False)
+
+print(f"✅ File generated, {len(df)} records in total.")
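To make the join in the final loop concrete, here is the derivation chain for one character, written out as the script would see it. The codepoints show the intended shape of the data only; whether each pair is actually linked this way in Unihan 15.1 should be verified against the downloaded files.

# Hypothetical walk-through for 廣/广/広 (values for illustration only):
#
#   Unihan_Variants.txt:  U+5EE3  kSimplifiedVariant  U+5E7F  ->  simpl_map['廣'] = '广'
#   Unihan_Variants.txt:  U+5EE3  kZVariant           U+5E83  ->  zvar_map['廣']  = '広'
#   Unihan_Readings.txt:  U+5E83  kJapaneseOn         KOU     ->  jp_on['広']     = 'KOU'
#
# The loop joins the three maps on the Traditional character 廣 and emits:
#   ['广', '廣', '広', 'KOU', '是', '由繁体→简体+异体→日语新字体推导']
#
# Note the reading is romaji ("KOU"), not kana, despite the 假名读音 column name.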