commit 2edd3e7a56
parent fe6c84e310

Update the proverb search and adjust the search functions
@@ -5,10 +5,11 @@ from fastapi import APIRouter, Depends, HTTPException, Request, Form
 from app.api.search_dict import service
 from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse, SearchItemFr, SearchItemJp, \
     ProverbSearchRequest
-from app.api.search_dict.service import suggest_autocomplete
+from app.api.search_dict.service import suggest_autocomplete, accurate_proverb
 from app.api.word_comment.word_comment_schemas import CommentSet
 from app.models import DefinitionJp, CommentFr, CommentJp
 from app.models.fr import DefinitionFr, ProverbFr
+from app.models.jp import IdiomJp
 from app.utils.all_kana import all_in_kana
 from app.utils.security import get_current_user
 from app.utils.textnorm import normalize_text
@@ -165,8 +166,8 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_user)):
 
 
 @dict_search.post("/search/proverb/list")
-async def search_proverb_list(query_word: ProverbSearchRequest):
-    lang = service.detect_language(text=query_word.query)
+async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
+    lang = service.detect_language(text=query_word.query)[1]
     query = normalize_text(query_word.query) if lang == "fr" else query_word.query
     suggest_proverbs = await service.suggest_proverb(
         query=query_word.query,
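Note: detect_language now returns a (converted_text, lang) tuple (see the service diff below), so this route indexes [1] to keep only the language tag. Unpacking both values reads more clearly and keeps the converted text available; a minimal sketch, variable names hypothetical:

    # equivalent to the [1] indexing above, but retains the converted text
    trad_query, lang = service.detect_language(text=query_word.query)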
@@ -174,10 +175,40 @@ async def search_proverb_list(query_word: ProverbSearchRequest):
         model=ProverbFr,
         search_field="search_text",
     )
-    # TODO: when the French dictionary is in use, might the query be typed in English?
     return {"list": suggest_proverbs}
 
 
 @dict_search.post("/search/proverb")
-async def search_proverb(proverb_id:int = Form(...), user=Depends(get_current_user)):
+async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)):
     result = await service.accurate_proverb(proverb_id=proverb_id)
+
+    return {"result": result}
+
+
+@dict_search.post("/search/idiom/list")
+async def search_idiom_list(query_idiom: ProverbSearchRequest):
+    if query_idiom.dict_language == "fr":
+        raise HTTPException(status_code=400, detail="Dict language Error")
+    trad_query, lang = service.detect_language(text=query_idiom.query)
+    query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query
+    result = await service.suggest_proverb(
+        query=query,
+        lang=lang,
+        model=IdiomJp,
+        search_field="search_text",
+        target_field="text",
+    )
+    if lang == "zh":
+        trad_query = all_in_kana(text=query_idiom.query)
+        search_idioms_from_chi = await service.suggest_proverb(
+            query=trad_query,
+            lang="jp",
+            model=IdiomJp,
+        )
+        result[:0] = search_idioms_from_chi
+    return {"list": result}
+
+
+@dict_search.post("/search/idiom")
+async def search_idiom(query_id: int):
+    result = await accurate_proverb(proverb_id=query_id)
     return {"result": result}
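A quick sketch of how the two new idiom endpoints could be exercised from a client, assuming the router is mounted at the root and the app runs locally; the base URL is an assumption. Since search_idiom declares a bare scalar parameter, FastAPI reads query_id from the query string even on POST:

    import httpx

    async def demo():
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            # list lookup: ProverbSearchRequest is posted as JSON
            resp = await client.post(
                "/search/idiom/list",
                json={"query": "一石二鳥", "dict_language": "jp"},
            )
            idioms = resp.json()["list"]

            # detail lookup: query_id travels as a query-string parameter
            resp = await client.post("/search/idiom", params={"query_id": 1})
            print(resp.json()["result"])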
@@ -2,6 +2,7 @@ import re
 from typing import List, Tuple, Dict, Literal, Type
 
 from fastapi import HTTPException
+from opencc import OpenCC
 from tortoise import Tortoise, Model
 from tortoise.expressions import Q
 
@@ -13,18 +14,50 @@ from app.utils.textnorm import normalize_text
 from settings import TORTOISE_ORM
 
 
-def detect_language(text: str) -> Literal["fr", "zh", "jp", "other"]:
+def detect_language(text: str) -> Tuple[str, Literal["fr", "zh", "jp", "other"]]:
     """
     Auto-detect the input language:
-    returns 'zh' / 'jp' / 'fr' / 'other'
+    - zh: Simplified Chinese
+    - jp: Japanese (kana, or Traditional/kyūjitai characters)
+    - fr: Latin letters (French, etc.)
+    - other: anything else
     """
+    cc_s2t = OpenCC('s2t')  # Simplified → Traditional
+    cc_t2s = OpenCC('t2s')  # Traditional → Simplified
+
+    JAPANESE_HIRAGANA = r"[\u3040-\u309F]"
+    JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]"
+
+    text = text.strip()
+    if not text:
+        return "", "other"
+
+    # ✅ Step 1: kana detection
+    if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text):
+        return text, "jp"
+
+    # ✅ Step 2: Han-character detection
     if re.search(r"[\u4e00-\u9fff]", text):
-        return "zh"
-    elif re.search(r"[\u3040-\u30ff\u31f0-\u31ff]", text):  # Japanese kana ranges
-        return "jp"
-    elif re.search(r"[a-zA-ZÀ-ÿ]", text):
-        return "fr"
-    return "other"
+        # compare the Simplified/Traditional round-trips
+        to_trad = cc_s2t.convert(text)
+        to_simp = cc_t2s.convert(text)
+
+        # input equals its Traditional conversion → Traditional or Japanese kanji
+        if text == to_trad and text != to_simp:
+            return text, "jp"
+        # input equals its Simplified conversion → Simplified Chinese
+        elif text == to_simp and text != to_trad:
+            return to_trad, "zh"  # note: the Traditional form is returned for the supplementary search
+        # otherwise mixed (both Simplified and Traditional present)
+        else:
+            # when mixed, prefer treating it as Traditional (Japanese)
+            return to_trad, "jp"
+
+    # ✅ Step 3: Latin-letter detection
+    if re.search(r"[a-zA-ZÀ-ÿ]", text):
+        return text, "fr"
+
+    return text, "other"
 
 
 async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
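As a sanity check on the new contract, this is how the rewritten detect_language should behave on a few inputs, assuming OpenCC's stock s2t/t2s tables; the Traditional form comes back as the first tuple element so callers can run the supplementary search:

    text, lang = detect_language("ことわざ")  # kana wins first: ("ことわざ", "jp")
    text, lang = detect_language("汉语")      # pure Simplified: ("漢語", "zh")
    text, lang = detect_language("漢語")      # pure Traditional: treated as Japanese kanji, ("漢語", "jp")
    text, lang = detect_language("proverbe")  # Latin letters: ("proverbe", "fr")
    text, lang = detect_language("")          # empty input: ("", "other")
    # edge case: strings whose Simplified and Traditional forms coincide
    # (e.g. "你好") fail both equality tests and fall into the else branch,
    # so they are tagged "jp"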
@@ -32,11 +65,21 @@ async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
     proverb = await ProverbFr.get_or_none(id=proverb_id)
     if not proverb:
         raise HTTPException(status_code=404, detail="Proverb not found")
+    proverb.freq = proverb.freq + 1
+    await proverb.save()
     return ProverbSearchResponse(
         proverb_text=proverb.text,
         chi_exp=proverb.chi_exp,
     )
 
 
+async def accurate_idiom(idiom_id: int):
+    proverb = await ProverbFr.get_or_none(id=idiom_id)
+    if not proverb:
+        raise HTTPException(status_code=404, detail="Proverb not found")
+    proverb.freq = proverb.freq + 1
+    await proverb.save()
+    return proverb
+
+
 async def suggest_proverb(
     query: str,
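Note: the freq bump is a read-modify-write, so two concurrent lookups can lose an increment. If that matters, Tortoise's F expressions let the database do the increment atomically; a sketch, not part of this commit:

    from tortoise.expressions import F

    # atomic increment: the database computes freq = freq + 1
    await ProverbFr.filter(id=proverb_id).update(freq=F("freq") + 1)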
@@ -2,4 +2,4 @@ from . import signals
 from .base import User
 from .comments import CommentFr, CommentJp
 from .fr import WordlistFr, DefinitionFr, AttachmentFr, PronunciationTestFr
-from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp
+from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType, PronunciationTestJp, IdiomJp, KangjiMapping
@@ -94,7 +94,18 @@ class IdiomJp(Model):
     chi_exp = fields.TextField(null=False)
     example = fields.TextField(null=False)
     search_text = fields.TextField(null=False)
+    freq = fields.IntField(default=0, null=False)
     created_at = fields.DatetimeField(auto_now_add=True)
 
     class Meta:
         table = "idiom_jp"
+
+
+class KangjiMapping(Model):
+    id = fields.IntField(pk=True)
+    hanzi = fields.TextField(null=False)
+    kangji = fields.TextField(null=False)
+    note = fields.TextField(null=False)
+    created_at = fields.DatetimeField(auto_now_add=True)
+
+    class Meta:
+        table = "kangji_mapping_zh_jp"
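The new KangjiMapping table can then back a hanzi → kanji fallback at query time; a minimal sketch using the model above (the helper name is hypothetical):

    async def hanzi_to_kangji(ch: str) -> str | None:
        # look up one Simplified character's Japanese form, if the table has it
        row = await KangjiMapping.get_or_none(hanzi=ch)
        return row.kangji if row else None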
@@ -0,0 +1,37 @@
+import asyncio
+from pathlib import Path
+
+import pandas as pd
+from tortoise import Tortoise
+
+from app.models import KangjiMapping
+from settings import TORTOISE_ORM
+
+
+class JapaneseIntro:
+    kangji_mapping: Path = Path("./中日汉字映射表_自动扩充_约3000条.xlsx")
+
+    @classmethod
+    async def kangji_mapping_intro(cls):
+        df = pd.read_excel(cls.kangji_mapping)
+        df.columns = [col.strip() for col in df.columns]
+
+        for row in df.itertuples():
+            hanzi = row[1]
+            kangji = row[2]
+            note = row[4]
+
+            mapping = await KangjiMapping.create(
+                hanzi=hanzi,
+                kangji=kangji,
+                note=note,
+            )
+        print("Import finished")
+
+
+async def main():
+    await Tortoise.init(config=TORTOISE_ORM)
+    await KangjiMapping.all().delete()
+    await JapaneseIntro.kangji_mapping_intro()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
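Row-by-row create issues one INSERT per mapping (roughly 3000 here); bulk_create cuts that to a handful of queries. A sketch under the same positional-column assumption (columns 1, 2, 4 = hanzi, kangji, note):

    # build all model instances in memory, then insert them in one batch
    rows = [
        KangjiMapping(hanzi=row[1], kangji=row[2], note=row[4])
        for row in df.itertuples()
    ]
    await KangjiMapping.bulk_create(rows)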
@@ -33,6 +33,7 @@ jaconv==0.4.0
 jiter==0.11.1
 numpy==2.3.2
 openai==2.6.1
+opencc-python-reimplemented==0.1.7
 openpyxl==3.1.5
 pandas==2.3.1
 pandas-stubs==2.3.2.250827
@@ -0,0 +1,60 @@
+import io
+import pandas as pd
+import re
+import requests
+import zipfile
+
+url = "https://www.unicode.org/Public/15.1.0/ucd/Unihan.zip"
+print("📦 Downloading the Unihan data package...")
+r = requests.get(url)
+r.raise_for_status()
+
+with zipfile.ZipFile(io.BytesIO(r.content)) as z:
+    txt = z.read("Unihan_Variants.txt").decode("utf-8") + \
+          "\n" + z.read("Unihan_Readings.txt").decode("utf-8")
+
+print("✅ Data loaded")
+
+# --- match the fields we need ---
+# note: capture only the hex digits, so chr(int(..., 16)) below works
+re_simpl = re.compile(r"U\+([0-9A-F]+)\tkSimplifiedVariant\tU\+([0-9A-F]+)")
+re_zvar = re.compile(r"U\+([0-9A-F]+)\tkZVariant\tU\+([0-9A-F]+)")
+re_jp_on = re.compile(r"U\+([0-9A-F]+)\tkJapaneseOn\t(.+)")
+re_jp_kun = re.compile(r"U\+([0-9A-F]+)\tkJapaneseKun\t(.+)")
+
+simpl_map, zvar_map, jp_on, jp_kun = {}, {}, {}, {}
+
+for m in re_simpl.finditer(txt):
+    trad_hex, simp_hex = m.groups()
+    trad, simp = chr(int(trad_hex, 16)), chr(int(simp_hex, 16))
+    simpl_map[trad] = simp
+
+for m in re_zvar.finditer(txt):
+    base_hex, var_hex = m.groups()
+    base, var = chr(int(base_hex, 16)), chr(int(var_hex, 16))
+    zvar_map[base] = var
+
+for m in re_jp_on.finditer(txt):
+    code_hex, reading = m.groups()
+    char = chr(int(code_hex, 16))
+    jp_on[char] = reading.replace(" ", "、")
+
+for m in re_jp_kun.finditer(txt):
+    code_hex, reading = m.groups()
+    char = chr(int(code_hex, 16))
+    jp_kun[char] = reading.replace(" ", "、")
+
+rows = []
+for trad, simp in simpl_map.items():
+    # key step: follow the Traditional → Japanese shinjitai variant relation
+    if trad in zvar_map:
+        jp_char = zvar_map[trad]
+        if jp_char in jp_on or jp_char in jp_kun:
+            kana_on = jp_on.get(jp_char, "")
+            kana_kun = jp_kun.get(jp_char, "")
+            kana = kana_on + ("、" + kana_kun if kana_on and kana_kun else kana_kun)
+            rows.append([simp, trad, jp_char, kana, "是", "由繁体→简体+异体→日语新字体推导"])
+
+df = pd.DataFrame(rows, columns=["简体汉字", "繁体汉字", "日语汉字", "假名读音", "是否异体", "备注"])
+df.to_excel("中日汉字映射表_六列综合版.xlsx", index=False)
+
+print(f"✅ File generated, {len(df)} records in total.")
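For reference, the Unihan files are tab-separated "U+XXXX<TAB>kField<TAB>value" lines, which is exactly what the regexes above match; a tiny worked example against the fixed re_simpl:

    sample = "U+6771\tkSimplifiedVariant\tU+4E1C"
    trad_hex, simp_hex = re_simpl.match(sample).groups()   # ("6771", "4E1C")
    print(chr(int(trad_hex, 16)), chr(int(simp_hex, 16)))  # 東 东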