From d7658db3e8362a26463395d25733a4efc2ef2560 Mon Sep 17 00:00:00 2001 From: Miyamizu-MitsuhaSang <2510681107@qq.com> Date: Mon, 3 Nov 2025 17:29:49 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E8=B0=9A=E8=AF=AD=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=EF=BC=8C=E8=B0=83=E6=95=B4=E6=90=9C=E7=B4=A2=E5=87=BD?= =?UTF-8?q?=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/api/search_dict/routes.py | 76 +++++++++--------- app/api/search_dict/service.py | 137 +++++++++++++-------------------- app/models/jp.py | 23 ++++-- main.py | 4 +- 4 files changed, 114 insertions(+), 126 deletions(-) diff --git a/app/api/search_dict/routes.py b/app/api/search_dict/routes.py index 7ebc442..840dd11 100644 --- a/app/api/search_dict/routes.py +++ b/app/api/search_dict/routes.py @@ -1,3 +1,4 @@ +import asyncio from typing import Literal, List from fastapi import APIRouter, Depends, HTTPException, Request, Form @@ -136,23 +137,10 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current ) -@dict_search.post("/search/proverb") -async def proverb(request: Request, proverb_id: int, user=Depends(get_current_user)): - """ - 用于法语谚语搜索 - :param request: - :param body: 要求用户输入的内容必须为法语 - :param user: - :return: - """ - content = await service.accurate_proverb(proverb_id=proverb_id) - return content - - # TODO 相关度排序(转换为模糊匹配) # TODO 输入搜索框时反馈内容 -@dict_search.post("/search/word/list") +@dict_search.post("/search/list/word") async def search_word_list(query_word: SearchRequest, user=Depends(get_current_user)): """ 检索时的提示接口 @@ -165,9 +153,9 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u return {"list": word_contents} -@dict_search.post("/search/proverb/list") +@dict_search.post("/search/list/proverb") async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)): - lang = service.detect_language(text=query_word.query)[1] + query, lang, _ = service.detect_language(text=query_word.query) query = normalize_text(query_word.query) if lang == "fr" else query_word.query suggest_proverbs = await service.suggest_proverb( query=query_word.query, @@ -180,35 +168,53 @@ async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get @dict_search.post("/search/proverb") async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)): - result = await service.accurate_proverb(proverb_id=proverb_id) + result = await service.accurate_idiom_proverb(search_id=proverb_id, model=ProverbFr, only_fields=["text", "chi_exp"]) return {"result": result} -@dict_search.post("/search/idiom/list") -async def search_idiom_list(query_idiom: ProverbSearchRequest): +@dict_search.post("/search/list/idiom") +async def search_idiom_list(query_idiom: ProverbSearchRequest, user=Depends(get_current_user)): if query_idiom.dict_language == "fr": raise HTTPException(status_code=400, detail="Dict language Error") - trad_query, lang = service.detect_language(text=query_idiom.query) + + mapping_query, lang, is_kangji = await service.detect_language(text=query_idiom.query) query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query - result = await service.suggest_proverb( - query=query, - lang=lang, - model=IdiomJp, - search_field="search_text", - target_field="text", - ) - if lang == "zh": - trad_query = all_in_kana(text=query_idiom.query) - search_idioms_from_chi = await service.suggest_proverb( - query=trad_query, - lang="jp", + + # ✅ 并发任务列表 + tasks = [ + service.suggest_proverb( + query=query, + lang=lang, model=IdiomJp, + search_field="search_text", + target_field="text", ) - result[:0] = search_idioms_from_chi + ] + + if lang == "zh" and is_kangji: + # jp_query = all_in_kana(text=query_idiom.query) + tasks.append( + service.suggest_proverb( + query=mapping_query, + lang="jp", + model=IdiomJp, + search_field="text", + ) + ) + + # ✅ 并发执行(返回结果顺序与任务顺序一致) + results = await asyncio.gather(*tasks) + + # ✅ 合并结果 + result = results[0] + if len(results) > 1: + result[:0] = results[1] # 将中文映射查询结果插到最前面 + return {"list": result} + @dict_search.post("/search/idiom") -async def search_idiom(query_id: int): - result = await accurate_proverb(proverb_id=query_id) +async def search_idiom(query_id: int, user=Depends(get_current_user)): + result = await service.accurate_idiom_proverb(search_id=query_id, model=IdiomJp, only_fields=["id", "text", "search_text", "chi_exp", "example"]) return {"result": result} diff --git a/app/api/search_dict/service.py b/app/api/search_dict/service.py index 5543252..df625cf 100644 --- a/app/api/search_dict/service.py +++ b/app/api/search_dict/service.py @@ -2,83 +2,70 @@ import re from typing import List, Tuple, Dict, Literal, Type from fastapi import HTTPException -from opencc import OpenCC from tortoise import Tortoise, Model from tortoise.expressions import Q -from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest -from app.models import WordlistFr, WordlistJp -from app.models.fr import ProverbFr +from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchRequest +from app.models import WordlistFr, WordlistJp, KangjiMapping from app.utils.all_kana import all_in_kana from app.utils.textnorm import normalize_text from settings import TORTOISE_ORM -def detect_language(text: str) -> Tuple[str, Literal["fr", "zh", "jp", "other"]]: +async def detect_language(text: str) -> Tuple[str, str, bool]: """ 自动检测输入语言: - zh: 简体中文 - - jp: 日语(含假名或繁体/旧体字) + - jp: 日语(含假名或旧字体) - fr: 拉丁字母(法语等) - other: 其他 - """ - cc_s2t = OpenCC('s2t') # 简体 → 繁体 - cc_t2s = OpenCC('t2s') # 繁体 → 简体 + 返回: + (映射或原文本, 语言代码, 是否为“含汉字且命中映射表”的情况) + """ JAPANESE_HIRAGANA = r"[\u3040-\u309F]" JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]" text = text.strip() if not text: - return "", "other" + return "", "other", False - # ✅ Step 1: 假名检测 - if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text): - return text, "jp" + # ✅ Step 1: 全部假名(无汉字) + if re.fullmatch(f"(?:{JAPANESE_HIRAGANA}|{JAPANESE_KATAKANA})+", text): + return text, "jp", False # ✅ Step 2: 汉字检测 if re.search(r"[\u4e00-\u9fff]", text): - # 简繁互转对比 - to_trad = cc_s2t.convert(text) - to_simp = cc_t2s.convert(text) + # 优先判断是否为日语汉字 + jp_match = await KangjiMapping.get_or_none(kangji=text).only("kangji") + if jp_match: + return text, "jp", True # 含汉字且命中日语列 - # 如果输入等于繁体转换结果 → 繁体或日文汉字 - if text == to_trad and text != to_simp: - return text, "jp" - # 如果输入等于简体转换结果 → 简体中文 - elif text == to_simp and text != to_trad: - return to_trad, "zh" # 注意返回的是繁体形式用于补充搜索 - # 否则混合(既有简体又有繁体) - else: - # 混合时可优先认定为繁体(日语) - return to_trad, "jp" + # 再检查是否为中文汉字 + zh_match = await KangjiMapping.get_or_none(hanzi=text).only("hanzi", "kangji") + if zh_match: + return zh_match.kangji, "zh", True # 含汉字且命中中文列 - # ✅ Step 3: 拉丁字母检测 + # 若都不在映射表中,则为未映射的中文 + return text, "zh", False + + # ✅ Step 3: 拉丁字母检测(如法语) if re.search(r"[a-zA-ZÀ-ÿ]", text): - return text, "fr" + return text, "fr", False - return text, "other" + # ✅ Step 4: 其他情况(符号、空格等) + return text, "other", False -async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse: - """对于查询法语谚语的精准查询,返回详细信息""" - proverb = await ProverbFr.get_or_none(id=proverb_id) - if not proverb: - raise HTTPException(status_code=404, detail="Proverb not found") - proverb.freq = proverb.freq + 1 - await proverb.save() - return ProverbSearchResponse( - proverb_text=proverb.text, - chi_exp=proverb.chi_exp, - ) - -async def accurate_idiom(idiom_id: int): - proverb = await ProverbFr.get_or_none(id=idiom_id) - if not proverb: - raise HTTPException(status_code=404, detail="Proverb not found") - proverb.freq = proverb.freq + 1 - await proverb.save() - return proverb +async def accurate_idiom_proverb(search_id: int, model: Type[Model], only_fields: List[str] = None): + if "freq" not in only_fields: + only_fields.append("freq") + result = await model.get_or_none(id=search_id).only(*only_fields) + if not result: + raise HTTPException(status_code=404, detail="Target not found") + result.freq = result.freq + 1 + await result.save(update_fields=["freq"]) + return result async def suggest_proverb( @@ -90,54 +77,37 @@ async def suggest_proverb( chi_exp_field: str = "chi_exp", limit: int = 10, ) -> List[Dict[str, str]]: - """ - 通用搜索建议函数,用于多语言谚语表。 - 参数: - query: 搜索关键词 - lang: 'fr' 或 'zh' - model: Tortoise ORM 模型类,例如 ProverbFr - proverb_field: 外语谚语字段名 - chi_exp_field: 中文释义字段名 - limit: 每类匹配的最大返回数量 - - 搜索逻辑: - 1. 根据语言选择搜索字段; - 2. 优先匹配以输入开头的结果; - 3. 其次匹配包含输入但非开头的结果; - 4. 合并去重后返回。 - """ keyword = query.strip() if not keyword: return [] - # ✅ 根据语言选择搜索字段 + # ✅ 搜索条件:中文时双字段联合匹配 if lang == "zh": - startswith_field = f"{chi_exp_field}__istartswith" - contains_field = f"{chi_exp_field}__icontains" + start_condition = Q(**{f"{chi_exp_field}__istartswith": keyword}) | Q( + **{f"{search_field}__istartswith": keyword}) + contain_condition = Q(**{f"{chi_exp_field}__icontains": keyword}) | Q(**{f"{search_field}__icontains": keyword}) else: - startswith_field = f"{search_field}__istartswith" - contains_field = f"{search_field}__icontains" + start_condition = Q(**{f"{search_field}__istartswith": keyword}) + contain_condition = Q(**{f"{search_field}__icontains": keyword}) # ✅ 1. 开头匹配 start_matches = await ( - model.filter(**{startswith_field: keyword}) - .order_by("-freq") + model.filter(start_condition) + .order_by("-freq", "id") .limit(limit) - .values("id", target_field, search_field, chi_exp_field) + .values("id", target_field, chi_exp_field, "search_text") ) - # ✅ 2. 包含匹配(非开头) + # ✅ 2. 包含匹配(但不是开头) contain_matches = await ( - model.filter( - Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword}) - ) - .order_by("-freq") + model.filter(contain_condition & ~start_condition) + .order_by("-freq", "id") .limit(limit) - .values("id", target_field, search_field, chi_exp_field) + .values("id", target_field, chi_exp_field, "search_text") ) - # ✅ 3. 合并去重并保持顺序 - results: List[Dict[str, str]] = [] + # ✅ 3. 合并去重保持顺序 + results = [] seen_ids = set() for row in start_matches + contain_matches: if row["id"] not in seen_ids: @@ -145,11 +115,12 @@ async def suggest_proverb( results.append({ "id": row["id"], "proverb": row[target_field], - "search_text": row[search_field], - "chi_exp": row[chi_exp_field] + "search_text": row["search_text"], + "chi_exp": row[chi_exp_field], }) - return results + # ✅ 截断最终返回数量 + return results[:limit] async def suggest_autocomplete(query: SearchRequest, limit: int = 10): diff --git a/app/models/jp.py b/app/models/jp.py index ee166a2..356c680 100644 --- a/app/models/jp.py +++ b/app/models/jp.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Tuple, TypeVar +from typing import Tuple, TypeVar, Optional import pandas as pd from tortoise import fields @@ -18,8 +18,8 @@ class WordlistJp(Model): text = fields.CharField(max_length=40, description="单词") hiragana = fields.CharField(max_length=60, description="假名", null=False) freq = fields.IntField(default=0) - definitions : fields.ReverseRelation["DefinitionJp"] - attachments : fields.ReverseRelation["AttachmentJp"] + definitions: fields.ReverseRelation["DefinitionJp"] + attachments: fields.ReverseRelation["AttachmentJp"] class Meta: table = "wordlist_jp" @@ -74,6 +74,7 @@ class DefinitionJp(Model): class Meta: table = "definitions_jp" + class PosType(Model): id = fields.IntField(pk=True) pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False) @@ -81,6 +82,7 @@ class PosType(Model): class Meta: table = "pos_type" + class PronunciationTestJp(Model): id = fields.IntField(pk=True) text = fields.TextField(description="朗读文段") @@ -88,6 +90,7 @@ class PronunciationTestJp(Model): class Meta: table = "pronunciationtest_jp" + class IdiomJp(Model): id = fields.IntField(pk=True) text = fields.TextField(null=False) @@ -100,12 +103,20 @@ class IdiomJp(Model): class Meta: table = "idiom_jp" + class KangjiMapping(Model): id = fields.IntField(pk=True) - hanzi= fields.TextField(null=False) - kangji= fields.TextField(null=False) - note= fields.TextField(null=False) + hanzi = fields.TextField(null=False) + kangji = fields.TextField(null=False) + note = fields.TextField(null=False) created_at = fields.DatetimeField(auto_now_add=True) + @classmethod + async def chi2kangji(text_chi: str) -> Optional[str]: + mapping = await KangjiMapping.get_or_none(hanzi=text_chi) + if not mapping: + return None + return mapping.kangji + class Meta: table = "kangji_mapping_zh_jp" diff --git a/main.py b/main.py index ec10dc8..8b6d2f6 100644 --- a/main.py +++ b/main.py @@ -18,7 +18,7 @@ from app.api.user.routes import users_router from app.api.word_comment.routes import word_comment_router from app.core.redis import init_redis, close_redis from app.utils.phone_encrypt import PhoneEncrypt -from settings import TORTOISE_ORM +from settings import ONLINE_SETTINGS @asynccontextmanager @@ -46,7 +46,7 @@ app.add_middleware( register_tortoise( app=app, - config=TORTOISE_ORM, + config=ONLINE_SETTINGS, ) app.include_router(users_router, tags=["User API"], prefix="/users")