Update proverb search; adjust the search functions

Miyamizu-MitsuhaSang 2025-11-03 17:29:49 +08:00
parent 2edd3e7a56
commit d7658db3e8
4 changed files with 114 additions and 126 deletions

View File

@@ -1,3 +1,4 @@
+import asyncio
 from typing import Literal, List
 from fastapi import APIRouter, Depends, HTTPException, Request, Form
@@ -136,23 +137,10 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current
     )


-@dict_search.post("/search/proverb")
-async def proverb(request: Request, proverb_id: int, user=Depends(get_current_user)):
-    """
-    French proverb search
-    :param request:
-    :param body: the user input must be in French
-    :param user:
-    :return:
-    """
-    content = await service.accurate_proverb(proverb_id=proverb_id)
-    return content


 # TODO relevance ranking (switch to fuzzy matching)
 # TODO feedback while typing in the search box
-@dict_search.post("/search/word/list")
+@dict_search.post("/search/list/word")
 async def search_word_list(query_word: SearchRequest, user=Depends(get_current_user)):
     """
     Suggestion endpoint used while typing a search
@@ -165,9 +153,9 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u
     return {"list": word_contents}


-@dict_search.post("/search/proverb/list")
+@dict_search.post("/search/list/proverb")
 async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
-    lang = service.detect_language(text=query_word.query)[1]
+    query, lang, _ = await service.detect_language(text=query_word.query)
     query = normalize_text(query_word.query) if lang == "fr" else query_word.query
     suggest_proverbs = await service.suggest_proverb(
         query=query_word.query,
@@ -180,35 +168,53 @@ async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get
 @dict_search.post("/search/proverb")
 async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)):
-    result = await service.accurate_proverb(proverb_id=proverb_id)
+    result = await service.accurate_idiom_proverb(search_id=proverb_id, model=ProverbFr, only_fields=["text", "chi_exp"])
     return {"result": result}


-@dict_search.post("/search/idiom/list")
-async def search_idiom_list(query_idiom: ProverbSearchRequest):
+@dict_search.post("/search/list/idiom")
+async def search_idiom_list(query_idiom: ProverbSearchRequest, user=Depends(get_current_user)):
     if query_idiom.dict_language == "fr":
         raise HTTPException(status_code=400, detail="Dict language Error")
-    trad_query, lang = service.detect_language(text=query_idiom.query)
+    mapping_query, lang, is_kangji = await service.detect_language(text=query_idiom.query)
     query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query

-    result = await service.suggest_proverb(
-        query=query,
-        lang=lang,
-        model=IdiomJp,
-        search_field="search_text",
-        target_field="text",
-    )
-    if lang == "zh":
-        trad_query = all_in_kana(text=query_idiom.query)
-        search_idioms_from_chi = await service.suggest_proverb(
-            query=trad_query,
-            lang="jp",
-            model=IdiomJp,
-            search_field="text",
-        )
-        result[:0] = search_idioms_from_chi
+    # ✅ concurrent task list
+    tasks = [
+        service.suggest_proverb(
+            query=query,
+            lang=lang,
+            model=IdiomJp,
+            search_field="search_text",
+            target_field="text",
+        )
+    ]
+    if lang == "zh" and is_kangji:
+        # jp_query = all_in_kana(text=query_idiom.query)
+        tasks.append(
+            service.suggest_proverb(
+                query=mapping_query,
+                lang="jp",
+                model=IdiomJp,
+            )
+        )
+    # ✅ run the tasks concurrently (results come back in task order)
+    results = await asyncio.gather(*tasks)

+    # ✅ merge the results
+    result = results[0]
+    if len(results) > 1:
+        result[:0] = results[1]  # put the Chinese-mapping hits in front
     return {"list": result}


 @dict_search.post("/search/idiom")
-async def search_idiom(query_id: int):
-    result = await accurate_proverb(proverb_id=query_id)
+async def search_idiom(query_id: int, user=Depends(get_current_user)):
+    result = await service.accurate_idiom_proverb(search_id=query_id, model=IdiomJp, only_fields=["id", "text", "search_text", "chi_exp", "example"])
     return {"result": result}

View File

@@ -2,83 +2,70 @@ import re
 from typing import List, Tuple, Dict, Literal, Type
 from fastapi import HTTPException
-from opencc import OpenCC
 from tortoise import Tortoise, Model
 from tortoise.expressions import Q
-from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest
-from app.models import WordlistFr, WordlistJp
-from app.models.fr import ProverbFr
+from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchRequest
+from app.models import WordlistFr, WordlistJp, KangjiMapping
 from app.utils.all_kana import all_in_kana
 from app.utils.textnorm import normalize_text
 from settings import TORTOISE_ORM


-def detect_language(text: str) -> Tuple[str, Literal["fr", "zh", "jp", "other"]]:
+async def detect_language(text: str) -> Tuple[str, str, bool]:
     """
     Detect the input language automatically:
     - zh: simplified Chinese
-    - jp: Japanese (kana or traditional forms)
+    - jp: Japanese (kana or old-form kanji)
     - fr: Latin alphabet (French, etc.)
     - other: anything else
-    """
-    cc_s2t = OpenCC('s2t')  # simplified → traditional
-    cc_t2s = OpenCC('t2s')  # traditional → simplified
+
+    Returns:
+        (mapped or original text, language code, whether kanji input hit the mapping table)
+    """
     JAPANESE_HIRAGANA = r"[\u3040-\u309F]"
     JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]"

     text = text.strip()
     if not text:
-        return "", "other"
+        return "", "other", False

-    # ✅ Step 1: kana detection
-    if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text):
-        return text, "jp"
+    # ✅ Step 1: kana only (no kanji)
+    if re.fullmatch(f"(?:{JAPANESE_HIRAGANA}|{JAPANESE_KATAKANA})+", text):
+        return text, "jp", False

     # ✅ Step 2: kanji detection
     if re.search(r"[\u4e00-\u9fff]", text):
-        # compare simplified/traditional conversions
-        to_trad = cc_s2t.convert(text)
-        to_simp = cc_t2s.convert(text)
-        # input equals the traditional conversion → traditional or Japanese kanji
-        if text == to_trad and text != to_simp:
-            return text, "jp"
-        # input equals the simplified conversion → simplified Chinese
-        elif text == to_simp and text != to_trad:
-            return to_trad, "zh"  # note: the traditional form is returned for the follow-up search
-        # otherwise mixed (both simplified and traditional)
-        else:
-            # treat mixed input as traditional (Japanese)
-            return to_trad, "jp"
+        # check the Japanese-kanji column first
+        jp_match = await KangjiMapping.get_or_none(kangji=text).only("kangji")
+        if jp_match:
+            return text, "jp", True  # contains kanji and hit the Japanese column
+        # then check the Chinese-hanzi column
+        zh_match = await KangjiMapping.get_or_none(hanzi=text).only("hanzi", "kangji")
+        if zh_match:
+            return zh_match.kangji, "zh", True  # contains kanji and hit the Chinese column

-    # ✅ Step 3: Latin alphabet detection
+        # not in the mapping table at all → unmapped Chinese
+        return text, "zh", False

+    # ✅ Step 3: Latin alphabet detection (e.g. French)
     if re.search(r"[a-zA-ZÀ-ÿ]", text):
-        return text, "fr"
+        return text, "fr", False

-    return text, "other"
+    # ✅ Step 4: everything else (symbols, whitespace, ...)
+    return text, "other", False

-async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
-    """Exact lookup of a French proverb; returns the details"""
-    proverb = await ProverbFr.get_or_none(id=proverb_id)
-    if not proverb:
-        raise HTTPException(status_code=404, detail="Proverb not found")
-    proverb.freq = proverb.freq + 1
-    await proverb.save()
-    return ProverbSearchResponse(
-        proverb_text=proverb.text,
-        chi_exp=proverb.chi_exp,
-    )
-
-
-async def accurate_idiom(idiom_id: int):
-    proverb = await ProverbFr.get_or_none(id=idiom_id)
-    if not proverb:
-        raise HTTPException(status_code=404, detail="Proverb not found")
-    proverb.freq = proverb.freq + 1
-    await proverb.save()
-    return proverb
+async def accurate_idiom_proverb(search_id: int, model: Type[Model], only_fields: List[str] = None):
+    if "freq" not in only_fields:
+        only_fields.append("freq")
+    result = await model.get_or_none(id=search_id).only(*only_fields)
+    if not result:
+        raise HTTPException(status_code=404, detail="Target not found")
+    result.freq = result.freq + 1
+    await result.save(update_fields=["freq"])
+    return result


 async def suggest_proverb(
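The new `detect_language` returns a three-tuple: the (possibly mapped) text, a language code, and a flag saying whether a kanji input hit the `KangjiMapping` table. A rough, database-free sketch of that contract, with the table replaced by a hypothetical in-memory dict (`HANZI_TO_KANGJI` and its entries are invented for illustration):

```python
import re
from typing import Dict, Optional, Tuple

KANA = r"[\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF]"

# Hypothetical stand-in for the kangji_mapping_zh_jp table:
# simplified-Chinese form -> Japanese kanji form.
HANZI_TO_KANGJI: Dict[str, str] = {"樱": "桜", "图书馆": "図書館"}
KANGJI_FORMS = set(HANZI_TO_KANGJI.values())


def detect_language(text: str) -> Tuple[str, str, bool]:
    """Mirror the new detection order: kana-only, then mapped kanji, then Latin."""
    text = text.strip()
    if not text:
        return "", "other", False
    if re.fullmatch(f"(?:{KANA})+", text):
        return text, "jp", False                   # pure kana, no kanji
    if re.search(r"[\u4e00-\u9fff]", text):
        if text in KANGJI_FORMS:
            return text, "jp", True                # already a Japanese form
        mapped: Optional[str] = HANZI_TO_KANGJI.get(text)
        if mapped:
            return mapped, "zh", True              # Chinese form, mapped for the search
        return text, "zh", False                   # unmapped Chinese
    if re.search(r"[a-zA-ZÀ-ÿ]", text):
        return text, "fr", False
    return text, "other", False


print(detect_language("ねこ"))     # ('ねこ', 'jp', False)
print(detect_language("图书馆"))   # ('図書館', 'zh', True)
print(detect_language("chien"))   # ('chien', 'fr', False)
```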
@@ -90,54 +77,37 @@ async def suggest_proverb(
     chi_exp_field: str = "chi_exp",
     limit: int = 10,
 ) -> List[Dict[str, str]]:
-    """
-    Generic suggestion helper (works for the multilingual proverb tables)
-
-    Parameters:
-        query: search keyword
-        lang: 'fr' or 'zh'
-        model: Tortoise ORM model class (e.g. ProverbFr)
-        proverb_field: foreign-language proverb field name
-        chi_exp_field: Chinese explanation field name
-        limit: maximum number of hits per match type
-
-    Search logic:
-        1. pick the search field based on the language
-        2. prefer results that start with the input
-        3. then results that contain the input but do not start with it
-        4. merge, deduplicate and return
-    """
     keyword = query.strip()
     if not keyword:
         return []

-    # ✅ pick the search field based on the language
+    # ✅ search conditions: for Chinese, match both fields together
     if lang == "zh":
-        startswith_field = f"{chi_exp_field}__istartswith"
-        contains_field = f"{chi_exp_field}__icontains"
+        start_condition = Q(**{f"{chi_exp_field}__istartswith": keyword}) | Q(
+            **{f"{search_field}__istartswith": keyword})
+        contain_condition = Q(**{f"{chi_exp_field}__icontains": keyword}) | Q(**{f"{search_field}__icontains": keyword})
     else:
-        startswith_field = f"{search_field}__istartswith"
-        contains_field = f"{search_field}__icontains"
+        start_condition = Q(**{f"{search_field}__istartswith": keyword})
+        contain_condition = Q(**{f"{search_field}__icontains": keyword})

     # ✅ 1. prefix matches
     start_matches = await (
-        model.filter(**{startswith_field: keyword})
-        .order_by("-freq")
+        model.filter(start_condition)
+        .order_by("-freq", "id")
         .limit(limit)
-        .values("id", target_field, search_field, chi_exp_field)
+        .values("id", target_field, chi_exp_field, "search_text")
     )

-    # ✅ 2. substring matches (non-prefix)
+    # ✅ 2. substring matches (but not prefix)
     contain_matches = await (
-        model.filter(
-            Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword})
-        )
-        .order_by("-freq")
+        model.filter(contain_condition & ~start_condition)
+        .order_by("-freq", "id")
         .limit(limit)
-        .values("id", target_field, search_field, chi_exp_field)
+        .values("id", target_field, chi_exp_field, "search_text")
     )

     # ✅ 3. merge and deduplicate (keep order)
-    results: List[Dict[str, str]] = []
+    results = []
     seen_ids = set()
     for row in start_matches + contain_matches:
         if row["id"] not in seen_ids:
@@ -145,11 +115,12 @@ async def suggest_proverb(
             results.append({
                 "id": row["id"],
                 "proverb": row[target_field],
-                "search_text": row[search_field],
-                "chi_exp": row[chi_exp_field]
+                "search_text": row["search_text"],
+                "chi_exp": row[chi_exp_field],
             })
-    return results
+    # ✅ cap the number of returned results
+    return results[:limit]


 async def suggest_autocomplete(query: SearchRequest, limit: int = 10):
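In the updated `suggest_proverb`, the prefix and substring filters are composed as `Q` expressions, so the Chinese branch can OR the gloss field and the search field together, and the substring pass excludes prefix hits with `~start_condition`. A runnable sketch of how those conditions combine, using a throwaway in-memory SQLite database and a `DemoIdiom` model invented for the example (the field names only approximate the real `IdiomJp`):

```python
import asyncio
from tortoise import Tortoise, fields
from tortoise.expressions import Q
from tortoise.models import Model


class DemoIdiom(Model):
    # Hypothetical stand-in for IdiomJp, limited to the fields the query touches.
    id = fields.IntField(pk=True)
    text = fields.TextField()
    search_text = fields.TextField()
    chi_exp = fields.TextField()
    freq = fields.IntField(default=0)


async def suggest(keyword: str, lang: str, limit: int = 10):
    # Chinese input may match either the gloss or the searchable text,
    # so both branches are OR-ed together; other languages use one field.
    if lang == "zh":
        start = Q(chi_exp__istartswith=keyword) | Q(search_text__istartswith=keyword)
        contain = Q(chi_exp__icontains=keyword) | Q(search_text__icontains=keyword)
    else:
        start = Q(search_text__istartswith=keyword)
        contain = Q(search_text__icontains=keyword)

    prefix_hits = await (
        DemoIdiom.filter(start).order_by("-freq", "id").limit(limit)
        .values("id", "text", "chi_exp", "search_text")
    )
    # Substring hits that are *not* prefix hits, so the two lists never overlap.
    other_hits = await (
        DemoIdiom.filter(contain & ~start).order_by("-freq", "id").limit(limit)
        .values("id", "text", "chi_exp", "search_text")
    )
    return (prefix_hits + other_hits)[:limit]


async def main():
    await Tortoise.init(db_url="sqlite://:memory:", modules={"models": ["__main__"]})
    await Tortoise.generate_schemas()
    await DemoIdiom.create(text="猫に小判", search_text="ねこにこばん", chi_exp="对牛弹琴", freq=3)
    await DemoIdiom.create(text="猫の手も借りたい", search_text="ねこのてもかりたい", chi_exp="忙得不可开交", freq=1)
    print(await suggest("ねこ", "jp"))
    await Tortoise.close_connections()


asyncio.run(main())
```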

View File

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Tuple, TypeVar
+from typing import Tuple, TypeVar, Optional

 import pandas as pd
 from tortoise import fields
@@ -18,8 +18,8 @@ class WordlistJp(Model):
     text = fields.CharField(max_length=40, description="单词")
     hiragana = fields.CharField(max_length=60, description="假名", null=False)
     freq = fields.IntField(default=0)
-    definitions : fields.ReverseRelation["DefinitionJp"]
-    attachments : fields.ReverseRelation["AttachmentJp"]
+    definitions: fields.ReverseRelation["DefinitionJp"]
+    attachments: fields.ReverseRelation["AttachmentJp"]

     class Meta:
         table = "wordlist_jp"
@@ -74,6 +74,7 @@ class DefinitionJp(Model):
     class Meta:
         table = "definitions_jp"

+
 class PosType(Model):
     id = fields.IntField(pk=True)
     pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False)
@@ -81,6 +82,7 @@ class PosType(Model):
     class Meta:
         table = "pos_type"

+
 class PronunciationTestJp(Model):
     id = fields.IntField(pk=True)
     text = fields.TextField(description="朗读文段")
@@ -88,6 +90,7 @@ class PronunciationTestJp(Model):
     class Meta:
         table = "pronunciationtest_jp"

+
 class IdiomJp(Model):
     id = fields.IntField(pk=True)
     text = fields.TextField(null=False)
@@ -100,12 +103,20 @@ class IdiomJp(Model):
     class Meta:
         table = "idiom_jp"

+
 class KangjiMapping(Model):
     id = fields.IntField(pk=True)
-    hanzi= fields.TextField(null=False)
-    kangji= fields.TextField(null=False)
-    note= fields.TextField(null=False)
+    hanzi = fields.TextField(null=False)
+    kangji = fields.TextField(null=False)
+    note = fields.TextField(null=False)
     created_at = fields.DatetimeField(auto_now_add=True)
+
+    @classmethod
+    async def chi2kangji(cls, text_chi: str) -> Optional[str]:
+        mapping = await KangjiMapping.get_or_none(hanzi=text_chi)
+        if not mapping:
+            return None
+        return mapping.kangji

     class Meta:
         table = "kangji_mapping_zh_jp"

View File

@@ -18,7 +18,7 @@
 from app.api.user.routes import users_router
 from app.api.word_comment.routes import word_comment_router
 from app.core.redis import init_redis, close_redis
 from app.utils.phone_encrypt import PhoneEncrypt
-from settings import TORTOISE_ORM
+from settings import ONLINE_SETTINGS


 @asynccontextmanager
@@ -46,7 +46,7 @@ app.add_middleware(
 register_tortoise(
     app=app,
-    config=TORTOISE_ORM,
+    config=ONLINE_SETTINGS,
 )

 app.include_router(users_router, tags=["User API"], prefix="/users")
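`register_tortoise` now reads its configuration from `ONLINE_SETTINGS` instead of `TORTOISE_ORM`. That object itself is not part of this diff; for reference, a hedged sketch of the dict shape Tortoise ORM expects for such a setting — the DSN and module paths below are placeholders, not the project's real values:

```python
# Hypothetical shape of ONLINE_SETTINGS in settings.py; real credentials and
# module paths belong to the project and do not appear in this commit.
ONLINE_SETTINGS = {
    "connections": {
        "default": "mysql://user:password@db-host:3306/dictionary",  # placeholder DSN
    },
    "apps": {
        "models": {
            "models": ["app.models", "aerich.models"],  # placeholder module list
            "default_connection": "default",
        },
    },
    "use_tz": False,
    "timezone": "Asia/Shanghai",
}
```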