更新谚语搜索,调整搜索函数

This commit is contained in:
Miyamizu-MitsuhaSang 2025-11-02 23:56:14 +08:00
parent 11ff892653
commit b16917215a
9 changed files with 106 additions and 53 deletions

View File

@@ -8,7 +8,7 @@ from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse
from app.api.search_dict.service import suggest_autocomplete from app.api.search_dict.service import suggest_autocomplete
from app.api.word_comment.word_comment_schemas import CommentSet from app.api.word_comment.word_comment_schemas import CommentSet
from app.models import DefinitionJp, CommentFr, CommentJp from app.models import DefinitionJp, CommentFr, CommentJp
from app.models.fr import DefinitionFr from app.models.fr import DefinitionFr, ProverbFr
from app.utils.all_kana import all_in_kana from app.utils.all_kana import all_in_kana
from app.utils.security import get_current_user from app.utils.security import get_current_user
from app.utils.textnorm import normalize_text from app.utils.textnorm import normalize_text
@@ -165,7 +165,12 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u
@dict_search.post("/search/proverb/list") @dict_search.post("/search/proverb/list")
async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)): async def search_proverb_list(query_word: ProverbSearchRequest):
lang: Literal['fr', 'zh'] = 'zh' if service.contains_chinese(query_word.query) else 'fr' lang = service.detect_language(text=query_word.query)
suggest_proverbs = await service.suggest_proverb(query=query_word, lang=lang) suggest_proverbs = await service.suggest_proverb(
query=query_word.query,
lang=lang,
model=ProverbFr,
)
# TODO 使用法语词典时是否存在用英语输入的情况
return {"list": suggest_proverbs} return {"list": suggest_proverbs}

View File

@@ -13,7 +13,7 @@ class SearchRequest(BaseModel):
class ProverbSearchRequest(BaseModel): class ProverbSearchRequest(BaseModel):
query: str query: str
language: Literal['fr', 'jp'] = "fr" dict_language: Literal['fr', 'jp'] = "fr"
class SearchItemJp(BaseModel): class SearchItemJp(BaseModel):

View File

@@ -1,9 +1,8 @@
import asyncio
import re import re
from typing import List, Tuple, Dict, Literal from typing import List, Tuple, Dict, Literal, Type
from fastapi import HTTPException from fastapi import HTTPException
from tortoise import Tortoise from tortoise import Tortoise, Model
from tortoise.expressions import Q from tortoise.expressions import Q
from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest
@@ -14,75 +13,95 @@ from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM from settings import TORTOISE_ORM
def contains_chinese(text: str) -> bool: def detect_language(text: str) -> Literal["fr", "zh", "jp", "other"]:
"""判断字符串中是否包含至少一个中文字符""" """
return bool(re.search(r'[\u4e00-\u9fff]', text)) 自动检测输入语言:
返回 'zh' / 'jp' / 'fr' / 'other'
"""
if re.search(r"[\u4e00-\u9fff]", text):
return "zh"
elif re.search(r"[\u3040-\u30ff\u31f0-\u31ff]", text): # 日文假名范围
return "jp"
elif re.search(r"[a-zA-ZÀ-ÿ]", text):
return "fr"
return "other"
async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse: async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
"""对于查询法语谚语的精准查询,返回详细信息"""
proverb = await ProverbFr.get_or_none(id=proverb_id) proverb = await ProverbFr.get_or_none(id=proverb_id)
if not proverb: if not proverb:
raise HTTPException(status_code=404, detail="Proverb not found") raise HTTPException(status_code=404, detail="Proverb not found")
return ProverbSearchResponse( return ProverbSearchResponse(
proverb_text=proverb.proverb, proverb_text=proverb.text,
chi_exp=proverb.chi_exp, chi_exp=proverb.chi_exp,
) )
async def suggest_proverb(query: ProverbSearchRequest, lang: Literal['fr', 'zh']) -> List[Dict[str, str]]: async def suggest_proverb(
query: str,
lang: Literal["fr", "zh", "jp"],
model: Type[Model],
proverb_field: str = "text",
chi_exp_field: str = "chi_exp",
limit: int = 10,
) -> List[Dict[str, str]]:
""" """
对法语谚语表进行搜索建议 通用搜索建议函数用于多语言谚语表
参数: 参数:
query.query: 搜索关键词 query: 搜索关键词
lang: 'fr' 'zh' lang: 'fr' 'zh'
逻辑: model: Tortoise ORM 模型类例如 ProverbFr
1. lang='fr'按谚语字段 (proverb) 搜索 proverb_field: 外语谚语字段名
2. lang='zh'按中文释义字段 (chi_exp) 搜索 chi_exp_field: 中文释义字段名
3. 优先以输入开头的匹配 limit: 每类匹配的最大返回数量
4. 其次为包含输入但不以其开头的匹配 freq 排序
:return: [{'id': 1, 'proverb': 'xxx'}, ...] 搜索逻辑:
1. 根据语言选择搜索字段
2. 优先匹配以输入开头的结果
3. 其次匹配包含输入但非开头的结果
4. 合并去重后返回
""" """
keyword = query.query.strip() keyword = query.strip()
results: List[Dict[str, str]] = []
if not keyword: if not keyword:
return results return []
# ✅ 根据语言决定搜索字段 # ✅ 根据语言选择搜索字段
if lang == "zh": if lang == "zh":
startswith_field = "chi_exp__istartswith" startswith_field = f"{chi_exp_field}__istartswith"
contains_field = "chi_exp__icontains" contains_field = f"{chi_exp_field}__icontains"
else: # 默认法语 else:
startswith_field = "proverb__istartswith" startswith_field = f"{proverb_field}__istartswith"
contains_field = "proverb__icontains" contains_field = f"{proverb_field}__icontains"
# ✅ 1. 开头匹配 # ✅ 1. 开头匹配
start_matches = await ( start_matches = await (
ProverbFr.filter(**{startswith_field: keyword}) model.filter(**{startswith_field: keyword})
.order_by("-freq") .order_by("-freq")
.limit(10) .limit(limit)
.values("id", "proverb", "chi_exp") .values("id", proverb_field, chi_exp_field)
) )
# ✅ 2. 包含匹配(但不是开头) # ✅ 2. 包含匹配(开头)
contain_matches = await ( contain_matches = await (
ProverbFr.filter( model.filter(
Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword}) Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword})
) )
.order_by("-freq") .order_by("-freq")
.limit(10) .limit(limit)
.values("id", "proverb", "chi_exp") .values("id", proverb_field, chi_exp_field)
) )
# ✅ 合并结果(去重并保持顺序) # ✅ 3. 合并去重并保持顺序
results: List[Dict[str, str]] = []
seen_ids = set() seen_ids = set()
for row in start_matches + contain_matches: for row in start_matches + contain_matches:
if row["id"] not in seen_ids: if row["id"] not in seen_ids:
seen_ids.add(row["id"]) seen_ids.add(row["id"])
results.append({ results.append({
"id": row["id"], "id": row["id"],
"proverb": row["proverb"], "proverb": row[proverb_field],
"chi_exp": row["chi_exp"] "chi_exp": row[chi_exp_field]
}) })
return results return results
@@ -205,4 +224,5 @@ async def __main():
if __name__ == '__main__': if __name__ == '__main__':
asyncio.run(__main()) # asyncio.run(__main())
print(detect_language(text="ahsjdasd"))

View File

@@ -8,8 +8,8 @@ from fastapi import APIRouter, Depends, HTTPException
from app.models import User from app.models import User
from app.schemas.trans_schemas import TransResponse, TransRequest from app.schemas.trans_schemas import TransResponse, TransRequest
from app.utils.md5 import make_md5
from app.utils.security import is_admin_user, get_current_user from app.utils.security import is_admin_user, get_current_user
from scripts.md5 import make_md5
from settings import settings from settings import settings
translator_router = APIRouter() translator_router = APIRouter()

View File

@@ -45,11 +45,10 @@ class DefinitionFr(Model):
class ProverbFr(Model): class ProverbFr(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
proverb = fields.TextField(description="法语谚语及常用表达") text = fields.TextField(description="法语谚语及常用表达")
chi_exp = fields.TextField(description="中文释义") chi_exp = fields.TextField(description="中文释义")
freq = fields.IntField(default=0) freq = fields.IntField(default=0)
created_at = fields.DatetimeField(auto_now_add=True) created_at = fields.DatetimeField(auto_now_add=True)
updated_at = fields.DatetimeField(auto_now=True)
class Meta: class Meta:
table = "proverb_fr" table = "proverb_fr"

View File

@@ -87,3 +87,14 @@ class PronunciationTestJp(Model):
class Meta: class Meta:
table = "pronunciationtest_jp" table = "pronunciationtest_jp"
class IdiomJp(Model):
id = fields.IntField(pk=True)
text = fields.TextField(null=False)
chi_exp = fields.TextField(null=False)
example = fields.TextField(null=False)
search_text = fields.TextField(null=False)
created_at = fields.DatetimeField(auto_now_add=True)
class Meta:
table = "idiom_jp"

View File

@@ -18,7 +18,7 @@ from app.api.user.routes import users_router
from app.api.word_comment.routes import word_comment_router from app.api.word_comment.routes import word_comment_router
from app.core.redis import init_redis, close_redis from app.core.redis import init_redis, close_redis
from app.utils.phone_encrypt import PhoneEncrypt from app.utils.phone_encrypt import PhoneEncrypt
from settings import ONLINE_SETTINGS from settings import TORTOISE_ORM
@asynccontextmanager @asynccontextmanager
@@ -46,7 +46,7 @@ app.add_middleware(
register_tortoise( register_tortoise(
app=app, app=app,
config=ONLINE_SETTINGS, config=TORTOISE_ORM,
) )
app.include_router(users_router, tags=["User API"], prefix="/users") app.include_router(users_router, tags=["User API"], prefix="/users")

View File

@@ -1,21 +1,21 @@
import asyncio import asyncio
import re import re
import unicodedata import unicodedata
import jaconv from importlib import resources
from pathlib import Path from pathlib import Path
import jaconv
import pandas as pd import pandas as pd
from fugashi import Tagger from fugashi import Tagger
import unidic_lite
from importlib import resources
from pykakasi import kakasi from pykakasi import kakasi
from tortoise import Tortoise from tortoise import Tortoise
from tortoise.exceptions import MultipleObjectsReturned from tortoise.exceptions import MultipleObjectsReturned
from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
from app.models.jp import IdiomJp
from settings import TORTOISE_ORM from settings import TORTOISE_ORM
xlsx_name = "./DictTable-20250823.xlsx" xlsx_name = "./DictTable_20251029.xlsx"
xlsx_path = Path(xlsx_name) xlsx_path = Path(xlsx_name)
@@ -228,6 +228,24 @@ async def set_hiragana(xlsx_path: Path = xlsx_path, sheet_name : str="日汉释
await WordlistJp.filter(text=word).update(hiragana=hiragana) await WordlistJp.filter(text=word).update(hiragana=hiragana)
async def import_idiom():
path = xlsx_path
df = pd.read_excel(path, sheet_name="日语惯用语")
df.columns = [col.strip() for col in df.columns]
for row in df.itertuples():
sentence = str(row[1]).strip()
search_text = str(row[2]).strip()
chi_exp = str(row[3]).strip()
example = str(row[4]).strip()
await IdiomJp.create(
text=sentence,
chi_exp=chi_exp,
example=example,
search_text=search_text,
)
async def main(): async def main():
await Tortoise.init(config=TORTOISE_ORM) await Tortoise.init(config=TORTOISE_ORM)
@@ -237,8 +255,8 @@ async def main():
# await import_wordlist_jp() # await import_wordlist_jp()
# await import_def_jp() # await import_def_jp()
# await import_attachment() # await import_attachment()
await set_hiragana() # await set_hiragana()
await import_idiom()
if __name__ == '__main__': if __name__ == '__main__':
asyncio.run(main()) asyncio.run(main())