更新谚语搜索,调整搜索函数

This commit is contained in:
Miyamizu-MitsuhaSang 2025-11-02 23:56:14 +08:00
parent 11ff892653
commit b16917215a
9 changed files with 106 additions and 53 deletions

View File

@ -8,7 +8,7 @@ from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse
from app.api.search_dict.service import suggest_autocomplete
from app.api.word_comment.word_comment_schemas import CommentSet
from app.models import DefinitionJp, CommentFr, CommentJp
from app.models.fr import DefinitionFr
from app.models.fr import DefinitionFr, ProverbFr
from app.utils.all_kana import all_in_kana
from app.utils.security import get_current_user
from app.utils.textnorm import normalize_text
@ -165,7 +165,12 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u
@dict_search.post("/search/proverb/list")
async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
lang: Literal['fr', 'zh'] = 'zh' if service.contains_chinese(query_word.query) else 'fr'
suggest_proverbs = await service.suggest_proverb(query=query_word, lang=lang)
async def search_proverb_list(query_word: ProverbSearchRequest):
lang = service.detect_language(text=query_word.query)
suggest_proverbs = await service.suggest_proverb(
query=query_word.query,
lang=lang,
model=ProverbFr,
)
# TODO 使用法语词典时是否存在用英语输入的情况
return {"list": suggest_proverbs}

View File

@ -13,7 +13,7 @@ class SearchRequest(BaseModel):
class ProverbSearchRequest(BaseModel):
query: str
language: Literal['fr', 'jp'] = "fr"
dict_language: Literal['fr', 'jp'] = "fr"
class SearchItemJp(BaseModel):

View File

@ -1,9 +1,8 @@
import asyncio
import re
from typing import List, Tuple, Dict, Literal
from typing import List, Tuple, Dict, Literal, Type
from fastapi import HTTPException
from tortoise import Tortoise
from tortoise import Tortoise, Model
from tortoise.expressions import Q
from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest
@ -14,75 +13,95 @@ from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM
def contains_chinese(text: str) -> bool:
    """Return True if *text* contains at least one CJK unified ideograph."""
    return re.search(r"[\u4e00-\u9fff]", text) is not None
def detect_language(text: str) -> Literal["fr", "zh", "jp", "other"]:
    """Detect the dominant script of *text*.

    Checks are ordered, so mixed-script input resolves to the first match:
        'zh'    any CJK unified ideograph (note: Japanese text containing
                only kanji, no kana, is therefore reported as 'zh');
        'jp'    any hiragana/katakana, incl. phonetic extensions;
        'fr'    any Latin letter (ASCII or accented Latin-1);
        'other' anything else (empty string, digits, punctuation, ...).
    """
    if re.search(r"[\u4e00-\u9fff]", text):
        return "zh"
    if re.search(r"[\u3040-\u30ff\u31f0-\u31ff]", text):  # kana ranges
        return "jp"
    # Latin letters. The naive range À-ÿ also covers × (U+00D7) and ÷
    # (U+00F7), which are math signs, not letters — exclude them.
    if re.search(r"[a-zA-ZÀ-ÖØ-öø-ÿ]", text):
        return "fr"
    return "other"
async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse:
    """Exact lookup of a French proverb by primary key.

    Args:
        proverb_id: primary key of the ProverbFr row.

    Returns:
        ProverbSearchResponse carrying the proverb text and its Chinese gloss.

    Raises:
        HTTPException: 404 if no row with that id exists.
    """
    proverb = await ProverbFr.get_or_none(id=proverb_id)
    if not proverb:
        raise HTTPException(status_code=404, detail="Proverb not found")
    return ProverbSearchResponse(
        # Model field was renamed `proverb` -> `text` in this revision;
        # keep only the new name (the pasted diff carried both kwargs).
        proverb_text=proverb.text,
        chi_exp=proverb.chi_exp,
    )
async def suggest_proverb(
    query: str,
    lang: Literal["fr", "zh", "jp"],
    model: Type[Model],
    proverb_field: str = "text",
    chi_exp_field: str = "chi_exp",
    limit: int = 10,
) -> List[Dict[str, str]]:
    """Generic autocomplete suggestions over a multilingual proverb table.

    Args:
        query: raw search keyword (whitespace is stripped).
        lang: detected language of the keyword; 'zh' searches the Chinese
            gloss column, any other value searches the foreign-language column.
        model: Tortoise ORM model class to query (e.g. ProverbFr).
        proverb_field: name of the column holding the foreign-language text.
        chi_exp_field: name of the column holding the Chinese explanation.
        limit: maximum rows fetched per match category.

    Strategy:
        1. prefix matches first, ordered by -freq;
        2. then substring matches that are NOT prefixes, ordered by -freq;
        3. merge, dedupe by id, preserving order (prefix matches win).

    Returns:
        [{'id': ..., 'proverb': ..., 'chi_exp': ...}, ...]
    """
    keyword = query.strip()
    if not keyword:
        return []

    # Pick the searchable column from the detected language.
    if lang == "zh":
        startswith_field = f"{chi_exp_field}__istartswith"
        contains_field = f"{chi_exp_field}__icontains"
    else:
        startswith_field = f"{proverb_field}__istartswith"
        contains_field = f"{proverb_field}__icontains"

    # 1. prefix matches
    start_matches = await (
        model.filter(**{startswith_field: keyword})
        .order_by("-freq")
        .limit(limit)
        .values("id", proverb_field, chi_exp_field)
    )

    # 2. substring matches that are not already prefix matches
    contain_matches = await (
        model.filter(
            Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword})
        )
        .order_by("-freq")
        .limit(limit)
        .values("id", proverb_field, chi_exp_field)
    )

    # 3. merge, dedupe by id, keep order
    results: List[Dict[str, str]] = []
    seen_ids = set()
    for row in start_matches + contain_matches:
        if row["id"] not in seen_ids:
            seen_ids.add(row["id"])
            results.append({
                "id": row["id"],
                "proverb": row[proverb_field],
                "chi_exp": row[chi_exp_field],
            })
    return results
@ -205,4 +224,5 @@ async def __main():
if __name__ == '__main__':
asyncio.run(__main())
# asyncio.run(__main())
print(detect_language(text="ahsjdasd"))

View File

@ -8,8 +8,8 @@ from fastapi import APIRouter, Depends, HTTPException
from app.models import User
from app.schemas.trans_schemas import TransResponse, TransRequest
from app.utils.md5 import make_md5
from app.utils.security import is_admin_user, get_current_user
from scripts.md5 import make_md5
from settings import settings
translator_router = APIRouter()

View File

@ -45,11 +45,10 @@ class DefinitionFr(Model):
class ProverbFr(Model):
    # French proverbs / common expressions with their Chinese glosses.
    id = fields.IntField(pk=True)
    # Renamed from `proverb` in this revision; the pasted diff carried both
    # the old and new field lines — keep only the new name.
    text = fields.TextField(description="法语谚语及常用表达")
    chi_exp = fields.TextField(description="中文释义")
    # Frequency counter used by suggest_proverb's `-freq` ordering.
    freq = fields.IntField(default=0)
    created_at = fields.DatetimeField(auto_now_add=True)
    updated_at = fields.DatetimeField(auto_now=True)

    class Meta:
        table = "proverb_fr"

View File

@ -87,3 +87,14 @@ class PronunciationTestJp(Model):
class Meta:
table = "pronunciationtest_jp"
class IdiomJp(Model):
    # Japanese idioms / set phrases imported from the dictionary spreadsheet
    # (see the `import_idiom` importer, sheet "日语惯用语").
    id = fields.IntField(pk=True)
    # The idiom itself (surface form).
    text = fields.TextField(null=False)
    # Chinese explanation of the idiom.
    chi_exp = fields.TextField(null=False)
    # Example sentence using the idiom.
    example = fields.TextField(null=False)
    # Lookup key column — presumably a normalized reading/kana form;
    # TODO(review): confirm against the spreadsheet's second column.
    search_text = fields.TextField(null=False)
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "idiom_jp"

View File

@ -18,7 +18,7 @@ from app.api.user.routes import users_router
from app.api.word_comment.routes import word_comment_router
from app.core.redis import init_redis, close_redis
from app.utils.phone_encrypt import PhoneEncrypt
from settings import ONLINE_SETTINGS
from settings import TORTOISE_ORM
@asynccontextmanager
@ -46,7 +46,7 @@ app.add_middleware(
register_tortoise(
app=app,
config=ONLINE_SETTINGS,
config=TORTOISE_ORM,
)
app.include_router(users_router, tags=["User API"], prefix="/users")

View File

@ -1,21 +1,21 @@
import asyncio
import re
import unicodedata
import jaconv
from importlib import resources
from pathlib import Path
import jaconv
import pandas as pd
from fugashi import Tagger
import unidic_lite
from importlib import resources
from pykakasi import kakasi
from tortoise import Tortoise
from tortoise.exceptions import MultipleObjectsReturned
from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
from app.models.jp import IdiomJp
from settings import TORTOISE_ORM
xlsx_name = "./DictTable-20250823.xlsx"
xlsx_name = "./DictTable_20251029.xlsx"
xlsx_path = Path(xlsx_name)
@ -228,6 +228,24 @@ async def set_hiragana(xlsx_path: Path = xlsx_path, sheet_name : str="日汉释
await WordlistJp.filter(text=word).update(hiragana=hiragana)
async def import_idiom():
    """Load the '日语惯用语' sheet of the dictionary workbook into IdiomJp."""
    frame = pd.read_excel(xlsx_path, sheet_name="日语惯用语")
    frame.columns = [name.strip() for name in frame.columns]
    for record in frame.itertuples():
        # Positional access: record[0] is the DataFrame index; columns 1-4
        # are idiom text, lookup text, Chinese gloss, example sentence.
        idiom_text = str(record[1]).strip()
        lookup_text = str(record[2]).strip()
        chinese_gloss = str(record[3]).strip()
        example_sentence = str(record[4]).strip()
        await IdiomJp.create(
            text=idiom_text,
            chi_exp=chinese_gloss,
            example=example_sentence,
            search_text=lookup_text,
        )
async def main():
await Tortoise.init(config=TORTOISE_ORM)
@ -237,8 +255,8 @@ async def main():
# await import_wordlist_jp()
# await import_def_jp()
# await import_attachment()
await set_hiragana()
# await set_hiragana()
await import_idiom()
if __name__ == '__main__':
asyncio.run(main())