Compare commits

...

3 Commits

Author SHA1 Message Date
Miyamizu-MitsuhaSang 8897680d24 更新谚语搜索,调整搜索函数 2025-11-03 17:53:36 +08:00
Miyamizu-MitsuhaSang 2608a27abc 更新谚语搜索,调整搜索函数 2025-11-03 17:43:33 +08:00
Miyamizu-MitsuhaSang d7658db3e8 更新谚语搜索,调整搜索函数 2025-11-03 17:29:49 +08:00
5 changed files with 184 additions and 133 deletions

View File

@ -344,8 +344,9 @@ Authorization: Bearer <your_jwt_token>
```json ```json
{ {
"result": { "result": {
"proverb_text": "Petit à petit, l'oiseau fait son nid.", "text": "Petit à petit, l'oiseau fait son nid.",
"chi_exp": "循序渐进才能取得成功。" "chi_exp": "循序渐进才能取得成功。",
"freq": 128
} }
} }
``` ```
@ -356,7 +357,7 @@ Authorization: Bearer <your_jwt_token>
#### 2.3 单词联想建议 #### 2.3 单词联想建议
- **接口**: `POST /search/word/list` - **接口**: `POST /search/list/word`
- **描述**: 根据用户输入返回单词联想列表,含前缀匹配与包含匹配。 - **描述**: 根据用户输入返回单词联想列表,含前缀匹配与包含匹配。
- **需要认证**: 是 - **需要认证**: 是
- **请求体**: - **请求体**:
@ -380,16 +381,20 @@ Authorization: Bearer <your_jwt_token>
> **说明**: `language = "jp"` 时返回形如 `[["愛", "あい"], ["愛する", "あいする"]]` 的二维数组,第二列为假名读音。 > **说明**: `language = "jp"` 时返回形如 `[["愛", "あい"], ["愛する", "あいする"]]` 的二维数组,第二列为假名读音。
- **状态码**:
- `200`: 查询成功
#### 2.4 谚语联想建议 #### 2.4 谚语联想建议
- **接口**: `POST /search/proverb/list` - **接口**: `POST /search/list/proverb`
- **描述**: 按输入内容返回谚语候选列表,后端会自动检测输入语言(中文/日文假名/拉丁字母),无法识别时退回法语字段搜索。 - **描述**: 按输入内容返回谚语候选列表,后端会自动检测输入语言(中文/日文假名/拉丁字母),无法识别时退回法语字段搜索。
- **需要认证**: 是 - **需要认证**: 是
- **请求体**: - **请求体**:
```json ```json
{ {
"query": "慢" "query": "慢",
"dict_language": "fr"
} }
``` ```
@ -410,6 +415,64 @@ Authorization: Bearer <your_jwt_token>
- **状态码**: - **状态码**:
- `200`: 查询成功 - `200`: 查询成功
#### 2.5 日语惯用语联想建议
- **接口**: `POST /search/list/idiom`
- **描述**: 针对日语惯用语返回联想候选,支持输入日文假名或中文汉字;若输入匹配汉字映射表,会并发查询假名结果并合并输出。
- **需要认证**: 是
- **请求体**:
```json
{
"query": "愛してる",
"dict_language": "jp"
}
```
- **响应示例**:
```json
{
"list": [
{
"id": 21,
"proverb": "愛してる",
"chi_exp": "我爱你"
}
]
}
```
- **状态码**:
- `200`: 查询成功
- `400`: 当 `dict_language` 为 `fr` 时返回错误信息(后端当前仅显式拒绝 `fr`,并非所有非 `jp` 值)
#### 2.6 日语惯用语详情
- **接口**: `POST /search/idiom`
- **描述**: 根据惯用语 ID 返回详细信息并增加访问频次。
- **需要认证**: 是
- **查询参数**:
- `query_id`: 惯用语 ID (integer)
- **响应示例**:
```json
{
"result": {
"id": 21,
"text": "愛してる",
"search_text": "あいしてる",
"chi_exp": "我爱你",
"example": "私はあなたを愛してる。",
"freq": 57
}
}
```
- **状态码**:
- `200`: 查询成功
- `404`: 惯用语不存在
--- ---
### 3. 翻译模块 (`/translate`) ### 3. 翻译模块 (`/translate`)
@ -1061,7 +1124,7 @@ curl -X POST "http://127.0.0.1:8000/search/word" \
}' }'
# 4. 获取单词联想列表 # 4. 获取单词联想列表
curl -X POST "http://127.0.0.1:8000/search/word/list" \ curl -X POST "http://127.0.0.1:8000/search/list/word" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-H "Authorization: Bearer <your_token_here>" \ -H "Authorization: Bearer <your_token_here>" \
-d '{ -d '{

View File

@ -1,3 +1,4 @@
import asyncio
from typing import Literal, List from typing import Literal, List
from fastapi import APIRouter, Depends, HTTPException, Request, Form from fastapi import APIRouter, Depends, HTTPException, Request, Form
@ -5,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request, Form
from app.api.search_dict import service from app.api.search_dict import service
from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse, SearchItemFr, SearchItemJp, \ from app.api.search_dict.search_schemas import SearchRequest, WordSearchResponse, SearchItemFr, SearchItemJp, \
ProverbSearchRequest ProverbSearchRequest
from app.api.search_dict.service import suggest_autocomplete, accurate_proverb from app.api.search_dict.service import suggest_autocomplete
from app.api.word_comment.word_comment_schemas import CommentSet from app.api.word_comment.word_comment_schemas import CommentSet
from app.models import DefinitionJp, CommentFr, CommentJp from app.models import DefinitionJp, CommentFr, CommentJp
from app.models.fr import DefinitionFr, ProverbFr from app.models.fr import DefinitionFr, ProverbFr
@ -136,23 +137,10 @@ async def search(request: Request, body: SearchRequest, user=Depends(get_current
) )
@dict_search.post("/search/proverb")
async def proverb(request: Request, proverb_id: int, user=Depends(get_current_user)):
"""
用于法语谚语搜索
:param request:
:param body: 要求用户输入的内容必须为法语
:param user:
:return:
"""
content = await service.accurate_proverb(proverb_id=proverb_id)
return content
# TODO 相关度排序(转换为模糊匹配) # TODO 相关度排序(转换为模糊匹配)
# TODO 输入搜索框时反馈内容 # TODO 输入搜索框时反馈内容
@dict_search.post("/search/word/list") @dict_search.post("/search/list/word")
async def search_word_list(query_word: SearchRequest, user=Depends(get_current_user)): async def search_word_list(query_word: SearchRequest, user=Depends(get_current_user)):
""" """
检索时的提示接口 检索时的提示接口
@ -165,9 +153,9 @@ async def search_word_list(query_word: SearchRequest, user=Depends(get_current_u
return {"list": word_contents} return {"list": word_contents}
@dict_search.post("/search/proverb/list") @dict_search.post("/search/list/proverb")
async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)): async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get_current_user)):
lang = service.detect_language(text=query_word.query)[1] query, lang, _ = service.detect_language(text=query_word.query)
query = normalize_text(query_word.query) if lang == "fr" else query_word.query query = normalize_text(query_word.query) if lang == "fr" else query_word.query
suggest_proverbs = await service.suggest_proverb( suggest_proverbs = await service.suggest_proverb(
query=query_word.query, query=query_word.query,
@ -180,35 +168,53 @@ async def search_proverb_list(query_word: ProverbSearchRequest, user=Depends(get
@dict_search.post("/search/proverb") @dict_search.post("/search/proverb")
async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)): async def search_proverb(proverb_id: int = Form(...), user=Depends(get_current_user)):
result = await service.accurate_proverb(proverb_id=proverb_id) result = await service.accurate_idiom_proverb(search_id=proverb_id, model=ProverbFr, only_fields=["text", "chi_exp"])
return {"result": result} return {"result": result}
@dict_search.post("/search/idiom/list") @dict_search.post("/search/list/idiom")
async def search_idiom_list(query_idiom: ProverbSearchRequest): async def search_idiom_list(query_idiom: ProverbSearchRequest, user=Depends(get_current_user)):
if query_idiom.dict_language == "fr": if query_idiom.dict_language == "fr":
raise HTTPException(status_code=400, detail="Dict language Error") raise HTTPException(status_code=400, detail="Dict language Error")
trad_query, lang = service.detect_language(text=query_idiom.query)
mapping_query, lang, is_kangji = await service.detect_language(text=query_idiom.query)
query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query query = all_in_kana(text=query_idiom.query) if lang == "jp" else query_idiom.query
result = await service.suggest_proverb(
# ✅ 并发任务列表
tasks = [
service.suggest_proverb(
query=query, query=query,
lang=lang, lang=lang,
model=IdiomJp, model=IdiomJp,
search_field="search_text", search_field="search_text",
target_field="text", target_field="text",
) )
if lang == "zh": ]
trad_query = all_in_kana(text=query_idiom.query)
search_idioms_from_chi = await service.suggest_proverb( if lang == "zh" and is_kangji:
query=trad_query, # jp_query = all_in_kana(text=query_idiom.query)
tasks.append(
service.suggest_proverb(
query=mapping_query,
lang="jp", lang="jp",
model=IdiomJp, model=IdiomJp,
search_field="text",
) )
result[:0] = search_idioms_from_chi )
# ✅ 并发执行(返回结果顺序与任务顺序一致)
results = await asyncio.gather(*tasks)
# ✅ 合并结果
result = results[0]
if len(results) > 1:
result[:0] = results[1] # 将中文映射查询结果插到最前面
return {"list": result} return {"list": result}
@dict_search.post("/search/idiom") @dict_search.post("/search/idiom")
async def search_idiom(query_id: int): async def search_idiom(query_id: int, user=Depends(get_current_user)):
result = await accurate_proverb(proverb_id=query_id) result = await service.accurate_idiom_proverb(search_id=query_id, model=IdiomJp, only_fields=["id", "text", "search_text", "chi_exp", "example"])
return {"result": result} return {"result": result}

View File

@ -2,83 +2,70 @@ import re
from typing import List, Tuple, Dict, Literal, Type from typing import List, Tuple, Dict, Literal, Type
from fastapi import HTTPException from fastapi import HTTPException
from opencc import OpenCC
from tortoise import Tortoise, Model from tortoise import Tortoise, Model
from tortoise.expressions import Q from tortoise.expressions import Q
from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchResponse, ProverbSearchRequest from app.api.search_dict.search_schemas import SearchRequest, ProverbSearchRequest
from app.models import WordlistFr, WordlistJp from app.models import WordlistFr, WordlistJp, KangjiMapping
from app.models.fr import ProverbFr
from app.utils.all_kana import all_in_kana from app.utils.all_kana import all_in_kana
from app.utils.textnorm import normalize_text from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM from settings import TORTOISE_ORM
def detect_language(text: str) -> Tuple[str, Literal["fr", "zh", "jp", "other"]]: async def detect_language(text: str) -> Tuple[str, str, bool]:
""" """
自动检测输入语言: 自动检测输入语言:
- zh: 简体中文 - zh: 简体中文
- jp: 日语含假名或繁体/ - jp: 日语含假名或旧字
- fr: 拉丁字母法语等 - fr: 拉丁字母法语等
- other: 其他 - other: 其他
"""
cc_s2t = OpenCC('s2t') # 简体 → 繁体
cc_t2s = OpenCC('t2s') # 繁体 → 简体
返回:
(映射或原文本, 语言代码, 是否为含汉字且命中映射表的情况)
"""
JAPANESE_HIRAGANA = r"[\u3040-\u309F]" JAPANESE_HIRAGANA = r"[\u3040-\u309F]"
JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]" JAPANESE_KATAKANA = r"[\u30A0-\u30FF\u31F0-\u31FF]"
text = text.strip() text = text.strip()
if not text: if not text:
return "", "other" return "", "other", False
# ✅ Step 1: 假名检测 # ✅ Step 1: 全部假名(无汉字)
if re.search(JAPANESE_HIRAGANA, text) or re.search(JAPANESE_KATAKANA, text): if re.fullmatch(f"(?:{JAPANESE_HIRAGANA}|{JAPANESE_KATAKANA})+", text):
return text, "jp" return text, "jp", False
# ✅ Step 2: 汉字检测 # ✅ Step 2: 汉字检测
if re.search(r"[\u4e00-\u9fff]", text): if re.search(r"[\u4e00-\u9fff]", text):
# 简繁互转对比 # 优先判断是否为日语汉字
to_trad = cc_s2t.convert(text) jp_match = await KangjiMapping.get_or_none(kangji=text).only("kangji")
to_simp = cc_t2s.convert(text) if jp_match:
return text, "jp", True # 含汉字且命中日语列
# 如果输入等于繁体转换结果 → 繁体或日文汉字 # 再检查是否为中文汉字
if text == to_trad and text != to_simp: zh_match = await KangjiMapping.get_or_none(hanzi=text).only("hanzi", "kangji")
return text, "jp" if zh_match:
# 如果输入等于简体转换结果 → 简体中文 return zh_match.kangji, "zh", True # 含汉字且命中中文列
elif text == to_simp and text != to_trad:
return to_trad, "zh" # 注意返回的是繁体形式用于补充搜索
# 否则混合(既有简体又有繁体)
else:
# 混合时可优先认定为繁体(日语)
return to_trad, "jp"
# ✅ Step 3: 拉丁字母检测 # 若都不在映射表中,则为未映射的中文
return text, "zh", False
# ✅ Step 3: 拉丁字母检测(如法语)
if re.search(r"[a-zA-ZÀ-ÿ]", text): if re.search(r"[a-zA-ZÀ-ÿ]", text):
return text, "fr" return text, "fr", False
return text, "other" # ✅ Step 4: 其他情况(符号、空格等)
return text, "other", False
async def accurate_proverb(proverb_id: int) -> ProverbSearchResponse: async def accurate_idiom_proverb(search_id: int, model: Type[Model], only_fields: List[str] = None):
"""对于查询法语谚语的精准查询,返回详细信息""" if "freq" not in only_fields:
proverb = await ProverbFr.get_or_none(id=proverb_id) only_fields.append("freq")
if not proverb: result = await model.get_or_none(id=search_id).only(*only_fields)
raise HTTPException(status_code=404, detail="Proverb not found") if not result:
proverb.freq = proverb.freq + 1 raise HTTPException(status_code=404, detail="Target not found")
await proverb.save() result.freq = result.freq + 1
return ProverbSearchResponse( await result.save(update_fields=["freq"])
proverb_text=proverb.text, return result
chi_exp=proverb.chi_exp,
)
async def accurate_idiom(idiom_id: int):
proverb = await ProverbFr.get_or_none(id=idiom_id)
if not proverb:
raise HTTPException(status_code=404, detail="Proverb not found")
proverb.freq = proverb.freq + 1
await proverb.save()
return proverb
async def suggest_proverb( async def suggest_proverb(
@ -90,54 +77,37 @@ async def suggest_proverb(
chi_exp_field: str = "chi_exp", chi_exp_field: str = "chi_exp",
limit: int = 10, limit: int = 10,
) -> List[Dict[str, str]]: ) -> List[Dict[str, str]]:
"""
通用搜索建议函数用于多语言谚语表
参数:
query: 搜索关键词
lang: 'fr' 'zh'
model: Tortoise ORM 模型类例如 ProverbFr
proverb_field: 外语谚语字段名
chi_exp_field: 中文释义字段名
limit: 每类匹配的最大返回数量
搜索逻辑:
1. 根据语言选择搜索字段
2. 优先匹配以输入开头的结果
3. 其次匹配包含输入但非开头的结果
4. 合并去重后返回
"""
keyword = query.strip() keyword = query.strip()
if not keyword: if not keyword:
return [] return []
# ✅ 根据语言选择搜索字段 # ✅ 搜索条件:中文时双字段联合匹配
if lang == "zh": if lang == "zh":
startswith_field = f"{chi_exp_field}__istartswith" start_condition = Q(**{f"{chi_exp_field}__istartswith": keyword}) | Q(
contains_field = f"{chi_exp_field}__icontains" **{f"{search_field}__istartswith": keyword})
contain_condition = Q(**{f"{chi_exp_field}__icontains": keyword}) | Q(**{f"{search_field}__icontains": keyword})
else: else:
startswith_field = f"{search_field}__istartswith" start_condition = Q(**{f"{search_field}__istartswith": keyword})
contains_field = f"{search_field}__icontains" contain_condition = Q(**{f"{search_field}__icontains": keyword})
# ✅ 1. 开头匹配 # ✅ 1. 开头匹配
start_matches = await ( start_matches = await (
model.filter(**{startswith_field: keyword}) model.filter(start_condition)
.order_by("-freq") .order_by("-freq", "id")
.limit(limit) .limit(limit)
.values("id", target_field, search_field, chi_exp_field) .values("id", target_field, chi_exp_field, "search_text")
) )
# ✅ 2. 包含匹配(开头) # ✅ 2. 包含匹配(但不是开头)
contain_matches = await ( contain_matches = await (
model.filter( model.filter(contain_condition & ~start_condition)
Q(**{contains_field: keyword}) & ~Q(**{startswith_field: keyword}) .order_by("-freq", "id")
)
.order_by("-freq")
.limit(limit) .limit(limit)
.values("id", target_field, search_field, chi_exp_field) .values("id", target_field, chi_exp_field, "search_text")
) )
# ✅ 3. 合并去重保持顺序 # ✅ 3. 合并去重保持顺序
results: List[Dict[str, str]] = [] results = []
seen_ids = set() seen_ids = set()
for row in start_matches + contain_matches: for row in start_matches + contain_matches:
if row["id"] not in seen_ids: if row["id"] not in seen_ids:
@ -145,11 +115,12 @@ async def suggest_proverb(
results.append({ results.append({
"id": row["id"], "id": row["id"],
"proverb": row[target_field], "proverb": row[target_field],
"search_text": row[search_field], "search_text": row["search_text"],
"chi_exp": row[chi_exp_field] "chi_exp": row[chi_exp_field],
}) })
return results # ✅ 截断最终返回数量
return results[:limit]
async def suggest_autocomplete(query: SearchRequest, limit: int = 10): async def suggest_autocomplete(query: SearchRequest, limit: int = 10):

View File

@ -1,6 +1,6 @@
from __future__ import annotations from __future__ import annotations
from typing import Tuple, TypeVar from typing import Tuple, TypeVar, Optional
import pandas as pd import pandas as pd
from tortoise import fields from tortoise import fields
@ -18,8 +18,8 @@ class WordlistJp(Model):
text = fields.CharField(max_length=40, description="单词") text = fields.CharField(max_length=40, description="单词")
hiragana = fields.CharField(max_length=60, description="假名", null=False) hiragana = fields.CharField(max_length=60, description="假名", null=False)
freq = fields.IntField(default=0) freq = fields.IntField(default=0)
definitions : fields.ReverseRelation["DefinitionJp"] definitions: fields.ReverseRelation["DefinitionJp"]
attachments : fields.ReverseRelation["AttachmentJp"] attachments: fields.ReverseRelation["AttachmentJp"]
class Meta: class Meta:
table = "wordlist_jp" table = "wordlist_jp"
@ -74,6 +74,7 @@ class DefinitionJp(Model):
class Meta: class Meta:
table = "definitions_jp" table = "definitions_jp"
class PosType(Model): class PosType(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False) pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False)
@ -81,6 +82,7 @@ class PosType(Model):
class Meta: class Meta:
table = "pos_type" table = "pos_type"
class PronunciationTestJp(Model): class PronunciationTestJp(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
text = fields.TextField(description="朗读文段") text = fields.TextField(description="朗读文段")
@ -88,6 +90,7 @@ class PronunciationTestJp(Model):
class Meta: class Meta:
table = "pronunciationtest_jp" table = "pronunciationtest_jp"
class IdiomJp(Model): class IdiomJp(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
text = fields.TextField(null=False) text = fields.TextField(null=False)
@ -100,12 +103,20 @@ class IdiomJp(Model):
class Meta: class Meta:
table = "idiom_jp" table = "idiom_jp"
class KangjiMapping(Model): class KangjiMapping(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
hanzi= fields.TextField(null=False) hanzi = fields.TextField(null=False)
kangji= fields.TextField(null=False) kangji = fields.TextField(null=False)
note= fields.TextField(null=False) note = fields.TextField(null=False)
created_at = fields.DatetimeField(auto_now_add=True) created_at = fields.DatetimeField(auto_now_add=True)
@classmethod
async def chi2kangji(text_chi: str) -> Optional[str]:
mapping = await KangjiMapping.get_or_none(hanzi=text_chi)
if not mapping:
return None
return mapping.kangji
class Meta: class Meta:
table = "kangji_mapping_zh_jp" table = "kangji_mapping_zh_jp"

View File

@ -18,7 +18,7 @@ from app.api.user.routes import users_router
from app.api.word_comment.routes import word_comment_router from app.api.word_comment.routes import word_comment_router
from app.core.redis import init_redis, close_redis from app.core.redis import init_redis, close_redis
from app.utils.phone_encrypt import PhoneEncrypt from app.utils.phone_encrypt import PhoneEncrypt
from settings import TORTOISE_ORM from settings import ONLINE_SETTINGS
@asynccontextmanager @asynccontextmanager
@ -46,7 +46,7 @@ app.add_middleware(
register_tortoise( register_tortoise(
app=app, app=app,
config=TORTOISE_ORM, config=ONLINE_SETTINGS,
) )
app.include_router(users_router, tags=["User API"], prefix="/users") app.include_router(users_router, tags=["User API"], prefix="/users")