diff --git a/.idea/dict_server.iml b/.idea/dict_server.iml index 5305fe2..53b48ab 100644 --- a/.idea/dict_server.iml +++ b/.idea/dict_server.iml @@ -4,7 +4,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index dbda99f..1b75482 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/app/api/users.py b/app/api/users.py index 7f34085..c99f7e8 100644 --- a/app/api/users.py +++ b/app/api/users.py @@ -6,7 +6,7 @@ import redis.asyncio as redis from app.models.base import ReservedWords, User, Language from app.utils.security import verify_password, hash_password, validate_password, validate_username, get_current_user -from settings import SECRET_KEY +from settings import settings from app.core.redis import get_redis from app.schemas.user_schemas import UserIn, UserOut, UpdateUserRequest, UserLoginRequest @@ -67,7 +67,7 @@ async def user_login(user_in: UserLoginRequest): "is_admin": user.is_admin, } - token = jwt.encode(payload, SECRET_KEY, algorithm="HS256") + token = jwt.encode(payload, settings.SECRET_KEY, algorithm="HS256") return { "access_token": token, diff --git a/app/models/fr.py b/app/models/fr.py index 42825ac..60e7eb3 100644 --- a/app/models/fr.py +++ b/app/models/fr.py @@ -12,42 +12,17 @@ sheet_name_fr = "法英中释义" class WordlistFr(Model): id = fields.IntField(pk=True) - language = fields.CharField(max_length=20, description="单词语种") text = fields.CharField(max_length=40, unique=True, description="单词") - definitions = fields.ReverseRelation("DefinitionFr") - attachments = fields.ReverseRelation("AttachmentsFr") + definitions: fields.ReverseRelation["DefinitionFr"] + attachments: fields.ReverseRelation["AttachmentFr"] + freq = fields.IntField() # 词频排序用 + search_text = fields.CharField(max_length=255, index=True) # 检索字段 # attachment = fields.ForeignKeyField("models.Attachment", related_name="wordlists", on_delete=fields.CASCADE) # source = fields.CharField(max_length=20, description="", null=True) class Meta: table = "wordlist_fr" - T = TypeVar("T", bound=Model) - - @classmethod - async def update_or_create(cls: Type[T], **kwargs) -> Tuple[T, bool]: - print("传入参数为:", kwargs) - if not kwargs: - raise ValueError("必须提供至少一个字段作为参数") - - created: bool = False - - # 使用 kwargs 中第一个字段作为查找条件 - first_key = next(iter(kwargs)) - lookup = {first_key: kwargs[first_key]} - - word = await cls.filter(**lookup).first() # 参数展开语法 - if word: - for k, v in kwargs.items(): - if k != first_key: - setattr(word, k, v) - await word.save() - else: - await cls.create(**kwargs) - created = True - - return word, created - class AttachmentFr(Model): id = fields.IntField(pk=True) @@ -63,91 +38,10 @@ class AttachmentFr(Model): class DefinitionFr(Model): id = fields.IntField(pk=True) word = fields.ForeignKeyField("models.WordlistFr", related_name="definitions", on_delete=fields.CASCADE) - pos = fields.CharEnumField(PosEnumFr, max_length=30) # ✅ 把词性放在释义层面 + pos = fields.CharEnumField(PosEnumFr, max_length=30, null=True) # ✅ 把词性放在释义层面 meaning = fields.TextField(description="单词释义") # 如:“学习” example = fields.TextField(null=True, description="单词例句") eng_explanation = fields.TextField(null=True, description="English explanation") class Meta: table = "definitions_fr" - - @classmethod - async def init_from_xlsx( - cls, - filepath: str, - sheet_name: str - ): - """ - Initiate the database from xlsx file. Only read in data without checking - whether the content already exists. - :param filepath: receive both relative or absolute path - :param sheet_name: specific sheet name inside the .xlsx file - :return: None - """ - df = pd.read_excel(filepath, sheet_name=sheet_name, na_filter=True) - df.columns = [col.strip() for col in df.columns] - df.dropna(how="all", inplace=True) - - # create_cnt = 0 - DEF_COUNT = 1 - - for row in df.itertuples(): - word = row.单词 - cls_word = await WordlistFr.filter(text=word).first() - if cls_word is None: - print(f"未找到 word: {word}") - continue - pos = getattr(row, f"词性{DEF_COUNT}") - if pd.isna(pos): - continue - meaning = getattr(row, f"中文释义{DEF_COUNT}") - eng_exp = getattr(row, f"英语释义{DEF_COUNT}") - await DefinitionFr.create( - part_of_speech=pos, - meaning=meaning, - eng_explanation=eng_exp, - word=cls_word - ) - - # TODO revise the function (check update or create by id) - @classmethod - async def update_or_create_meaning( - cls, - word_obj, - target_language_obj, - part_of_speech: str, - meaning: str, - example: str = None, - eng_explanation: str = None, - ) -> tuple["DefinitionFr", bool]: - """ - 查询某个单词是否已有该释义(依据四元组作为唯一标识),存在则更新,不存在则新增。 - 返回:(对象, 是否为新创建) - """ - query = { - "word": word_obj, - "target_language": target_language_obj, - "part_of_speech": part_of_speech, - "meaning": meaning - } - - obj = await cls.filter(**query).first() - created = False - - if obj: - # 可更新其他字段 - obj.example = example - obj.eng_explanation = eng_explanation - await obj.save() - else: - obj = await cls.create( - word=word_obj, - target_language=target_language_obj, - part_of_speech=part_of_speech, - meaning=meaning, - example=example, - eng_explanation=eng_explanation, - ) - created = True - - return obj, created diff --git a/app/models/jp.py b/app/models/jp.py index 671c925..a43b398 100644 --- a/app/models/jp.py +++ b/app/models/jp.py @@ -16,8 +16,8 @@ sheet_name_jp = "日汉释义" class WordlistJp(Model): id = fields.IntField(pk=True) text = fields.CharField(max_length=40, description="单词") - definitions = fields.ReverseRelation("DefinitionJp") - attachments = fields.ReverseRelation("AttachmentsJp") + definitions : fields.ReverseRelation["DefinitionJp"] + attachments : fields.ReverseRelation["AttachmentJp"] class Meta: table = "wordlist_jp" diff --git a/app/models/signals.py b/app/models/signals.py new file mode 100644 index 0000000..75c113c --- /dev/null +++ b/app/models/signals.py @@ -0,0 +1,41 @@ +from tortoise.signals import pre_save +from tortoise import BaseDBAsyncClient +from typing import Optional + +from app.utils.textnorm import normalize_text +from app.models.fr import WordlistFr + + +@pre_save(WordlistFr) +async def wordlist_fr_pre_save( + sender: type[WordlistFr], + instance: WordlistFr, + using_db: BaseDBAsyncClient, + update_fields: Optional[list[str]] +) -> None: + """ + 仅当 text 变更时,同步 search_text。 + - 新建:总是写入 search_text + - 修改:只有当 text 在本次更新范围内,或 text 实际发生变化时才更新 + - 若调用方用了 update_fields,只包含 text,则自动把 'search_text' 追加进去,确保写回 + """ + desired = normalize_text(instance.text or "") + # 不变则不写,减少无谓 UPDATE + if instance.search_text == desired: + return + + # 情况 1:完整更新(没有传 update_fields) + if update_fields is None: + instance.search_text = desired + return # ✅ 会写入 + + # 情况 2:部分更新——只有当这次确实更新了 text,才同步 search_text + if "text" in update_fields: + instance.search_text = desired + # update_fields 可能是 tuple,转成 list 再补充 + fields = list(update_fields) + if "search_text" not in fields: + fields.append("search_text") + # 交还给 ORM:确保此次 UPDATE 包含 search_text + instance._update_fields = fields + # 否则(这次没更 text),不动 search_text diff --git a/app/schemas/admin_schemas.py b/app/schemas/admin_schemas.py index e8c5dfd..18c911e 100644 --- a/app/schemas/admin_schemas.py +++ b/app/schemas/admin_schemas.py @@ -3,10 +3,6 @@ from enum import Enum from pydantic import BaseModel, validator, field_validator, Field from typing import Optional, Literal, List -from tortoise.exceptions import DoesNotExist - -from app.models.fr import WordlistFr - class PosEnumFr(str, Enum): # noun @@ -21,6 +17,10 @@ class PosEnumFr(str, Enum): v_i = "v.i." v_pr = "v.pr." v_t_i = "v.t./v.i." + v_t_dir = "v.t.dir." + v_t_ind = "v.t.ind." + v_t_pr = "v.t.(v.pr.)" + v_i_ind = "v.t.ind./v.i." adj = "adj." # adj adv = "adv." # adv @@ -29,6 +29,8 @@ class PosEnumFr(str, Enum): conj = "conj." interj = "interj." chauff = "chauff" + art = "art." + class PosEnumJp(str, Enum): @@ -55,18 +57,18 @@ class CreateWord(BaseModel): @classmethod @field_validator("eng_explanation") def validate_eng_explanation(cls, v): - if cls.language is "jp" and v: + if cls.language == "jp" and v: raise ValueError("Japanese word has no English explanation") - if cls.language is "fr" and v is None or v == "": + if cls.language == "fr" and v is None or v == "": raise ValueError("French word must have English explanation") return v @classmethod @field_validator("pos") def validate_pos(cls, v): - if cls.language is "fr" and v not in PosEnumFr: + if cls.language == "fr" and v not in PosEnumFr: raise ValueError("Pos is not a valid type") - if cls.language is "jp" and v not in PosEnumJp: + if cls.language == "jp" and v not in PosEnumJp: raise ValueError("Pos is not a valid type") return v diff --git a/app/utils/textnorm.py b/app/utils/textnorm.py new file mode 100644 index 0000000..92cbe4e --- /dev/null +++ b/app/utils/textnorm.py @@ -0,0 +1,23 @@ +import re +import unicodedata + + +def normalize_text(s: str) -> str: + """ + 规范化字符串,用于搜索/存储 search_text + - Unicode 标准化 + - 去除重音符号(é -> e) + - 转小写 + - 去掉前后空格,多空格合并 + """ + if not s: + return "" + # 1. Unicode 标准化(NFKD 拆分) + s = unicodedata.normalize("NFKD", s) + # 2. 去掉音标/重音符 + s = "".join(ch for ch in s if not unicodedata.combining(ch)) + # 3. 转小写 + s = s.lower() + # 4. 去掉首尾空格 & 合并多个空格 + s = re.sub(r"\s+", " ", s.strip()) + return s diff --git a/main.py b/main.py index 8b6f502..4eccc35 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ from settings import TORTOISE_ORM from app.api.users import users_router from app.api.admin.router import admin_router from app.core.redis import init_redis_pool +import app.models.signals @asynccontextmanager diff --git a/scripts/DictTable_20250811.xlsx b/scripts/DictTable_20250811.xlsx new file mode 100644 index 0000000..03db959 Binary files /dev/null and b/scripts/DictTable_20250811.xlsx differ diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/backfill_search_text.py b/scripts/backfill_search_text.py new file mode 100644 index 0000000..f93ce6e --- /dev/null +++ b/scripts/backfill_search_text.py @@ -0,0 +1,17 @@ +import asyncio +from tortoise import Tortoise, run_async +from app.models.fr import WordlistFr +from app.utils.textnorm import normalize_text +from settings import TORTOISE_ORM + +async def main(): + await Tortoise.init(config=TORTOISE_ORM) + async for w in WordlistFr.all().only("id", "text", "search_text"): # type: WordlistFr + want = normalize_text(w.text) + if w.search_text != want: + w.search_text = want + await w.save(update_fields=["search_text"]) + await Tortoise.close_connections() + +if __name__ == "__main__": + run_async(main()) \ No newline at end of file diff --git a/scripts/update_fr.py b/scripts/update_fr.py new file mode 100644 index 0000000..9389c37 --- /dev/null +++ b/scripts/update_fr.py @@ -0,0 +1,103 @@ +import asyncio +from pathlib import Path + +import pandas as pd +from tortoise import Tortoise +from tortoise.exceptions import MultipleObjectsReturned + +from app.models.fr import DefinitionFr, WordlistFr +from settings import TORTOISE_ORM +import app.models.signals + +xlsx_name = "./DictTable_20250811.xlsx" +xlsx_path = Path(xlsx_name) + + +def pos_process(pos: str) -> str: + pos = pos.replace(" ", "") + pos = pos.replace(",", "") + if not pos.endswith(".") and not pos.endswith(")") and pos != "chauff": + pos = pos + "." + return pos + + +async def import_wordlist_fr(path: Path = xlsx_path, sheet_name: str = "法英中释义"): + df = pd.read_excel(path, sheet_name=sheet_name) + df.columns = [col.strip() for col in df.columns] + + for row in df.itertuples(): + word = str(row.单词).strip() + if pd.isna(word): + break + + word_obj, created = await WordlistFr.get_or_create(text=word, defaults={"freq": 0}) + if created: + print(f"✅ 新增词条: {word}") + else: + print(f"⚠️ 已存在: {word},跳过") + + +async def import_def_fr( + path: Path = xlsx_path, + sheet_name: str = "法英中释义" +): + df = pd.read_excel(path, sheet_name=sheet_name) + df.columns = [col.strip() for col in df.columns] + + for row in df.itertuples(): + word = row.单词 + if pd.isna(word): + continue + + word = str(word).strip() + + # 查找 WordlistFr 实例(注意异常处理) + try: + cls_word = await WordlistFr.get(text=word) + except MultipleObjectsReturned: + ids = await WordlistFr.filter(text=word).values_list("id", flat=True) + print(f"❗ 重复单词 {word},id为: {' '.join(str(i) for i in ids)}") + continue + except Exception as e: + print(f"❌ 查找单词 {word} 出错: {e}") + continue + + # 字段处理 + example = None if pd.isna(row.法语例句1) else str(row.法语例句1).strip() + pos = None if pd.isna(row.词性1) else pos_process(str(row.词性1).strip()) + eng_exp = None if pd.isna(row.英语释义1) else str(row.英语释义1).strip() + chi_exp = str(row[2]).strip() + + # 去重:同一个词条不能有重复释义(同 pos + meaning) + exists = await DefinitionFr.filter( + word=cls_word, + pos=pos, + meaning=chi_exp + ).exists() + if exists: + print(f"⚠️ 已存在释义,跳过:{word} - {pos} - {chi_exp[:10]}...") + continue + + # 创建定义 + try: + await DefinitionFr.create( + word=cls_word, + pos=pos, + eng_explanation=eng_exp, + meaning=chi_exp, + example=example, + ) + print(f"✅ 导入释义:{word} - {pos}") + except Exception as e: + print(f"❌ 插入释义失败:{word} - {pos},错误: {e}") + + +async def main(): + await Tortoise.init(config=TORTOISE_ORM) + await DefinitionFr.all().delete() + await import_def_fr() + # await import_wordlist_fr() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/settings.py b/settings.py index 2981ee6..999028d 100644 --- a/settings.py +++ b/settings.py @@ -2,21 +2,8 @@ from pydantic.v1 import BaseSettings TORTOISE_ORM = { 'connections': { - 'default': { - # 'engine': 'tortoise.backends.asyncpg', PostgreSQL - 'engine': 'tortoise.backends.mysql', # MySQL or Mariadb - 'credentials': { - 'host': '127.0.0.1', - 'port': '3306', - 'user': 'root', - 'password': 'enterprise', - 'database': 'dict', - 'minsize': 1, - 'maxsize': 5, - 'charset': 'utf8mb4', - "echo": True - } - }, + "default": "mysql://local_admin:enterprise@127.0.0.1:3306/dict", + "production": "mysql://local_admin:enterprise@127.0.0.1:3306/prod_db", }, 'apps': { 'models': { @@ -34,8 +21,10 @@ TORTOISE_ORM = { 'timezone': 'Asia/Shanghai' } + class Settings(BaseSettings): USE_OAUTH = False SECRET_KEY = "asdasdasd-odjfnsodfnosidnfdf-0oq2j01j0jf0i1ej0fij10fd" + settings = Settings()