diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
index 91f0ed8..cb4c62b 100644
--- a/.idea/dataSources.xml
+++ b/.idea/dataSources.xml
@@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
-    <data-source source="LOCAL" name="…" uuid="…">
+    <data-source source="LOCAL" name="…" uuid="…">
       <driver-ref>mysql.8</driver-ref>
       <synchronize>true</synchronize>
       <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
-      <jdbc-url>jdbc:mysql://localhost:3306/dict</jdbc-url>
+      <jdbc-url>jdbc:mysql://127.0.0.1:3306/dict</jdbc-url>
       <working-dir>$ProjectFileDir$</working-dir>
     </data-source>
   </component>
diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml
index 74756e4..3d0c60c 100644
--- a/.idea/sqldialects.xml
+++ b/.idea/sqldialects.xml
@@ -3,6 +3,7 @@
+  <file url="…" dialect="…" />
\ No newline at end of file
diff --git a/app/api/users.py b/app/api/users.py
index c99f7e8..482f7ef 100644
--- a/app/api/users.py
+++ b/app/api/users.py
@@ -33,8 +33,15 @@ async def register(user_in: UserIn):
     }
 
 
-@users_router.put("/update")
+@users_router.put("/update", deprecated=False)
 async def user_modification(updated_user: UpdateUserRequest, current_user: User = Depends(get_current_user)):
+    """
+    Update the current user's profile.
+
+    :param updated_user: Pydantic model validating the update payload; only the fields present in the JSON body are changed
+    :param current_user: the authenticated user, injected by get_current_user
+    :return:
+    """
     reserved_words = await ReservedWords.filter(category="username").values_list("reserved", flat=True)
     # Verify the current password
     if not await verify_password(updated_user.current_password, current_user.password_hash):
@@ -50,7 +57,6 @@ async def user_modification(updated_user: UpdateUserRequest, current_user: User
     if updated_user.new_password:
         current_user.password_hash = hash_password(updated_user.new_password)
 
-
 @users_router.post("/login")
 async def user_login(user_in: UserLoginRequest):
     user = await User.get_or_none(name=user_in.name)
diff --git a/app/models/__init__.py b/app/models/__init__.py
index 8c7ac3d..5fa0459 100644
--- a/app/models/__init__.py
+++ b/app/models/__init__.py
@@ -1,4 +1,4 @@
 from . import signals
 from .fr import WordlistFr, DefinitionFr, AttachmentFr
-from .jp import WordlistJp, DefinitionJp, AttachmentJp
+from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType
 from .base import User
\ No newline at end of file
diff --git a/app/models/jp.py b/app/models/jp.py
index a43b398..b3c27a2 100644
--- a/app/models/jp.py
+++ b/app/models/jp.py
@@ -67,7 +67,14 @@ class DefinitionJp(Model):
     word = fields.ForeignKeyField("models.WordlistJp", related_name="definitions", on_delete=fields.CASCADE)
     meaning = fields.TextField(description="单词释义")
     example = fields.TextField(null=True, description="单词例句")
-    pos = fields.CharEnumField(PosEnumJp, max_length=30, null=True)
+    pos = fields.ManyToManyField("models.PosType", related_name="definitions", on_delete=fields.CASCADE)
 
     class Meta:
         table = "definitions_jp"
+
+
+class PosType(Model):
+    id = fields.IntField(pk=True)
+    pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False)
+
+    class Meta:
+        table = "pos_type"
diff --git a/app/schemas/admin_schemas.py b/app/schemas/admin_schemas.py
index 18c911e..19bd93a 100644
--- a/app/schemas/admin_schemas.py
+++ b/app/schemas/admin_schemas.py
@@ -37,10 +37,22 @@ class PosEnumJp(str, Enum):
     noun = "名词"
     adj = "形容词"
     adj_v = "形容动词"
+    adv = "连用"
     v1 = "一段动词"
     v5 = "五段动词"
     help = "助词"
-
+    self = "自动词"
+    other = "他动词"
+    tail = "接尾"
+    self_other = "自他动词"
+    follow = "接续"
+    habit = "惯用"
+    excl = "感叹词"
+    ka_v = "カ変"
+    sa_v = "サ変"
+    conn = "连体"
+    quantity = "量词"
+    pron = "代词"
 
 class CreateWord(BaseModel):
     word: str
diff --git a/app/schemas/user_schemas.py b/app/schemas/user_schemas.py
index 93bae23..6d4e070 100644
--- a/app/schemas/user_schemas.py
+++ b/app/schemas/user_schemas.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import Literal
+from typing import Literal, Optional
 
 default_portrait_url = '#'
 
@@ -46,9 +46,9 @@ class UserOut(BaseModel):
 
 
 class UpdateUserRequest(BaseModel):
-    current_password: str
-    new_username: str
-    new_password: str
+    current_password: Optional[str] = None
+    new_username: Optional[str] = None
+    new_password: Optional[str] = None
     new_language: Literal["jp", "fr", "private"] = "private"
     # lang_pref: str = "jp"
 
diff --git a/requirements.txt b/requirements.txt
index ee34e26..de7e289 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+fugashi
+pykakasi
 aerich==0.9.1
 aiosqlite==0.21.0
 annotated-types==0.7.0
diff --git a/scripts/update_jp.py b/scripts/update_jp.py
new file mode 100644
index 0000000..2dbb7f1
--- /dev/null
+++ b/scripts/update_jp.py
@@ -0,0 +1,223 @@
+import asyncio
+import re
+import unicodedata
+from pathlib import Path
+
+import jaconv
+import pandas as pd
+from fugashi import Tagger
+from pykakasi import kakasi
+from tortoise import Tortoise
+from tortoise.exceptions import MultipleObjectsReturned
+
+from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
+from settings import TORTOISE_ORM
+
+xlsx_name = "./DictTable-20250823.xlsx"
+xlsx_path = Path(xlsx_name)
+
+
+def normalize_jp_text(text: str) -> str:
+    # Unicode normalization (unify full-width and half-width forms)
+    text = unicodedata.normalize("NFKC", text)
+    # Strip common invisible characters (zero-width spaces, newlines, tabs, etc.)
+    text = re.sub(r'[\u200b-\u200f\u202a-\u202e\r\n\t]', '', text)
+    return text.strip()
+
+
+async def pos_process(pos: str):
+    # Map abbreviated labels to the canonical enum values
+    mapping = {
+        "形": "形容词",
+        "形动": "形容动词",
+        "感叹": "感叹词",
+        "连体词": "连体",
+        "副词": "连用",
+        "自动": "自动词",
+        "他动": "他动词",
+        "自他动": "自他动词",
+        "五段": "五段动词",
+        "一段": "一段动词",
+        "接续词": "接续",
+        "カ变动词": "カ変",
+        "サ变动词": "サ変",
+    }
+
+    # Strip whitespace and apply the mapping
+    pos_list = [mapping.get(p.strip(), p.strip()) for p in pos.split("・") if p.strip()]
+    if "动词" in pos_list:
+        return None, True
+
+    # Look up the matching PosType rows (deduplicated via set)
+    pos_type_objs = await PosType.filter(pos_type__in=list(set(pos_list)))
+    return pos_type_objs, False
+
+
+# Initialize the tokenizer
+tagger = Tagger()
+
+# Initialize the kakasi converter (legacy v1 API, deprecated but still available in pykakasi 2.x)
+kakasi_inst = kakasi()
+kakasi_inst.setMode("H", "a")  # hiragana -> romaji
+kakasi_inst.setMode("K", "a")  # katakana -> romaji
+kakasi_inst.setMode("J", "a")  # kanji -> kana reading -> romaji
+kakasi_inst.setMode("r", "Hepburn")  # use Hepburn romanization rules
+converter = kakasi_inst.getConverter()
+
+
+def is_kana_only(text: str) -> bool:
+    """
+    Return True if the text is pure kana (contains no kanji).
+    """
+    for ch in text:
+        if not ('\u3040' <= ch <= '\u309F' or '\u30A0' <= ch <= '\u30FF'):
+            return False
+    return True
+
+
+def to_kana(word: str) -> str:
+    # Already pure kana: return as-is
+    if is_kana_only(word):
+        return word
+
+    # Otherwise tokenize with fugashi and join the readings
+    tokens = tagger(word)
+    kana_list = []
+    for token in tokens:
+        # token.feature.kana is the kana reading field of the UniDic dictionary;
+        # the feature object is a named tuple, so use getattr with a fallback to the surface form
+        kana = getattr(token.feature, "kana", None) or token.surface
+        kana_list.append(kana)
+
+    return ''.join(kana_list)
+
+
+def kana_to_romaji(text: str) -> str:
+    """
+    Convert Japanese text to romaji (kana kept as-is; kanji readings inferred via UniDic).
+    """
+    # Parse with fugashi and collect each token's kana reading
+    kana_seq = []
+    for word in tagger(text):
+        # feature[7] is assumed to hold the reading in this UniDic build;
+        # fall back to the surface form when it is missing
+        kana = word.feature[7] if len(word.feature) > 7 and word.feature[7] else word.surface
+        kana_seq.append(kana)
+
+    joined_kana = ''.join(kana_seq)
+    romaji = converter.do(joined_kana)
+    return romaji
+
+
+async def import_wordlist_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
+    df = pd.read_excel(path, sheet_name=sheet_name)
+    df.columns = [col.strip() for col in df.columns]
+
+    for row in df.itertuples():
+        # Check for NaN before the str() cast, otherwise NaN becomes the string "nan"
+        if pd.isna(row.单词):
+            continue
+        word = normalize_jp_text(str(row.单词))
+
+        word_obj, created = await WordlistJp.get_or_create(text=word, defaults={"freq": 0})
+        if created:
+            print(f"✅ New entry: {word}")
+        else:
print(f"⚠️ 已存在: {word},跳过") + else: + pass + + +async def import_def_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"): + df = pd.read_excel(path, sheet_name=sheet_name) + df.columns = [col.strip() for col in df.columns] + + for row in df.itertuples(): + word = normalize_jp_text(str(row.单词)) + if pd.isna(word): + continue + + word = str(word).strip() + + try: + cls_word = await WordlistJp.get(text=word) + except MultipleObjectsReturned: + ids = await WordlistJp.filter(text=word).values_list("id", flat=True) + print(f"❗ 重复单词 {word},id为: {' '.join(str(i) for i in ids)}") + continue + except Exception as e: + print(f"❌ 查找单词 {word} 出错: {e}") + continue + + # 字段处理 + example = None if pd.isna(row.日语例句1) else normalize_jp_text(str(row.日语例句1)) + if not pd.isna(row.词性): + pos_obj, jump = await pos_process(str(row.词性)) + if jump: + continue + else: + print(f"❌ {word} 的词性为空,跳过") + continue + chi_exp = str(row[4]).strip() + + exists = await DefinitionJp.filter( + word=cls_word, + meaning=chi_exp, + ).exists() + if exists: + print(f"⚠️ 已存在释义,跳过:{word} - {chi_exp[:10]}...") + continue + + try: + new_item = await DefinitionJp.create( + word=cls_word, + meaning=chi_exp, + example=example, + ) + await new_item.pos.add(*pos_obj) + print(f"✅ 导入释义:{word}") + except Exception as e: + print(f"❌ 插入释义失败:{word},错误: {e}") + + +async def import_attachment(path: Path = xlsx_path, sheet_name: str = "日汉释义"): + df = pd.read_excel(path, sheet_name=sheet_name) + df.columns = [col.strip() for col in df.columns] + + # 统一清洗后去重词汇列表 + unique_words = df["单词"].dropna().map(lambda x: normalize_jp_text(str(x))).unique().tolist() + + # 批量获取所有 WordlistJp 实例 + word_objs = await WordlistJp.filter(text__in=unique_words) + word_map = {normalize_jp_text(w.text): w for w in word_objs} + + for row in df.itertuples(): + word = normalize_jp_text(str(row.单词)) + if pd.isna(word): + continue + word_obj = word_map.get(word) + if not word_obj: + print(f"❌ 未找到词条:{word},跳过") + print(f"[DEBUG] 原始: {repr(row.单词)} → 标准化后: {normalize_jp_text(str(row.单词))}") + print(f"编码: {[hex(ord(c)) for c in str(row.单词)]}") + continue + + hiragana = normalize_jp_text(jaconv.kata2hira(str(row[1]))) if pd.isna(row[2]) else normalize_jp_text(str(row[2])) + romaji = kana_to_romaji(word) + + await AttachmentJp.get_or_create( + word=word_obj, + hiragana=hiragana, + romaji=romaji, + ) + + +async def main(): + await Tortoise.init(config=TORTOISE_ORM) + # await DefinitionJp.all().delete() # TRUNCATE TABLE definitions_fr; + # await WordlistJp.all().delete() + # await AttachmentJp.all().delete() + # await import_wordlist_jp() + # await import_def_jp() + await import_attachment() + + +if __name__ == '__main__': + asyncio.run(main())