Adjust the table schema to fit the Japanese dictionary structure
requirements.txt:
Add the newly required dependency packages
admin_schemas.py:
Adjust the Japanese part-of-speech Enum
This commit is contained in:
Miyamizu-MitsuhaSang 2025-08-27 16:41:37 +08:00
parent b4f3ba6c6a
commit 5b2e7890f8
9 changed files with 261 additions and 11 deletions
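The central schema change in this commit replaces the single pos enum column on definitions_jp with a many-to-many link to the new pos_type table (see the models and scripts/update_jp.py hunks below). A minimal read-side sketch of that relation, assuming only the Tortoise ORM models shown in this diff; the helper name is illustrative and not part of the commit:

from app.models import DefinitionJp

async def list_definition_pos():
    # prefetch the m2m relation so definition.pos is populated without per-row queries
    definitions = await DefinitionJp.all().prefetch_related("pos")
    for definition in definitions:
        labels = [p.pos_type for p in definition.pos]
        print(definition.meaning, labels)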

View File

@@ -1,11 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="dict@localhost" uuid="002414ae-d165-4dd5-b083-7d2a09fa7184">
<data-source source="LOCAL" name="dict@127.0.0.1" uuid="002414ae-d165-4dd5-b083-7d2a09fa7184">
<driver-ref>mysql.8</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
<jdbc-url>jdbc:mysql://localhost:3306/dict</jdbc-url>
<jdbc-url>jdbc:mysql://127.0.0.1:3306/dict</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
</data-source>
<data-source source="LOCAL" name="test_db@124.221.145.135" uuid="a23b927c-ada0-4708-ba6e-4db2b9dab6c1">

View File

@@ -3,6 +3,7 @@
<component name="SqlDialectMappings">
<file url="file://$PROJECT_DIR$/migrations/models/11_20250803092810_update.py" dialect="MySQL" />
<file url="file://$PROJECT_DIR$/migrations/models/16_20250806104900_update.py" dialect="GenericSQL" />
<file url="file://$PROJECT_DIR$/migrations/models/21_20250827100315_update.py" dialect="MySQL" />
<file url="file://$PROJECT_DIR$/scripts/update_fr.py" dialect="MySQL" />
</component>
</project>

View File

@@ -33,8 +33,14 @@ async def register(user_in: UserIn):
}
@users_router.put("/update")
@users_router.put("/update", deprecated=False)
async def user_modification(updated_user: UpdateUserRequest, current_user: User = Depends(get_current_user)):
"""
:param updated_user: update payload validated by the Pydantic model; the corresponding fields are modified according to the JSON body
:param current_user:
:return:
"""
reserved_words = await ReservedWords.filter(category="username").values_list("reserved", flat=True)
    # verify the current password
if not await verify_password(updated_user.current_password, current_user.password_hash):
@@ -50,7 +56,6 @@ async def user_modification(updated_user: UpdateUserRequest, current_user: User
if updated_user.new_password:
current_user.password_hash = hash_password(updated_user.new_password)
@users_router.post("/login")
async def user_login(user_in: UserLoginRequest):
user = await User.get_or_none(name=user_in.name)

View File

@@ -1,4 +1,4 @@
from . import signals
from .fr import WordlistFr, DefinitionFr, AttachmentFr
from .jp import WordlistJp, DefinitionJp, AttachmentJp
from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType
from .base import User

View File

@@ -67,7 +67,14 @@ class DefinitionJp(Model):
word = fields.ForeignKeyField("models.WordlistJp", related_name="definitions", on_delete=fields.CASCADE)
meaning = fields.TextField(description="单词释义")
example = fields.TextField(null=True, description="单词例句")
pos = fields.CharEnumField(PosEnumJp, max_length=30, null=True)
pos = fields.ManyToManyField("models.PosType", related_name="definitions", on_delete=fields.CASCADE)
class Meta:
table = "definitions_jp"
class PosType(Model):
id = fields.IntField(pk=True)
pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False)
class Meta:
table = "pos_type"

View File

@@ -37,10 +37,22 @@ class PosEnumJp(str, Enum):
noun = "名词"
adj = "形容词"
adj_v = "形容动词"
adv = "连用"
v1 = "一段动词"
v5 = "五段动词"
help = "助词"
self = "自动词"
other = "他动词"
tail = "接尾"
self_other = "自他动词"
follow = "接续"
habit = "惯用"
excl = "感叹词"
ka_v = "カ変"
sa_v = "サ変"
conn = "连体"
quantity = "量词"
pron = "代词"
class CreateWord(BaseModel):
word: str

View File

@@ -1,5 +1,5 @@
from pydantic import BaseModel
from typing import Literal
from typing import Literal, Optional
default_portrait_url = '#'
@@ -46,9 +46,9 @@ class UserOut(BaseModel):
class UpdateUserRequest(BaseModel):
current_password: str
new_username: str
new_password: str
current_password: Optional[str] = None
new_username: Optional[str] = None
new_password: Optional[str] = None
new_language: Literal["jp", "fr", "private"] = "private"
# lang_pref: str = "jp"

View File

@@ -1,3 +1,5 @@
fugashi
pykakasi
aerich==0.9.1
aiosqlite==0.21.0
annotated-types==0.7.0

scripts/update_jp.py (new file, 223 lines)
View File

@@ -0,0 +1,223 @@
import asyncio
import re
import unicodedata
import jaconv
from pathlib import Path
import pandas as pd
from fugashi import Tagger
from pykakasi import kakasi
from tortoise import Tortoise
from tortoise.exceptions import MultipleObjectsReturned
from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
from settings import TORTOISE_ORM
xlsx_name = "./DictTable-20250823.xlsx"
xlsx_path = Path(xlsx_name)
def normalize_jp_text(text: str) -> str:
    # Unicode NFKC normalization: unify full-width and half-width characters
    text = unicodedata.normalize("NFKC", text)
    # drop common invisible characters (zero-width spaces, bidi controls, newlines, tabs); surrounding spaces are stripped below
text = re.sub(r'[\u200b-\u200f\u202a-\u202e\r\n\t]', '', text)
return text.strip()
async def pos_process(pos: str):
    # map abbreviated POS labels from the sheet to the canonical enum values
    mapping = {
        "形": "形容词",
        "形动": "形容动词",
        "感叹": "感叹词",
        "连体词": "连体",
        "副词": "连用",
        "自动": "自动词",
        "他动": "他动词",
        "自他动": "自他动词",
        "五段": "五段动词",
        "一段": "一段动词",
        "接续词": "接续",
        "カ变动词": "カ変",
        "サ变动词": "サ変",
    }
    # split on "、" (assumed delimiter in the sheet), strip whitespace, apply the mapping
    pos_list = [mapping.get(p.strip(), p.strip()) for p in pos.split("、") if p.strip()]
    # a bare "动词" (generic verb) label tells the caller to skip the row
    if "动词" in pos_list:
        return None, True
    # look up the matching PosType rows (deduplicated via set)
    pos_type_objs = await PosType.filter(pos_type__in=list(set(pos_list)))
    return pos_type_objs, False
# initialize the fugashi tokenizer
tagger = Tagger()
# initialize the kakasi converter (legacy API)
kakasi_inst = kakasi()
kakasi_inst.setMode("H", "a")  # hiragana -> romaji
kakasi_inst.setMode("K", "a")  # katakana -> romaji
kakasi_inst.setMode("J", "a")  # kanji -> (on-reading) kana -> romaji
kakasi_inst.setMode("r", "Hepburn")  # use the Hepburn romanization scheme
converter = kakasi_inst.getConverter()
def is_kana_only(text: str) -> bool:
    """
    Return True if the text consists only of kana (no kanji).
    """
    for ch in text:
        if not ('\u3040' <= ch <= '\u309F' or '\u30A0' <= ch <= '\u30FF'):
            return False
    return True
def to_kana(word: str) -> str:
    # if the word is already pure kana, return it unchanged
    if is_kana_only(word):
        return word
    # otherwise tokenize with fugashi and join the kana readings
    tokens = tagger(word)
    kana_list = []
    for token in tokens:
        # feature.kana is the kana reading field in the UniDic dictionary;
        # fall back to the surface form when no reading is available
        kana = getattr(token.feature, "kana", None) or token.surface
        kana_list.append(kana)
    return ''.join(kana_list)
def kana_to_romaji(text: str) -> str:
    """
    Convert Japanese text to romaji (kana is used directly; kanji readings are guessed from UniDic).
    """
    # parse with fugashi and collect the kana reading of each token
    kana_seq = []
    for word in tagger(text):
        kana = word.feature[7] if len(word.feature) > 7 and word.feature[7] else word.surface
        kana_seq.append(kana)
    joined_kana = ''.join(kana_seq)
    romaji = converter.do(joined_kana)
    return romaji
async def import_wordlist_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
df = pd.read_excel(path, sheet_name=sheet_name)
df.columns = [col.strip() for col in df.columns]
for row in df.itertuples():
        # check the raw cell before str(): str(NaN) would never be detected as missing
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
word_obj, created = await WordlistJp.get_or_create(text=word, defaults={"freq": 0})
if created and word == 'また':
print(f"✅ 新增词条: {word}")
elif not created:
print(f"⚠️ 已存在: {word},跳过")
else:
pass
async def import_def_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
df = pd.read_excel(path, sheet_name=sheet_name)
df.columns = [col.strip() for col in df.columns]
for row in df.itertuples():
        # check the raw cell before str(): str(NaN) would never be detected as missing
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
try:
cls_word = await WordlistJp.get(text=word)
except MultipleObjectsReturned:
ids = await WordlistJp.filter(text=word).values_list("id", flat=True)
print(f"❗ 重复单词 {word}id为: {' '.join(str(i) for i in ids)}")
continue
except Exception as e:
print(f"❌ 查找单词 {word} 出错: {e}")
continue
        # per-row field processing
example = None if pd.isna(row.日语例句1) else normalize_jp_text(str(row.日语例句1))
if not pd.isna(row.词性):
pos_obj, jump = await pos_process(str(row.词性))
if jump:
continue
else:
print(f"{word} 的词性为空,跳过")
continue
chi_exp = str(row[4]).strip()
exists = await DefinitionJp.filter(
word=cls_word,
meaning=chi_exp,
).exists()
if exists:
print(f"⚠️ 已存在释义,跳过:{word} - {chi_exp[:10]}...")
continue
try:
new_item = await DefinitionJp.create(
word=cls_word,
meaning=chi_exp,
example=example,
)
await new_item.pos.add(*pos_obj)
print(f"✅ 导入释义:{word}")
except Exception as e:
print(f"❌ 插入释义失败:{word},错误: {e}")
async def import_attachment(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
df = pd.read_excel(path, sheet_name=sheet_name)
df.columns = [col.strip() for col in df.columns]
    # normalized and de-duplicated list of words from the sheet
    unique_words = df["单词"].dropna().map(lambda x: normalize_jp_text(str(x))).unique().tolist()
    # fetch all matching WordlistJp rows in a single query
word_objs = await WordlistJp.filter(text__in=unique_words)
word_map = {normalize_jp_text(w.text): w for w in word_objs}
for row in df.itertuples():
        # check the raw cell before str(): str(NaN) would never be detected as missing
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
word_obj = word_map.get(word)
if not word_obj:
print(f"❌ 未找到词条:{word},跳过")
print(f"[DEBUG] 原始: {repr(row.单词)} → 标准化后: {normalize_jp_text(str(row.单词))}")
print(f"编码: {[hex(ord(c)) for c in str(row.单词)]}")
continue
hiragana = normalize_jp_text(jaconv.kata2hira(str(row[1]))) if pd.isna(row[2]) else normalize_jp_text(str(row[2]))
romaji = kana_to_romaji(word)
await AttachmentJp.get_or_create(
word=word_obj,
hiragana=hiragana,
romaji=romaji,
)
async def main():
await Tortoise.init(config=TORTOISE_ORM)
    # await DefinitionJp.all().delete()  # TRUNCATE TABLE definitions_jp;
# await WordlistJp.all().delete()
# await AttachmentJp.all().delete()
# await import_wordlist_jp()
# await import_def_jp()
await import_attachment()
if __name__ == '__main__':
asyncio.run(main())
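Since pos_process() only filters existing PosType rows and never creates them, the pos_type table has to be seeded before import_def_jp() is run. A minimal seeding sketch under that assumption (not part of this commit; the import path of PosEnumJp is assumed and may need adjusting):

import asyncio
from tortoise import Tortoise
from app.models import PosType
from app.schemas.admin_schemas import PosEnumJp  # assumed path for admin_schemas.py
from settings import TORTOISE_ORM

async def seed_pos_types():
    await Tortoise.init(config=TORTOISE_ORM)
    # create one pos_type row per enum member, skipping rows that already exist
    for member in PosEnumJp:
        await PosType.get_or_create(pos_type=member)
    await Tortoise.close_connections()

if __name__ == "__main__":
    asyncio.run(seed_pos_types())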