jp.py: adjust the table structure to fit the Japanese dictionary schema
requirements.txt: add new dependency packages
admin_schemas.py: adjust the Japanese part-of-speech Enum
parent b4f3ba6c6a
commit 5b2e7890f8
@@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
-    <data-source source="LOCAL" name="dict@localhost" uuid="002414ae-d165-4dd5-b083-7d2a09fa7184">
+    <data-source source="LOCAL" name="dict@127.0.0.1" uuid="002414ae-d165-4dd5-b083-7d2a09fa7184">
       <driver-ref>mysql.8</driver-ref>
       <synchronize>true</synchronize>
       <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
-      <jdbc-url>jdbc:mysql://localhost:3306/dict</jdbc-url>
+      <jdbc-url>jdbc:mysql://127.0.0.1:3306/dict</jdbc-url>
       <working-dir>$ProjectFileDir$</working-dir>
     </data-source>
     <data-source source="LOCAL" name="test_db@124.221.145.135" uuid="a23b927c-ada0-4708-ba6e-4db2b9dab6c1">
@@ -3,6 +3,7 @@
   <component name="SqlDialectMappings">
     <file url="file://$PROJECT_DIR$/migrations/models/11_20250803092810_update.py" dialect="MySQL" />
     <file url="file://$PROJECT_DIR$/migrations/models/16_20250806104900_update.py" dialect="GenericSQL" />
+    <file url="file://$PROJECT_DIR$/migrations/models/21_20250827100315_update.py" dialect="MySQL" />
     <file url="file://$PROJECT_DIR$/scripts/update_fr.py" dialect="MySQL" />
   </component>
 </project>
@@ -33,8 +33,14 @@ async def register(user_in: UserIn):
     }
 
 
-@users_router.put("/update")
+@users_router.put("/update", deprecated=False)
 async def user_modification(updated_user: UpdateUserRequest, current_user: User = Depends(get_current_user)):
+    """
+
+    :param updated_user: Pydantic model that validates the update payload (only the fields present in the JSON body are changed)
+    :param current_user:
+    :return:
+    """
     reserved_words = await ReservedWords.filter(category="username").values_list("reserved", flat=True)
     # verify the current password
     if not await verify_password(updated_user.current_password, current_user.password_hash):
@@ -50,7 +56,6 @@ async def user_modification(updated_user: UpdateUserRequest, current_user: User
     if updated_user.new_password:
         current_user.password_hash = hash_password(updated_user.new_password)
-
 
 @users_router.post("/login")
 async def user_login(user_in: UserLoginRequest):
     user = await User.get_or_none(name=user_in.name)
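A minimal request sketch for the endpoint above (illustrative only: the "/users" prefix, the application import path, and the auth header are assumptions, not taken from this commit):

from fastapi.testclient import TestClient

from app.main import app  # assumed application entry point

client = TestClient(app)
resp = client.put(
    "/users/update",  # assumed router prefix
    json={"current_password": "old-secret", "new_language": "jp"},  # send only the fields being changed
    headers={"Authorization": "Bearer <token>"},  # auth scheme assumed from Depends(get_current_user)
)
print(resp.status_code, resp.json())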
@@ -1,4 +1,4 @@
 from . import signals
 from .fr import WordlistFr, DefinitionFr, AttachmentFr
-from .jp import WordlistJp, DefinitionJp, AttachmentJp
+from .jp import WordlistJp, DefinitionJp, AttachmentJp, PosType
 from .base import User
@@ -67,7 +67,14 @@ class DefinitionJp(Model):
     word = fields.ForeignKeyField("models.WordlistJp", related_name="definitions", on_delete=fields.CASCADE)
     meaning = fields.TextField(description="word meaning")
     example = fields.TextField(null=True, description="example sentence")
-    pos = fields.CharEnumField(PosEnumJp, max_length=30, null=True)
+    pos = fields.ManyToManyField("models.PosType", related_name="definitions", on_delete=fields.CASCADE)
 
     class Meta:
         table = "definitions_jp"
+
+class PosType(Model):
+    id = fields.IntField(pk=True)
+    pos_type = fields.CharEnumField(PosEnumJp, max_length=30, null=False)
+
+    class Meta:
+        table = "pos_type"
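A minimal read-back sketch (not part of this commit) showing how the new many-to-many pos relation can be queried with Tortoise ORM; the names come from the models above and the connection setup from settings.TORTOISE_ORM:

import asyncio

from tortoise import Tortoise

from app.models import DefinitionJp
from settings import TORTOISE_ORM


async def show_pos_tags():
    await Tortoise.init(config=TORTOISE_ORM)
    # prefetch_related loads the related pos_type rows alongside each definition
    for definition in await DefinitionJp.all().prefetch_related("pos").limit(5):
        print(definition.meaning, [p.pos_type for p in definition.pos])
    await Tortoise.close_connections()


if __name__ == "__main__":
    asyncio.run(show_pos_tags())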
@@ -37,10 +37,22 @@ class PosEnumJp(str, Enum):
     noun = "名词"
     adj = "形容词"
     adj_v = "形容动词"
     adv = "连用"
     v1 = "一段动词"
     v5 = "五段动词"
     help = "助词"
+    self = "自动词"
+    other = "他动词"
+    tail = "接尾"
+    self_other = "自他动词"
+    follow = "接续"
+    habit = "惯用"
+    excl = "感叹词"
+    ka_v = "カ変"
+    sa_v = "サ変"
+    conn = "连体"
+    quantity = "量词"
+    pron = "代词"
 
 class CreateWord(BaseModel):
     word: str
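The import script in this commit looks up PosType rows by value, so the pos_type table must already contain one row per enum member. A possible seeding sketch (an assumption, not shown in this commit; the admin_schemas import path is also assumed):

import asyncio

from tortoise import Tortoise

from app.models import PosType
from app.schemas.admin_schemas import PosEnumJp  # assumed module path
from settings import TORTOISE_ORM


async def seed_pos_types():
    await Tortoise.init(config=TORTOISE_ORM)
    for member in PosEnumJp:
        # get_or_create keeps the seeding idempotent
        await PosType.get_or_create(pos_type=member)
    await Tortoise.close_connections()


if __name__ == "__main__":
    asyncio.run(seed_pos_types())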
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import Literal
+from typing import Literal, Optional
 
 default_portrait_url = '#'
 
@@ -46,9 +46,9 @@ class UserOut(BaseModel):
 
 
 class UpdateUserRequest(BaseModel):
-    current_password: str
-    new_username: str
-    new_password: str
+    current_password: Optional[str] = None
+    new_username: Optional[str] = None
+    new_password: Optional[str] = None
     new_language: Literal["jp", "fr", "private"] = "private"
 
     # lang_pref: str = "jp"
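With the fields now Optional, a client can send only what it wants to change. A small sketch (assuming Pydantic v2 and an app.schemas import path; on Pydantic v1 use .dict(exclude_none=True) instead):

from app.schemas import UpdateUserRequest  # assumed module path

payload = UpdateUserRequest(current_password="old-secret", new_language="jp")
print(payload.model_dump(exclude_none=True))
# -> {'current_password': 'old-secret', 'new_language': 'jp'}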
@@ -1,3 +1,5 @@
+fugashi
+pykakasi
 aerich==0.9.1
 aiosqlite==0.21.0
 annotated-types==0.7.0
@@ -0,0 +1,223 @@
+import asyncio
+import re
+import unicodedata
+import jaconv
+from pathlib import Path
+
+import pandas as pd
+from fugashi import Tagger
+from pykakasi import kakasi
+from tortoise import Tortoise
+from tortoise.exceptions import MultipleObjectsReturned
+
+from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
+from settings import TORTOISE_ORM
+
+xlsx_name = "./DictTable-20250823.xlsx"
+xlsx_path = Path(xlsx_name)
+
+
+def normalize_jp_text(text: str) -> str:
+    # Unicode normalization (unify full-width and half-width forms)
+    text = unicodedata.normalize("NFKC", text)
+    # Remove leading/trailing whitespace and common invisible characters (zero-width spaces, newlines, tabs, etc.)
+    text = re.sub(r'[\u200b-\u200f\u202a-\u202e\r\n\t]', '', text)
+    return text.strip()
+
+
+async def pos_process(pos: str):
+    # Map abbreviations to the standard enum values
+    mapping = {
+        "形": "形容词",
+        "形动": "形容动词",
+        "感叹": "感叹词",
+        "连体词": "连体",
+        "副词": "连用",
+        "自动": "自动词",
+        "他动": "他动词",
+        "自他动": "自他动词",
+        "五段": "五段动词",
+        "一段": "一段动词",
+        "接续词": "接续",
+        "カ变动词": "カ変",
+        "サ变动词": "サ変",
+    }
+
+    # Strip whitespace and apply the mapping
+    pos_list = [mapping.get(p.strip(), p.strip()) for p in pos.split("・") if p.strip()]
+    if "动词" in pos_list:
+        return None, True
+
+    # Query the matching PosType instances (deduplicated via a set)
+    pos_type_objs = await PosType.filter(pos_type__in=list(set(pos_list)))
+    return pos_type_objs, False
+
+
+# Initialize the tokenizer
+tagger = Tagger()
+
+# Initialize the kakasi converter
+kakasi_inst = kakasi()
+kakasi_inst.setMode("H", "a")  # hiragana to romaji
+kakasi_inst.setMode("K", "a")  # katakana to romaji
+kakasi_inst.setMode("J", "a")  # kanji: convert to kana by reading, then to romaji
+kakasi_inst.setMode("r", "Hepburn")  # use Hepburn romanization rules
+converter = kakasi_inst.getConverter()
+
+
+def is_kana_only(text: str) -> bool:
+    """
+    Check whether the text is pure kana (contains no kanji).
+    """
+    for ch in text:
+        if not ('\u3040' <= ch <= '\u309F' or '\u30A0' <= ch <= '\u30FF'):
+            return False
+    return True
+
+
+def to_kana(word: str) -> str:
+    # If the text is all kana, return it unchanged
+    if is_kana_only(word):
+        return word
+
+    # Otherwise tokenize with fugashi and join the kana readings (assumes `feature.kana` is available)
+    tokens = tagger(word)
+    kana_list = []
+    for token in tokens:
+        # token.feature.kana is the kana field from the UniDic dictionary
+        kana = getattr(token.feature, 'kana', None) or token.surface
+        kana_list.append(kana)
+
+    return ''.join(kana_list)
+
+
+def kana_to_romaji(text: str) -> str:
+    """
+    Convert Japanese text to romaji (kana is used directly; kanji readings are inferred from UniDic).
+    """
+
+    # Parse the text with fugashi and take the kana reading of each token
+    kana_seq = []
+    for word in tagger(text):
+        kana = word.feature[7] if len(word.feature) > 7 and word.feature[7] else word.surface
+        kana_seq.append(kana)
+
+    joined_kana = ''.join(kana_seq)
+    romaji = converter.do(joined_kana)
+    return romaji
+
+
+async def import_wordlist_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
+    df = pd.read_excel(path, sheet_name=sheet_name)
+    df.columns = [col.strip() for col in df.columns]
+
+    for row in df.itertuples():
+        if pd.isna(row.单词):
+            continue
+        word = normalize_jp_text(str(row.单词))
+
+        word_obj, created = await WordlistJp.get_or_create(text=word, defaults={"freq": 0})
+        if created and word == 'また':
+            print(f"✅ new entry: {word}")
+        elif not created:
+            print(f"⚠️ already exists: {word}, skipping")
+        else:
+            pass
+
+
+async def import_def_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
+    df = pd.read_excel(path, sheet_name=sheet_name)
+    df.columns = [col.strip() for col in df.columns]
+
+    for row in df.itertuples():
+        if pd.isna(row.单词):
+            continue
+        word = normalize_jp_text(str(row.单词))
+
+        word = str(word).strip()
+
+        try:
+            cls_word = await WordlistJp.get(text=word)
+        except MultipleObjectsReturned:
+            ids = await WordlistJp.filter(text=word).values_list("id", flat=True)
+            print(f"❗ duplicate word {word}, ids: {' '.join(str(i) for i in ids)}")
+            continue
+        except Exception as e:
+            print(f"❌ error looking up word {word}: {e}")
+            continue
+
+        # Field processing
+        example = None if pd.isna(row.日语例句1) else normalize_jp_text(str(row.日语例句1))
+        if not pd.isna(row.词性):
+            pos_obj, jump = await pos_process(str(row.词性))
+            if jump:
+                continue
+        else:
+            print(f"❌ POS of {word} is empty, skipping")
+            continue
+        chi_exp = str(row[4]).strip()
+
+        exists = await DefinitionJp.filter(
+            word=cls_word,
+            meaning=chi_exp,
+        ).exists()
+        if exists:
+            print(f"⚠️ definition already exists, skipping: {word} - {chi_exp[:10]}...")
+            continue
+
+        try:
+            new_item = await DefinitionJp.create(
+                word=cls_word,
+                meaning=chi_exp,
+                example=example,
+            )
+            await new_item.pos.add(*pos_obj)
+            print(f"✅ imported definition: {word}")
+        except Exception as e:
+            print(f"❌ failed to insert definition: {word}, error: {e}")
+
+
+async def import_attachment(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
+    df = pd.read_excel(path, sheet_name=sheet_name)
+    df.columns = [col.strip() for col in df.columns]
+
+    # Normalize, then deduplicate the word list
+    unique_words = df["单词"].dropna().map(lambda x: normalize_jp_text(str(x))).unique().tolist()
+
+    # Bulk-fetch all WordlistJp instances
+    word_objs = await WordlistJp.filter(text__in=unique_words)
+    word_map = {normalize_jp_text(w.text): w for w in word_objs}
+
+    for row in df.itertuples():
+        if pd.isna(row.单词):
+            continue
+        word = normalize_jp_text(str(row.单词))
+        word_obj = word_map.get(word)
+        if not word_obj:
+            print(f"❌ entry not found: {word}, skipping")
+            print(f"[DEBUG] raw: {repr(row.单词)} → normalized: {normalize_jp_text(str(row.单词))}")
+            print(f"code points: {[hex(ord(c)) for c in str(row.单词)]}")
+            continue
+
+        hiragana = normalize_jp_text(jaconv.kata2hira(str(row[1]))) if pd.isna(row[2]) else normalize_jp_text(str(row[2]))
+        romaji = kana_to_romaji(word)
+
+        await AttachmentJp.get_or_create(
+            word=word_obj,
+            hiragana=hiragana,
+            romaji=romaji,
+        )
+
+
+async def main():
+    await Tortoise.init(config=TORTOISE_ORM)
+    # await DefinitionJp.all().delete()  # TRUNCATE TABLE definitions_fr;
+    # await WordlistJp.all().delete()
+    # await AttachmentJp.all().delete()
+    # await import_wordlist_jp()
+    # await import_def_jp()
+    await import_attachment()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
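One note on the kakasi initialization in the script above: setMode()/getConverter() is pykakasi's legacy 1.x interface, which newer releases keep only with deprecation warnings. A sketch of the same conversion with the pykakasi 2.x API (an alternative, not what this commit uses):

from pykakasi import kakasi

kks = kakasi()
# convert() returns one dict per segment with 'orig', 'hira', 'kana', 'hepburn', ... keys
result = kks.convert("振り仮名")
romaji = "".join(item["hepburn"] for item in result)
print(romaji)  # e.g. "furigana"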