dict-server/scripts/update_jp.py

import asyncio
import re
import unicodedata
import jaconv
from pathlib import Path
import pandas as pd
from fugashi import Tagger
from pykakasi import kakasi
from tortoise import Tortoise
from tortoise.exceptions import MultipleObjectsReturned
from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
from settings import TORTOISE_ORM

xlsx_name = "./DictTable-20250823.xlsx"
xlsx_path = Path(xlsx_name)


def normalize_jp_text(text: str) -> str:
    # Unicode normalization (NFKC): unify full-width / half-width forms
    text = unicodedata.normalize("NFKC", text)
    # Strip common invisible characters (zero-width spaces, bidi marks, newlines, tabs)
    text = re.sub(r'[\u200b-\u200f\u202a-\u202e\r\n\t]', '', text)
    return text.strip()
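

# Illustrative only (hypothetical input): normalize_jp_text("ｶﾞﾗｽ\u200b ") -> "ガラス"
# NFKC folds half-width katakana to full width, and the zero-width space plus the
# trailing space are stripped.
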
async def pos_process(pos: str):
    # Map abbreviated POS labels to the canonical enum values stored in PosType
    mapping = {
        "形": "形容词",  # NOTE: the original key was lost in the paste; "形" is an assumption
        "形动": "形容动词",
        "感叹": "感叹词",
        "连体词": "连体",
        "副词": "连用",
        "自动": "自动词",
        "他动": "他动词",
        "自他动": "自他动词",
        "五段": "五段动词",
        "一段": "一段动词",
        "接续词": "接续",
        "カ变动词": "カ変",
        "サ变动词": "サ変",
    }
    # Strip whitespace and apply the mapping. The original split delimiter was lost in
    # the paste; splitting on common CJK separators here is an assumption.
    pos_list = [mapping.get(p.strip(), p.strip())
                for p in re.split(r"[、,,/・]", pos) if p.strip()]
    if "动词" in pos_list:
        return None, True
    # Look up the matching PosType rows (deduplicated via set)
    pos_type_objs = await PosType.filter(pos_type__in=list(set(pos_list)))
    return pos_type_objs, False
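

# Illustrative call (assumes "、" is one of the separators used in the 词性 column):
#   pos_objs, skip = await pos_process("他动、五段")
#   -> fetches PosType rows for {"他动词", "五段动词"}; skip is False
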
# Initialize the MeCab tagger (fugashi)
tagger = Tagger()

# Initialize the kakasi converter (legacy setMode/getConverter API)
kakasi_inst = kakasi()
kakasi_inst.setMode("H", "a")        # hiragana -> romaji
kakasi_inst.setMode("K", "a")        # katakana -> romaji
kakasi_inst.setMode("J", "a")        # kanji -> kana (by reading) -> romaji
kakasi_inst.setMode("r", "Hepburn")  # use Hepburn romanization rules
converter = kakasi_inst.getConverter()
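

# Reference sketch of the newer pykakasi 2.x interface, not used below:
#   kakasi().convert("日本語") returns a list of dicts whose "hepburn" field holds
#   the romaji for each segment.
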
def is_kana_only(text: str) -> bool:
    """Return True if the text is pure kana (hiragana or katakana, no kanji)."""
    for ch in text:
        if not ('\u3040' <= ch <= '\u309F' or '\u30A0' <= ch <= '\u30FF'):
            return False
    return True


def to_kana(word: str) -> str:
    # Already pure kana: return unchanged
    if is_kana_only(word):
        return word
    # Otherwise tokenize with fugashi and join the kana readings
    tokens = tagger(word)
    kana_list = []
    for token in tokens:
        # token.feature.kana is the kana-reading field of the UniDic feature tuple;
        # fall back to the surface form when no reading is available
        kana = getattr(token.feature, 'kana', None) or token.surface
        kana_list.append(kana)
    return ''.join(kana_list)
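

# Illustrative: a pure-kana word such as "こんにちは" is returned unchanged, while a
# word containing kanji (e.g. "学校") is tokenized and its UniDic kana reading
# ("ガッコウ") is returned, assuming the unidic dictionary is installed for fugashi.
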
def kana_to_romaji(text: str) -> str:
    """
    Convert Japanese text to romaji. Kana is used as-is; for kanji the reading is
    inferred from the UniDic features.
    """
    # Parse with fugashi and take the kana reading of each token
    kana_seq = []
    for word in tagger(text):
        # positional access into the UniDic feature tuple; fall back to the surface form
        kana = word.feature[7] if len(word.feature) > 7 and word.feature[7] else word.surface
        kana_seq.append(kana)
    joined_kana = ''.join(kana_seq)
    romaji = converter.do(joined_kana)
    return romaji
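

# Illustrative: kana_to_romaji("東京") first resolves a kana reading such as
# "トウキョウ" via fugashi and then romanizes it (roughly "toukyou"; the exact output
# depends on the installed pykakasi and dictionary versions).
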
async def import_wordlist_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]
    for row in df.itertuples():
        # Check for NaN before stringifying, otherwise NaN becomes the string "nan"
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
        word_obj, created = await WordlistJp.get_or_create(text=word, defaults={"freq": 0})
        if created:
            print(f"✅ Added entry: {word}")
        else:
            print(f"⚠️ Already exists: {word}, skipping")
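

# Note: get_or_create makes the word-list import idempotent: re-running it does not
# insert duplicate WordlistJp rows for the same normalized text.
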
async def import_def_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]
    for row in df.itertuples():
        # Check for NaN before stringifying
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
        try:
            cls_word = await WordlistJp.get(text=word)
        except MultipleObjectsReturned:
            ids = await WordlistJp.filter(text=word).values_list("id", flat=True)
            print(f"❗ Duplicate word {word}, ids: {' '.join(str(i) for i in ids)}")
            continue
        except Exception as e:
            print(f"❌ Error looking up word {word}: {e}")
            continue
        # Per-row field processing
        example = None if pd.isna(row.日语例句1) else normalize_jp_text(str(row.日语例句1))
        if not pd.isna(row.词性):
            pos_obj, jump = await pos_process(str(row.词性))
            if jump:
                continue
        else:
            print(f"POS for {word} is empty, skipping")
            continue
        chi_exp = str(row[4]).strip()  # Chinese-meaning column, accessed positionally
        exists = await DefinitionJp.filter(
            word=cls_word,
            meaning=chi_exp,
        ).exists()
        if exists:
            print(f"⚠️ Definition already exists, skipping: {word} - {chi_exp[:10]}...")
            continue
        try:
            new_item = await DefinitionJp.create(
                word=cls_word,
                meaning=chi_exp,
                example=example,
            )
            await new_item.pos.add(*pos_obj)
            print(f"✅ Imported definition: {word}")
        except Exception as e:
            print(f"❌ Failed to insert definition: {word}, error: {e}")


async def import_attachment(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]
    # Normalize, then deduplicate the word column
    unique_words = df["单词"].dropna().map(lambda x: normalize_jp_text(str(x))).unique().tolist()
    # Fetch all matching WordlistJp rows in a single query
    word_objs = await WordlistJp.filter(text__in=unique_words)
    word_map = {normalize_jp_text(w.text): w for w in word_objs}
    for row in df.itertuples():
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
        word_obj = word_map.get(word)
        if not word_obj:
            print(f"❌ Entry not found: {word}, skipping")
            print(f"[DEBUG] raw: {repr(row.单词)} → normalized: {word}")
            print(f"code points: {[hex(ord(c)) for c in str(row.单词)]}")
            continue
        # row[1] is the word column, row[2] the kana-reading column (positional access);
        # fall back to the word itself, folded from katakana to hiragana, when the reading is missing
        hiragana = normalize_jp_text(jaconv.kata2hira(str(row[1]))) if pd.isna(row[2]) else normalize_jp_text(str(row[2]))
        romaji = jaconv.kana2alphabet(hiragana)
        await AttachmentJp.get_or_create(
            word=word_obj,
            hiragana=hiragana,
            romaji=romaji,
        )
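

# Illustrative: jaconv.kana2alphabet("まみさん") yields "mamisan"; unlike the kakasi
# converter above, it maps hiragana to romaji directly without dictionary lookups.
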
async def main():
    await Tortoise.init(config=TORTOISE_ORM)
    # await DefinitionJp.all().delete()  # TRUNCATE TABLE definitions_fr;
    # await WordlistJp.all().delete()
    # await AttachmentJp.all().delete()
    # await import_wordlist_jp()
    # await import_def_jp()
    await import_attachment()


if __name__ == '__main__':
    asyncio.run(main())
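

# Typical invocation (assuming the dict-server project root is on PYTHONPATH and the
# spreadsheet sits at the path configured above):
#   python scripts/update_jp.py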