import asyncio
import re
import unicodedata
import jaconv
from pathlib import Path

import pandas as pd
from fugashi import Tagger
from pykakasi import kakasi
from tortoise import Tortoise
from tortoise.exceptions import MultipleObjectsReturned

from app.models import WordlistJp, DefinitionJp, AttachmentJp, PosType
from settings import TORTOISE_ORM

xlsx_name = "./DictTable-20250823.xlsx"
xlsx_path = Path(xlsx_name)

def normalize_jp_text(text: str) -> str:
    # Unicode normalization (NFKC unifies full-width/half-width forms)
    text = unicodedata.normalize("NFKC", text)
    # Strip common invisible characters (zero-width spaces, bidi controls,
    # newlines, tabs, etc.), then trim surrounding whitespace
    text = re.sub(r'[\u200b-\u200f\u202a-\u202e\r\n\t]', '', text)
    return text.strip()

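# Rough illustration (not from the original data) of what normalize_jp_text is
# expected to do, assuming standard NFKC behaviour: half-width katakana and
# full-width ASCII are folded, and zero-width/control characters are dropped.
#   normalize_jp_text("ｶﾞｯｺｳ\u200b\n")  ->  "ガッコウ"
#   normalize_jp_text("ＡＢＣ　１２３ ")   ->  "ABC 123"
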
async def pos_process(pos: str):
    # Map abbreviated part-of-speech labels to the canonical enum values
    mapping = {
        "形": "形容词",
        "形动": "形容动词",
        "感叹": "感叹词",
        "连体词": "连体",
        "副词": "连用",
        "自动": "自动词",
        "他动": "他动词",
        "自他动": "自他动词",
        "五段": "五段动词",
        "一段": "一段动词",
        "接续词": "接续",
        "カ变动词": "カ変",
        "サ变动词": "サ変",
    }

    # Split on "・", trim whitespace, and apply the mapping
    pos_list = [mapping.get(p.strip(), p.strip()) for p in pos.split("・") if p.strip()]
    if "动词" in pos_list:
        # Generic "动词" label: signal the caller to skip this row
        return None, True

    # Query the matching PosType instances (deduplicated via set)
    pos_type_objs = await PosType.filter(pos_type__in=list(set(pos_list)))
    return pos_type_objs, False

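# Sketch of the expected mapping behaviour (illustrative cell values, not taken
# from the spreadsheet): a cell such as "自他动・サ变动词" splits on "・" and
# becomes ["自他动词", "サ変"], which is then matched against PosType.pos_type.
# A cell containing the bare label "动词" makes pos_process return (None, True),
# so the caller skips that row.
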
# Initialize the tokenizer
tagger = Tagger()

# Initialize the kakasi converter (legacy setMode/getConverter API)
kakasi_inst = kakasi()
kakasi_inst.setMode("H", "a")        # hiragana -> romaji
kakasi_inst.setMode("K", "a")        # katakana -> romaji
kakasi_inst.setMode("J", "a")        # kanji -> reading -> romaji
kakasi_inst.setMode("r", "Hepburn")  # use Hepburn romanization rules
converter = kakasi_inst.getConverter()

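# Minimal usage sketch for the converter above (outputs are approximate and
# depend on the installed pykakasi version; Hepburn romanization assumed):
#   converter.do("にほんご")  ->  "nihongo"
#   converter.do("カタカナ")  ->  "katakana"
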
def is_kana_only(text: str) -> bool:
    """
    Return True if the text consists solely of kana (no kanji or other scripts).
    """
    for ch in text:
        if not ('\u3040' <= ch <= '\u309F' or '\u30A0' <= ch <= '\u30FF'):
            return False
    return True

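# Illustrative checks (following directly from the Unicode ranges above):
#   is_kana_only("たべる")  ->  True   (hiragana only)
#   is_kana_only("タベル")  ->  True   (katakana only)
#   is_kana_only("食べる")  ->  False  (contains a kanji)
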
def to_kana(word: str) -> str:
    # If the word is already pure kana, return it unchanged
    if is_kana_only(word):
        return word

    # Otherwise tokenize with fugashi and join the per-token readings
    tokens = tagger(word)
    kana_list = []
    for token in tokens:
        # token.feature.kana is the kana reading field from the UniDic dictionary;
        # fall back to the surface form when no reading is available
        kana = getattr(token.feature, 'kana', None) or token.surface
        kana_list.append(kana)

    return ''.join(kana_list)

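# Rough usage sketch (actual readings depend on the installed UniDic dictionary;
# UniDic reports readings in katakana):
#   to_kana("たべる")  ->  "たべる"   (already kana, returned as-is)
#   to_kana("食べる")  ->  "タベル"   (reading assembled from the tokenizer)
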
def kana_to_romaji(text: str) -> str:
    """
    Convert Japanese text to romaji (kana is used directly; kanji readings
    are inferred from the dictionary features).
    """
    # Tokenize with fugashi and collect each token's reading
    kana_seq = []
    for word in tagger(text):
        # feature[7] is assumed to hold the reading; fall back to the surface form
        kana = word.feature[7] if len(word.feature) > 7 and word.feature[7] else word.surface
        kana_seq.append(kana)

    joined_kana = ''.join(kana_seq)
    romaji = converter.do(joined_kana)
    return romaji

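# Illustrative call (the exact output depends on the dictionary and pykakasi
# version in use):
#   kana_to_romaji("日本語")  ->  roughly "nihongo"
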
async def import_wordlist_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]

    for row in df.itertuples():
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))

        word_obj, created = await WordlistJp.get_or_create(text=word, defaults={"freq": 0})
        if created and word == 'また':  # debug output restricted to this one entry
            print(f"✅ Created entry: {word}")
        elif not created:
            print(f"⚠️ Already exists: {word}, skipping")

async def import_def_jp(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]

    for row in df.itertuples():
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))

        try:
            cls_word = await WordlistJp.get(text=word)
        except MultipleObjectsReturned:
            ids = await WordlistJp.filter(text=word).values_list("id", flat=True)
            print(f"❗ Duplicate word {word}, ids: {' '.join(str(i) for i in ids)}")
            continue
        except Exception as e:
            print(f"❌ Failed to look up word {word}: {e}")
            continue

        if pd.isna(row[6]):
            continue

        # Field processing
        example = None if pd.isna(row.日语例句2) else normalize_jp_text(str(row.日语例句2))
        if not pd.isna(row.词性):
            pos_obj, jump = await pos_process(str(row.词性))
            if jump:
                continue
        else:
            print(f"❌ Part of speech for {word} is empty, skipping")
            continue
        chi_exp = str(row[6]).strip()  # read the second definition column

        exists = await DefinitionJp.filter(
            word=cls_word,
            meaning=chi_exp,
        ).exists()
        if exists:
            print(f"⚠️ Definition already exists, skipping: {word} - {chi_exp[:10]}...")
            continue

        try:
            new_item = await DefinitionJp.create(
                word=cls_word,
                meaning=chi_exp,
                example=example,
            )
            await new_item.pos.add(*pos_obj)
            print(f"✅ Imported definition: {word}")
        except Exception as e:
            print(f"❌ Failed to insert definition for {word}: {e}")

async def import_attachment(path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]

    # Clean and deduplicate the word column
    unique_words = df["单词"].dropna().map(lambda x: normalize_jp_text(str(x))).unique().tolist()

    # Fetch all matching WordlistJp instances in one query
    word_objs = await WordlistJp.filter(text__in=unique_words)
    word_map = {normalize_jp_text(w.text): w for w in word_objs}

    for row in df.itertuples():
        if pd.isna(row.单词):
            continue
        word = normalize_jp_text(str(row.单词))
        word_obj = word_map.get(word)
        if not word_obj:
            print(f"❌ Entry not found: {word}, skipping")
            print(f"[DEBUG] raw: {repr(row.单词)} → normalized: {normalize_jp_text(str(row.单词))}")
            print(f"code points: {[hex(ord(c)) for c in str(row.单词)]}")
            continue

        # Use the reading from column 2 when present; otherwise convert the
        # word column itself from katakana to hiragana
        hiragana = normalize_jp_text(jaconv.kata2hira(str(row[1]))) if pd.isna(row[2]) else normalize_jp_text(str(row[2]))
        romaji = jaconv.kana2alphabet(hiragana)

        await AttachmentJp.get_or_create(
            word=word_obj,
            hiragana=hiragana,
            romaji=romaji,
        )

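# Quick sketch of the jaconv calls used above (illustrative inputs, not taken
# from the spreadsheet):
#   jaconv.kata2hira("タベル")     ->  "たべる"
#   jaconv.kana2alphabet("たべる")  ->  "taberu"
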
async def set_hiragana(xlsx_path: Path = xlsx_path, sheet_name: str = "日汉释义"):
    df = pd.read_excel(xlsx_path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]

    for row in df.itertuples():
        if pd.isna(row[1]):
            break
        word = normalize_jp_text(str(row[1]).strip())

        hiragana = normalize_jp_text(jaconv.kata2hira(str(row[1]))) if pd.isna(row[2]) else normalize_jp_text(str(row[2]))
        romaji = row[3]  # currently unused

        await WordlistJp.filter(text=word).update(hiragana=hiragana)

async def main():
    await Tortoise.init(config=TORTOISE_ORM)
    # await DefinitionJp.all().delete()  # TRUNCATE TABLE definitions_fr;
    # await WordlistJp.all().delete()
    # await AttachmentJp.all().delete()
    # await import_wordlist_jp()
    # await import_def_jp()
    # await import_attachment()
    await set_hiragana()

if __name__ == '__main__':
    asyncio.run(main())