settings.py:

-更新用户名(由于数据库连接更新)
-更新了多数据库连接(预留prod_db备用)
backfill_search_text.py:
-统一回填search_text脚本
signals.py:
-后续加入内容时自动处理text为search_text
./scripts:
-数据库导入脚本
This commit is contained in:
Miyamizu-MitsuhaSang 2025-08-17 16:19:53 +08:00
parent 264315ae9d
commit fde510803e
14 changed files with 210 additions and 140 deletions

View File

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$"> <content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" /> <excludeFolder url="file://$MODULE_DIR$/.venv" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.12 (dict_server)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.12 (dict_server) (2)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

View File

@ -3,5 +3,5 @@
<component name="Black"> <component name="Black">
<option name="sdkName" value="Python 3.12 (dict_server)" /> <option name="sdkName" value="Python 3.12 (dict_server)" />
</component> </component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (dict_server)" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (dict_server) (2)" project-jdk-type="Python SDK" />
</project> </project>

View File

@ -6,7 +6,7 @@ import redis.asyncio as redis
from app.models.base import ReservedWords, User, Language from app.models.base import ReservedWords, User, Language
from app.utils.security import verify_password, hash_password, validate_password, validate_username, get_current_user from app.utils.security import verify_password, hash_password, validate_password, validate_username, get_current_user
from settings import SECRET_KEY from settings import settings
from app.core.redis import get_redis from app.core.redis import get_redis
from app.schemas.user_schemas import UserIn, UserOut, UpdateUserRequest, UserLoginRequest from app.schemas.user_schemas import UserIn, UserOut, UpdateUserRequest, UserLoginRequest
@ -67,7 +67,7 @@ async def user_login(user_in: UserLoginRequest):
"is_admin": user.is_admin, "is_admin": user.is_admin,
} }
token = jwt.encode(payload, SECRET_KEY, algorithm="HS256") token = jwt.encode(payload, settings.SECRET_KEY, algorithm="HS256")
return { return {
"access_token": token, "access_token": token,

View File

@ -12,42 +12,17 @@ sheet_name_fr = "法英中释义"
class WordlistFr(Model): class WordlistFr(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
language = fields.CharField(max_length=20, description="单词语种")
text = fields.CharField(max_length=40, unique=True, description="单词") text = fields.CharField(max_length=40, unique=True, description="单词")
definitions = fields.ReverseRelation("DefinitionFr") definitions: fields.ReverseRelation["DefinitionFr"]
attachments = fields.ReverseRelation("AttachmentsFr") attachments: fields.ReverseRelation["AttachmentFr"]
freq = fields.IntField() # 词频排序用
search_text = fields.CharField(max_length=255, index=True) # 检索字段
# attachment = fields.ForeignKeyField("models.Attachment", related_name="wordlists", on_delete=fields.CASCADE) # attachment = fields.ForeignKeyField("models.Attachment", related_name="wordlists", on_delete=fields.CASCADE)
# source = fields.CharField(max_length=20, description="<UNK>", null=True) # source = fields.CharField(max_length=20, description="<UNK>", null=True)
class Meta: class Meta:
table = "wordlist_fr" table = "wordlist_fr"
T = TypeVar("T", bound=Model)
@classmethod
async def update_or_create(cls: Type[T], **kwargs) -> Tuple[T, bool]:
print("传入参数为:", kwargs)
if not kwargs:
raise ValueError("必须提供至少一个字段作为参数")
created: bool = False
# 使用 kwargs 中第一个字段作为查找条件
first_key = next(iter(kwargs))
lookup = {first_key: kwargs[first_key]}
word = await cls.filter(**lookup).first() # 参数展开语法
if word:
for k, v in kwargs.items():
if k != first_key:
setattr(word, k, v)
await word.save()
else:
await cls.create(**kwargs)
created = True
return word, created
class AttachmentFr(Model): class AttachmentFr(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
@ -63,91 +38,10 @@ class AttachmentFr(Model):
class DefinitionFr(Model): class DefinitionFr(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
word = fields.ForeignKeyField("models.WordlistFr", related_name="definitions", on_delete=fields.CASCADE) word = fields.ForeignKeyField("models.WordlistFr", related_name="definitions", on_delete=fields.CASCADE)
pos = fields.CharEnumField(PosEnumFr, max_length=30) # ✅ 把词性放在释义层面 pos = fields.CharEnumField(PosEnumFr, max_length=30, null=True) # ✅ 把词性放在释义层面
meaning = fields.TextField(description="单词释义") # 如:“学习” meaning = fields.TextField(description="单词释义") # 如:“学习”
example = fields.TextField(null=True, description="单词例句") example = fields.TextField(null=True, description="单词例句")
eng_explanation = fields.TextField(null=True, description="English explanation") eng_explanation = fields.TextField(null=True, description="English explanation")
class Meta: class Meta:
table = "definitions_fr" table = "definitions_fr"
@classmethod
async def init_from_xlsx(
cls,
filepath: str,
sheet_name: str
):
"""
Initiate the database from xlsx file. Only read in data without checking
whether the content already exists.
:param filepath: receive both relative or absolute path
:param sheet_name: specific sheet name inside the .xlsx file
:return: None
"""
df = pd.read_excel(filepath, sheet_name=sheet_name, na_filter=True)
df.columns = [col.strip() for col in df.columns]
df.dropna(how="all", inplace=True)
# create_cnt = 0
DEF_COUNT = 1
for row in df.itertuples():
word = row.单词
cls_word = await WordlistFr.filter(text=word).first()
if cls_word is None:
print(f"未找到 word: {word}")
continue
pos = getattr(row, f"词性{DEF_COUNT}")
if pd.isna(pos):
continue
meaning = getattr(row, f"中文释义{DEF_COUNT}")
eng_exp = getattr(row, f"英语释义{DEF_COUNT}")
await DefinitionFr.create(
part_of_speech=pos,
meaning=meaning,
eng_explanation=eng_exp,
word=cls_word
)
# TODO revise the function (check update or create by id)
@classmethod
async def update_or_create_meaning(
cls,
word_obj,
target_language_obj,
part_of_speech: str,
meaning: str,
example: str = None,
eng_explanation: str = None,
) -> tuple["DefinitionFr", bool]:
"""
查询某个单词是否已有该释义依据四元组作为唯一标识存在则更新不存在则新增
返回(对象, 是否为新创建)
"""
query = {
"word": word_obj,
"target_language": target_language_obj,
"part_of_speech": part_of_speech,
"meaning": meaning
}
obj = await cls.filter(**query).first()
created = False
if obj:
# 可更新其他字段
obj.example = example
obj.eng_explanation = eng_explanation
await obj.save()
else:
obj = await cls.create(
word=word_obj,
target_language=target_language_obj,
part_of_speech=part_of_speech,
meaning=meaning,
example=example,
eng_explanation=eng_explanation,
)
created = True
return obj, created

View File

@ -16,8 +16,8 @@ sheet_name_jp = "日汉释义"
class WordlistJp(Model): class WordlistJp(Model):
id = fields.IntField(pk=True) id = fields.IntField(pk=True)
text = fields.CharField(max_length=40, description="单词") text = fields.CharField(max_length=40, description="单词")
definitions = fields.ReverseRelation("DefinitionJp") definitions : fields.ReverseRelation["DefinitionJp"]
attachments = fields.ReverseRelation("AttachmentsJp") attachments : fields.ReverseRelation["AttachmentJp"]
class Meta: class Meta:
table = "wordlist_jp" table = "wordlist_jp"

41
app/models/signals.py Normal file
View File

@ -0,0 +1,41 @@
from tortoise.signals import pre_save
from tortoise import BaseDBAsyncClient
from typing import Optional
from app.utils.textnorm import normalize_text
from app.models.fr import WordlistFr
@pre_save(WordlistFr)
async def wordlist_fr_pre_save(
    sender: type[WordlistFr],
    instance: WordlistFr,
    using_db: BaseDBAsyncClient,
    update_fields: Optional[list[str]]
) -> None:
    """
    Keep search_text in sync with text, but only when text actually changed.

    Rules:
    - On create (or a full save without update_fields) always write search_text.
    - On a partial save, sync only when "text" is part of this update; in that
      case "search_text" is appended to the field list so the ORM writes it back.
    - If the normalized value is already stored, do nothing (skips a no-op UPDATE).
    """
    normalized = normalize_text(instance.text or "")

    # Already up to date — avoid a pointless write.
    if instance.search_text == normalized:
        return

    if update_fields is None:
        # Full save: the ORM persists every column, so assigning is enough.
        instance.search_text = normalized
    elif "text" in update_fields:
        instance.search_text = normalized
        # update_fields may be a tuple; copy into a mutable list before extending.
        fields = [*update_fields]
        if "search_text" not in fields:
            fields.append("search_text")
        # Hand the widened field list back to the ORM so this UPDATE
        # includes search_text. NOTE(review): relies on the private
        # _update_fields attribute of Tortoise — confirm on ORM upgrades.
        instance._update_fields = fields
    # Otherwise: this partial save did not touch text — leave search_text alone.

View File

@ -3,10 +3,6 @@ from enum import Enum
from pydantic import BaseModel, validator, field_validator, Field from pydantic import BaseModel, validator, field_validator, Field
from typing import Optional, Literal, List from typing import Optional, Literal, List
from tortoise.exceptions import DoesNotExist
from app.models.fr import WordlistFr
class PosEnumFr(str, Enum): class PosEnumFr(str, Enum):
# noun # noun
@ -21,6 +17,10 @@ class PosEnumFr(str, Enum):
v_i = "v.i." v_i = "v.i."
v_pr = "v.pr." v_pr = "v.pr."
v_t_i = "v.t./v.i." v_t_i = "v.t./v.i."
v_t_dir = "v.t.dir."
v_t_ind = "v.t.ind."
v_t_pr = "v.t.(v.pr.)"
v_i_ind = "v.t.ind./v.i."
adj = "adj." # adj adj = "adj." # adj
adv = "adv." # adv adv = "adv." # adv
@ -29,6 +29,8 @@ class PosEnumFr(str, Enum):
conj = "conj." conj = "conj."
interj = "interj." interj = "interj."
chauff = "chauff" chauff = "chauff"
art = "art."
class PosEnumJp(str, Enum): class PosEnumJp(str, Enum):
@ -55,18 +57,18 @@ class CreateWord(BaseModel):
@classmethod @classmethod
@field_validator("eng_explanation") @field_validator("eng_explanation")
def validate_eng_explanation(cls, v): def validate_eng_explanation(cls, v):
if cls.language is "jp" and v: if cls.language == "jp" and v:
raise ValueError("Japanese word has no English explanation") raise ValueError("Japanese word has no English explanation")
if cls.language is "fr" and v is None or v == "": if cls.language == "fr" and v is None or v == "":
raise ValueError("French word must have English explanation") raise ValueError("French word must have English explanation")
return v return v
@classmethod @classmethod
@field_validator("pos") @field_validator("pos")
def validate_pos(cls, v): def validate_pos(cls, v):
if cls.language is "fr" and v not in PosEnumFr: if cls.language == "fr" and v not in PosEnumFr:
raise ValueError("Pos is not a valid type") raise ValueError("Pos is not a valid type")
if cls.language is "jp" and v not in PosEnumJp: if cls.language == "jp" and v not in PosEnumJp:
raise ValueError("Pos is not a valid type") raise ValueError("Pos is not a valid type")
return v return v

23
app/utils/textnorm.py Normal file
View File

@ -0,0 +1,23 @@
import re
import unicodedata
def normalize_text(s: str) -> str:
    """
    Canonicalize a string for search / the search_text column.

    Pipeline: NFKD-decompose, strip combining marks (é -> e), lowercase,
    trim, and collapse whitespace runs into single spaces.

    :param s: raw input; falsy values (empty string, None) yield "".
    :return: normalized search key.
    """
    if not s:
        return ""
    # Decompose so accented letters split into base letter + combining mark...
    decomposed = unicodedata.normalize("NFKD", s)
    # ...then drop every combining mark, keeping only the base letters.
    base = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Lowercase, trim the ends, and collapse internal whitespace.
    return re.sub(r"\s+", " ", base.lower().strip())

View File

@ -8,6 +8,7 @@ from settings import TORTOISE_ORM
from app.api.users import users_router from app.api.users import users_router
from app.api.admin.router import admin_router from app.api.admin.router import admin_router
from app.core.redis import init_redis_pool from app.core.redis import init_redis_pool
import app.models.signals
@asynccontextmanager @asynccontextmanager

Binary file not shown.

0
scripts/__init__.py Normal file
View File

View File

@ -0,0 +1,17 @@
import asyncio
from tortoise import Tortoise, run_async
from app.models.fr import WordlistFr
from app.utils.textnorm import normalize_text
from settings import TORTOISE_ORM
async def main():
    """Backfill search_text for every WordlistFr row whose value is stale."""
    await Tortoise.init(config=TORTOISE_ORM)
    # Lightweight projection: only the columns this script reads/writes.
    async for entry in WordlistFr.all().only("id", "text", "search_text"):  # type: WordlistFr
        normalized = normalize_text(entry.text)
        if entry.search_text == normalized:
            continue  # already in sync — skip the UPDATE
        entry.search_text = normalized
        await entry.save(update_fields=["search_text"])
    await Tortoise.close_connections()


if __name__ == "__main__":
    run_async(main())

103
scripts/update_fr.py Normal file
View File

@ -0,0 +1,103 @@
import asyncio
from pathlib import Path
import pandas as pd
from tortoise import Tortoise
from tortoise.exceptions import MultipleObjectsReturned
from app.models.fr import DefinitionFr, WordlistFr
from settings import TORTOISE_ORM
import app.models.signals
xlsx_name = "./DictTable_20250811.xlsx"
xlsx_path = Path(xlsx_name)
def pos_process(pos: str) -> str:
    """Normalize a part-of-speech tag read from the spreadsheet.

    - removes all spaces and commas
    - appends a trailing "." unless the tag already ends with "." or ")",
      or is the literal "chauff" (a special value present in PosEnumFr)

    :param pos: raw POS cell content, e.g. "v. t.," or "adj"
    :return: canonical tag, e.g. "v.t." or "adj."
    """
    # Single C-level pass instead of two chained .replace() calls.
    pos = pos.translate(str.maketrans("", "", " ,"))
    # endswith accepts a tuple of suffixes — one call covers both terminators.
    if not pos.endswith((".", ")")) and pos != "chauff":
        pos += "."
    return pos
async def import_wordlist_fr(path: Path = xlsx_path, sheet_name: str = "法英中释义"):
    """Import the word column of the spreadsheet into WordlistFr.

    Stops at the first blank word cell (assumes data rows are contiguous —
    TODO confirm against the spreadsheet layout). Existing words are left
    untouched; new ones are created with freq=0.

    :param path: spreadsheet path (relative or absolute)
    :param sheet_name: sheet to read
    """
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]
    for row in df.itertuples():
        raw = row.单词
        # BUG FIX: the NaN check must run BEFORE str() conversion — str(nan)
        # is the truthy string "nan", for which pd.isna() is always False,
        # so the blank-row guard never fired and a word literally named
        # "nan" could be created.
        if pd.isna(raw):
            break
        word = str(raw).strip()
        word_obj, created = await WordlistFr.get_or_create(text=word, defaults={"freq": 0})
        if created:
            print(f"✅ 新增词条: {word}")
        else:
            print(f"⚠️ 已存在: {word},跳过")
async def import_def_fr(
    path: Path = xlsx_path,
    sheet_name: str = "法英中释义"
):
    """Import French definitions from the spreadsheet into DefinitionFr.

    For each row: resolve the WordlistFr entry by exact text match, normalize
    the POS / example / English-explanation cells, skip rows that would
    duplicate an existing (word, pos, meaning) triple, then create the
    definition. Rows whose word cell is blank or whose word cannot be
    resolved uniquely are skipped with a console message.

    Only the first sense columns (词性1 / 英语释义1 / 法语例句1) are read.

    :param path: spreadsheet path (relative or absolute)
    :param sheet_name: sheet to read
    """
    df = pd.read_excel(path, sheet_name=sheet_name)
    df.columns = [col.strip() for col in df.columns]
    for row in df.itertuples():
        word = row.单词
        if pd.isna(word):
            continue
        word = str(word).strip()
        # Look up the WordlistFr instance (with explicit exception handling).
        try:
            cls_word = await WordlistFr.get(text=word)
        except MultipleObjectsReturned:
            # Duplicate word texts in the table — report the ids and skip.
            ids = await WordlistFr.filter(text=word).values_list("id", flat=True)
            print(f"❗ 重复单词 {word}id为: {' '.join(str(i) for i in ids)}")
            continue
        except Exception as e:
            print(f"❌ 查找单词 {word} 出错: {e}")
            continue
        # Field normalization: blank cells become None; POS goes through
        # pos_process for canonical punctuation.
        example = None if pd.isna(row.法语例句1) else str(row.法语例句1).strip()
        pos = None if pd.isna(row.词性1) else pos_process(str(row.词性1).strip())
        eng_exp = None if pd.isna(row.英语释义1) else str(row.英语释义1).strip()
        # Positional access: row[2] is the third tuple field (index, then
        # columns in sheet order) — presumably the Chinese meaning column;
        # TODO confirm against the spreadsheet layout.
        chi_exp = str(row[2]).strip()
        # Dedupe: a word must not carry the same sense twice (same pos + meaning).
        exists = await DefinitionFr.filter(
            word=cls_word,
            pos=pos,
            meaning=chi_exp
        ).exists()
        if exists:
            print(f"⚠️ 已存在释义,跳过:{word} - {pos} - {chi_exp[:10]}...")
            continue
        # Create the definition; failures are logged and do not abort the run.
        try:
            await DefinitionFr.create(
                word=cls_word,
                pos=pos,
                eng_explanation=eng_exp,
                meaning=chi_exp,
                example=example,
            )
            print(f"✅ 导入释义:{word} - {pos}")
        except Exception as e:
            print(f"❌ 插入释义失败:{word} - {pos},错误: {e}")
async def main():
    """Entry point: wipe all French definitions, then re-import from xlsx.

    WARNING: `DefinitionFr.all().delete()` is destructive — every row of
    definitions_fr is removed before the import runs. Word entries
    (WordlistFr) are NOT touched; the wordlist import is commented out.
    """
    await Tortoise.init(config=TORTOISE_ORM)
    await DefinitionFr.all().delete()
    await import_def_fr()
    # await import_wordlist_fr()


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -2,21 +2,8 @@ from pydantic.v1 import BaseSettings
TORTOISE_ORM = { TORTOISE_ORM = {
'connections': { 'connections': {
'default': { "default": "mysql://local_admin:enterprise@127.0.0.1:3306/dict",
# 'engine': 'tortoise.backends.asyncpg', PostgreSQL "production": "mysql://local_admin:enterprise@127.0.0.1:3306/prod_db",
'engine': 'tortoise.backends.mysql', # MySQL or Mariadb
'credentials': {
'host': '127.0.0.1',
'port': '3306',
'user': 'root',
'password': 'enterprise',
'database': 'dict',
'minsize': 1,
'maxsize': 5,
'charset': 'utf8mb4',
"echo": True
}
},
}, },
'apps': { 'apps': {
'models': { 'models': {
@ -34,8 +21,10 @@ TORTOISE_ORM = {
'timezone': 'Asia/Shanghai' 'timezone': 'Asia/Shanghai'
} }
class Settings(BaseSettings): class Settings(BaseSettings):
USE_OAUTH = False USE_OAUTH = False
SECRET_KEY = "asdasdasd-odjfnsodfnosidnfdf-0oq2j01j0jf0i1ej0fij10fd" SECRET_KEY = "asdasdasd-odjfnsodfnosidnfdf-0oq2j01j0jf0i1ej0fij10fd"
settings = Settings() settings = Settings()