import numpy as np
import pandas as pd
from pypinyin import pinyin
# Characters to look up. The misspelled name is kept unchanged in case other
# code refers to it.
set_word_lsit = ['乱', '乳', '乾', '了', '予', '争', '事', '二']

# Number of leading entries of set_word_lsit that are skipped.
# NOTE(review): the list above holds only 8 characters, so skipping 29 leaves
# nothing to process and pinyin_df ends up empty -- confirm whether the list
# was shortened or this offset is stale.
_SKIPPED_PREFIX = 29


def build_tongyin_df(char_to_pinyin):
    """Return a DataFrame listing, for every character, its homophones.

    Args:
        char_to_pinyin: Mapping from a character to its pinyin string.

    Returns:
        A DataFrame with one row per key of ``char_to_pinyin``, in the
        mapping's iteration order, and three columns:

        * ``hanzi``   -- the character,
        * ``pinyin``  -- its pinyin as given in the mapping,
        * ``tongyin`` -- the other characters sharing that pinyin, joined
          into one string; a character that shares its pinyin with no other
          character is mapped to itself.
    """
    frame = pd.DataFrame(
        {
            # Materialize the dict views so the columns are plain lists.
            "hanzi": list(char_to_pinyin.keys()),
            "pinyin": list(char_to_pinyin.values()),
        }
    )
    frame["tongyin"] = ""

    for _, group in frame.groupby("pinyin"):
        members = group["hanzi"].tolist()
        if len(members) > 1:
            # Keys of a mapping are unique, so comparing by value removes
            # exactly the character itself from its own homophone string.
            homophones = [
                "".join(other for other in members if other != char)
                for char in members
            ]
        else:
            homophones = members
        # Write through .loc on the frame itself; assigning via
        # frame["tongyin"][mask] is chained indexing and may modify a
        # temporary copy instead of the frame.
        frame.loc[group.index, "tongyin"] = homophones

    return frame


# Convert to pinyin: keep the first reading pypinyin returns for each character.
word_to_pinyin = {
    char: pinyin(char)[0][0] for char in set_word_lsit[_SKIPPED_PREFIX:]
}

# Collect the homophones of every character.
pinyin_df = build_tongyin_df(word_to_pinyin)
# 得到同音df