# keyan/test_textrank_en.py
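"""TextRank keyword extraction for English text, built on NLTK.

Pipeline: read a paper-style text file, split it into sentences, POS-tag and
lemmatize the tokens, link words that co-occur inside a sliding window into a
graph, run the TextRank iteration over that graph, and write the top-N
keywords to method3_dict.txt.
"""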
import os
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
words = {}  # co-occurrence graph: {word (str): list of neighboring words}
root_path = '..\\resources\\ACL2020'
invalid_word = set(stopwords.words('english'))  # set for O(1) membership tests
# with open(r"D:\小工具程序\pdf2md\output_directory\good_i.mmd", "r", encoding="utf8") as f:
# lines = []
# for i in f.readlines():
# if i.strip():
# lines.append(i.strip())
# else:
# lines.append(" ")
# print("\n".join(lines))


# Map a Penn Treebank POS tag to the WordNet constant expected by the lemmatizer.
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
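
# Illustrative example: pos_tag(['running'])[0][1] is 'VBG', which maps to
# wordnet.VERB; tags with no mapping (e.g. 'IN') return None, and callers
# below fall back to wordnet.NOUN.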


# Add one sentence's lemmas to the graph: filter the tokens, then link every
# pair of words that co-occur inside a sliding window of `windows` tokens.
def add_to_dict(word_list, windows=5):
    valid_word_list = []  # filter first
    for word in word_list:
        word = str(word).lower()
        if is_valid(word):
            valid_word_list.append(word)
    # Build relations window by window.
    if len(valid_word_list) < windows:
        win = valid_word_list
        build_words_from_windows(win)
    else:
        index = 0
        while index + windows <= len(valid_word_list):
            win = valid_word_list[index:index + windows]
            index += 1
            build_words_from_windows(win)
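
# Illustrative example: with windows=3, the filtered list
# ['graph', 'rank', 'model', 'text'] produces the windows
# ['graph', 'rank', 'model'] and ['rank', 'model', 'text'].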


# Record the co-occurrence relations from one window into the global `words` dict.
def build_words_from_windows(win):
    for word in win:
        if word not in words:
            words[word] = []
        for other in win:
            if other == word or other in words[word]:
                continue
            words[word].append(other)
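
# Every pair inside a window is linked in both directions, so `words` is an
# undirected graph stored as an adjacency list.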


# Token filter: reject punctuation/number tokens, stopwords, and short words.
def is_valid(word):
    if re.match(r"[()\-:;,.0-9]+", word) or word in invalid_word:
        return False
    elif len(word) < 4:
        return False
    else:
        return True
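
# Illustrative example: is_valid('2020') and is_valid('the') are False
# (number pattern / stopword), while is_valid('rank') is True.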


# TextRank iteration: weight(w) = (1 - d) + d * sum over neighbors v of
# weight(v) / degree(v), repeated until convergence or max_iter rounds.
def text_rank(d=0.85, max_iter=100):
    min_diff = 0.05
    words_weight = {}  # {word (str): weight (float)}
    for word in words:
        words_weight[word] = 1 / len(words)
    for i in range(max_iter):
        n_words_weight = {}
        max_diff = 0
        for word in words:
            n_words_weight[word] = 1 - d
            for other in words[word]:
                if other == word or len(words[other]) == 0:
                    continue
                n_words_weight[word] += d * words_weight[other] / len(words[other])
            # Track the largest absolute change for the convergence test.
            max_diff = max(abs(n_words_weight[word] - words_weight[word]), max_diff)
        words_weight = n_words_weight
        print('iter', i, 'max diff is', max_diff)
        if max_diff < min_diff:
            print('break with iter', i)
            break
    return words_weight
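
# Illustrative sanity check: on a two-word graph where each word links only to
# the other, both degrees are 1 and the fixed point is a weight of 1.0, since
# (1 - 0.85) + 0.85 * 1.0 / 1 == 1.0.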


# Read one paper-style file, re-join hyphenated line breaks, stop at the
# "References" section, then lemmatize every sentence and feed it into the graph.
def read(path):
    text = ''
    with open(path, "r", encoding='UTF-8') as f:
        # with open(root_path + "\\" + path, "r", encoding='UTF-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("#", "")
            line = line.strip()
            if line == '':
                continue
            elif line[-1] == '-':
                text += line[:-1]  # re-join a word hyphenated across lines
            else:
                text += line + ' '  # keep a space so adjacent lines don't merge words
            if line == "References":
                print('end read', line, " from ", path)
                break
    sens = sent_tokenize(text)
    wnl = WordNetLemmatizer()  # construct once instead of per sentence
    for sentence in sens:
        tokens = word_tokenize(sentence)  # tokenize
        tagged_sent = pos_tag(tokens)  # POS-tag each token
        lemmas_sent = []
        for tag in tagged_sent:
            wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))  # lemmatize
        add_to_dict(lemmas_sent, 5)
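
# Note: sent_tokenize/word_tokenize, pos_tag, stopwords, and the lemmatizer
# require the NLTK data packages 'punkt', 'averaged_perceptron_tagger',
# 'stopwords', and 'wordnet' (install via nltk.download('<name>')).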


def start(topN=20):
    # files_name = os.listdir(root_path)  # batch mode: process a whole directory
    file_path = r"D:\小工具程序\pdf2md\output_directory\good_i.mmd"
    files_name = [file_path]
    for file_name in files_name:
        # if file_name.endswith(".txt"):
        read(file_name)
    words_weight = text_rank()
    tmp = sorted(words_weight.items(), key=lambda x: x[1], reverse=True)
    with open("method3_dict.txt", 'w', encoding="UTF-8") as f:
        # min() guards against graphs with fewer than topN words
        for i in range(min(topN, len(tmp))):
            f.write(tmp[i][0] + ' ' + str(tmp[i][1]) + '\n')
            print(tmp[i])


if __name__ == '__main__':
    start()
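
# Output: method3_dict.txt contains one "word weight" pair per line,
# highest-scoring keywords first.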