keyan/test_textrank_en.py

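"""TextRank keyword extraction for English text.

Reads a converted paper (text/markdown), sentence-splits, POS-tags and
lemmatizes it with NLTK, builds a word co-occurrence graph over a sliding
window, runs the TextRank iteration, and writes the top-N weighted words
to method3_dict.txt.
"""
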
import os
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
words = {}  # maps each word (str) to the list of words co-occurring with it
root_path = '..\\resources\\ACL2020'
invalid_word = stopwords.words('english')
# with open(r"D:\小工具程序\pdf2md\output_directory\good_i.mmd", "r", encoding="utf8") as f:
#     lines = []
#     for i in f.readlines():
#         if i.strip():
#             lines.append(i.strip())
#         else:
#             lines.append(" ")
#     print("\n".join(lines))

# Map a Penn Treebank POS tag to the corresponding WordNet POS constant.
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
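
# e.g. get_wordnet_pos('VBD') -> wordnet.VERB, get_wordnet_pos('NNS') ->
# wordnet.NOUN, and get_wordnet_pos('CC') -> None (read() falls back to NOUN).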

# Add one sentence's lemmas to the co-occurrence dict via a sliding window.
def add_to_dict(word_list, windows=5):
    valid_word_list = []  # filter the tokens first
    for word in word_list:
        word = str(word).lower()
        if is_valid(word):
            valid_word_list.append(word)
    # build co-occurrence relations within each sliding window
    if len(valid_word_list) < windows:
        win = valid_word_list
        build_words_from_windows(win)
    else:
        index = 0
        while index + windows <= len(valid_word_list):
            win = valid_word_list[index:index + windows]
            index += 1
            build_words_from_windows(win)
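
# e.g. with windows=3, ['graph', 'rank', 'node', 'edge'] produces the windows
# ['graph', 'rank', 'node'] and ['rank', 'node', 'edge'].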

# Register every pair of distinct words inside one window as neighbours in `words`.
def build_words_from_windows(win):
    for word in win:
        if word not in words:
            words[word] = []
        for other in win:
            if other != word and other not in words[word]:
                words[word].append(other)
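
# e.g. starting from an empty `words`, build_words_from_windows(['graph', 'rank'])
# leaves words == {'graph': ['rank'], 'rank': ['graph']}: edges are undirected
# and deduplicated.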

# Preprocessing filter: a word is dropped when this returns False.
def is_valid(word):
    if re.match(r"[()\-:;,.0-9]+", word) or word in invalid_word:
        return False  # punctuation/digits or an English stopword
    elif len(word) < 4:
        return False  # too short to be an informative keyword
    else:
        return True
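
# e.g. is_valid('the') -> False (stopword), is_valid('(12)') -> False
# (punctuation/digits), is_valid('run') -> False (shorter than 4 chars),
# is_valid('graph') -> True.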

def text_rank(d=0.85, max_iter=100):
    min_diff = 0.05  # convergence threshold
    words_weight = {}  # {word: weight}
    for word in words:
        words_weight[word] = 1 / len(words)
    for i in range(max_iter):
        n_words_weight = {}  # weights for the next iteration
        max_diff = 0
        for word in words:
            n_words_weight[word] = 1 - d
            for other in words[word]:
                if other == word or len(words[other]) == 0:
                    continue
                n_words_weight[word] += d * words_weight[other] / len(words[other])
            # track the largest absolute change for the convergence check
            max_diff = max(abs(n_words_weight[word] - words_weight[word]), max_diff)
        words_weight = n_words_weight
        print('iter', i, 'max diff is', max_diff)
        if max_diff < min_diff:
            print('break with iter', i)
            break
    return words_weight
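
# The loop above is the standard TextRank update on an undirected graph:
#     WS(v_i) = (1 - d) + d * sum_{v_j in adj(v_i)} WS(v_j) / |adj(v_j)|
# A toy check: with words = {'graph': ['rank'], 'rank': ['graph']} both
# weights iterate towards the fixed point w = (1 - d) + d * w, i.e. 1.0.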

def read(path):
    text = ''
    # with open(root_path + "\\" + path, "r", encoding='UTF-8') as f:
    with open(path, "r", encoding='UTF-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.replace("#", "")  # strip markdown heading marks
        line = line.strip()
        if line == '':
            continue
        elif line[-1] == '-':
            text += line[:-1]  # re-join a word hyphenated across a line break
        else:
            text += line + ' '  # keep a space between joined lines
        if line == "References":
            # stop before the bibliography
            print('end read', line, " from ", path)
            break
    # print(text)
    sens = sent_tokenize(text)
    wnl = WordNetLemmatizer()
    for sentence in sens:
        # print(sentence)
        tokens = word_tokenize(sentence)  # tokenize the sentence
        tagged_sent = pos_tag(tokens)  # POS-tag each token
        lemmas_sent = []
        for tag in tagged_sent:
            wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))  # lemmatize
        # print(lemmas_sent)
        add_to_dict(lemmas_sent, 5)
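
# A minimal sketch of the per-sentence pipeline above (exact tags depend on
# the NLTK tagger; 'VBD' and 'NNS' are typical here):
#   tokens = word_tokenize("TextRank ranked the graphs.")
#   pos_tag(tokens)   # -> [('TextRank', 'NNP'), ('ranked', 'VBD'), ...]
#   WordNetLemmatizer().lemmatize('ranked', pos=wordnet.VERB)   # -> 'rank'
#   WordNetLemmatizer().lemmatize('graphs', pos=wordnet.NOUN)   # -> 'graph'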

def start(topN=20):
    # files_name = os.listdir(root_path)
    file_path = r"D:\小工具程序\pdf2md\output_directory\good_i.mmd"
    files_name = [file_path]
    # num = 0
    for file_name in files_name:
        # if file_name.endswith(".txt"):
        #     print(file_name)
        read(file_name)
        # num += 1
        # if num > 2:
        #     break
    words_weight = text_rank()
    tmp = sorted(words_weight.items(), key=lambda x: x[1], reverse=True)
    with open("method3_dict.txt", 'w', encoding="UTF-8") as f:
        for i in range(min(topN, len(tmp))):  # never index past the vocabulary
            f.write(tmp[i][0] + ' ' + str(tmp[i][1]) + '\n')
            print(tmp[i])
    # print(words_weight)

if __name__ == '__main__':
    start()
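
# Running this script needs the NLTK data packages used above, e.g.:
#   import nltk
#   nltk.download('punkt')                       # sent_tokenize / word_tokenize
#   nltk.download('averaged_perceptron_tagger')  # pos_tag
#   nltk.download('stopwords')                   # stopwords.words('english')
#   nltk.download('wordnet')                     # WordNetLemmatizer / wordnet
# The output file method3_dict.txt holds one "word weight" pair per line.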