import os
import re

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

words = {}  # co-occurrence graph, stored as {word (str): [related words]}
root_path = '..\\resources\\ACL2020'
invalid_word = stopwords.words('english')
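
# NOTE: NLTK needs its data packages downloaded once before this script runs.
# A minimal setup sketch (these are the standard NLTK resource ids; recent
# NLTK versions may additionally ask for 'punkt_tab'):
#   import nltk
#   nltk.download('punkt')                        # sent_tokenize / word_tokenize
#   nltk.download('stopwords')                    # stopwords.words('english')
#   nltk.download('wordnet')                      # WordNetLemmatizer
#   nltk.download('averaged_perceptron_tagger')   # pos_tag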

# with open(r"D:\小工具程序\pdf2md\output_directory\good_i.mmd", "r", encoding="utf8") as f:
#     lines = []
#     for i in f.readlines():
#         if i.strip():
#             lines.append(i.strip())
#         else:
#             lines.append(" ")
#     print("\n".join(lines))


# Map a Penn Treebank POS tag to the corresponding WordNet POS constant
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
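
# Quick sanity check (tags are standard Penn Treebank tags as returned by pos_tag):
#   get_wordnet_pos('JJ')  -> wordnet.ADJ  ('a')
#   get_wordnet_pos('VBD') -> wordnet.VERB ('v')
#   get_wordnet_pos('CC')  -> None         (the caller falls back to wordnet.NOUN)

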
def add_to_dict(word_list, windows=5):
    valid_word_list = []  # filter the tokens first

    for word in word_list:
        word = str(word).lower()
        if is_valid(word):
            valid_word_list.append(word)

    # build co-occurrence relations over a sliding window
    if len(valid_word_list) < windows:
        win = valid_word_list
        build_words_from_windows(win)
    else:
        index = 0
        while index + windows <= len(valid_word_list):
            win = valid_word_list[index:index + windows]
            index += 1
            build_words_from_windows(win)
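
# Example of the sliding window (size 5, hypothetical already-filtered tokens):
#   ['graph', 'based', 'ranking', 'model', 'text', 'processing']
# yields the windows
#   ['graph', 'based', 'ranking', 'model', 'text']
#   ['based', 'ranking', 'model', 'text', 'processing']

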
# Record the co-occurrence relations from one window into the global words dict
def build_words_from_windows(win):
    for word in win:
        if word not in words:
            words[word] = []
        for other in win:
            if other != word and other not in words[word]:
                words[word].append(other)
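
# Example: the window ['graph', 'based', 'ranking'] produces the undirected edges
#   words['graph']   -> ['based', 'ranking']
#   words['based']   -> ['graph', 'ranking']
#   words['ranking'] -> ['graph', 'based']

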
# Preprocessing filter: words for which this returns False are dropped
def is_valid(word):
    if re.match(r"[()\-:;,.0-9]+", word) or word in invalid_word:
        return False
    elif len(word) < 4:
        return False
    else:
        return True
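
# Examples:
#   is_valid('2020')  -> False  (matches the punctuation/number pattern)
#   is_valid('the')   -> False  (English stopword, and shorter than 4 chars)
#   is_valid('model') -> True

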
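# text_rank runs the TextRank update (Mihalcea & Tarau, 2004) on the undirected
# co-occurrence graph built above. Each word's weight is iterated as
#
#   WS(w) = (1 - d) + d * sum_{u in neighbors(w)} WS(u) / |neighbors(u)|
#
# with damping factor d, until the largest per-word change falls below min_diff.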
def text_rank(d=0.85, max_iter=100):
    min_diff = 0.05
    words_weight = {}  # {word (str): weight (float)}
    for word in words.keys():
        words_weight[word] = 1 / len(words.keys())
    for i in range(max_iter):
        n_words_weight = {}  # {word (str): weight (float)}
        max_diff = 0
        for word in words.keys():
            n_words_weight[word] = 1 - d
            for other in words[word]:
                if other == word or len(words[other]) == 0:
                    continue
                n_words_weight[word] += d * words_weight[other] / len(words[other])
            # track the largest absolute change for the convergence test
            max_diff = max(abs(n_words_weight[word] - words_weight[word]), max_diff)
        words_weight = n_words_weight
        print('iter', i, 'max diff is', max_diff)
        if max_diff < min_diff:
            print('break with iter', i)
            break
    return words_weight


def read(path):
    text = ''
    with open(path, "r", encoding='UTF-8') as f:
        # with open(root_path + "\\" + path, "r", encoding='UTF-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("#", "")  # strip Markdown heading markers
            line = line.strip()
            if line == '':
                continue
            elif line[-1] == '-':
                # re-join a word that was hyphenated across a line break
                text += line[:-1]
            else:
                text += line + ' '

            # stop at the bibliography
            if line == "References":
                print('end read', line, " from ", path)
                break

    # print(text)
    sens = sent_tokenize(text)
    for sentence in sens:
        # print(sentence)
        tokens = word_tokenize(sentence)  # tokenize
        tagged_sent = pos_tag(tokens)  # POS-tag each token

        wnl = WordNetLemmatizer()
        lemmas_sent = []
        for tag in tagged_sent:
            wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))  # lemmatize
        # print(lemmas_sent)
        add_to_dict(lemmas_sent, 5)
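
# Line-joining example (hypothetical PDF-extracted lines): the pair
#   "...a graph-based al-"
#   "gorithm for ranking"
# becomes "...a graph-based algorithm for ranking", so words hyphenated at
# line breaks are restored before sentence tokenization.

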
def start(topN=20):
    # files_name = os.listdir(root_path)
    file_path = r"D:\小工具程序\pdf2md\output_directory\good_i.mmd"
    files_name = [file_path]
    # num = 0
    for file_name in files_name:
        # if file_name.endswith(".txt"):
        #     print(file_name)
        read(file_name)
        # num += 1
        # if num > 2:
        #     break
    words_weight = text_rank()
    tmp = sorted(words_weight.items(), key=lambda x: x[1], reverse=True)
    with open("method3_dict.txt", 'w', encoding="UTF-8") as f:
        for i in range(min(topN, len(tmp))):  # guard against fewer than topN words
            f.write(tmp[i][0] + ' ' + str(tmp[i][1]) + '\n')
            print(tmp[i])
    # print(words_weight)
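
# Running the script writes the topN words and their TextRank weights to
# method3_dict.txt, one "word weight" pair per line. Illustrative output only
# (actual words and weights depend on the input document):
#   model 2.3517
#   graph 1.9042

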
if __name__ == '__main__':
    start()