# keyan/test_textrank_en.py
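"""TextRank keyword extraction for English text, built on NLTK.

Pipeline: read a paper-style text file, split it into sentences, POS-tag and
lemmatize the tokens, link words that co-occur inside a sliding window into a
graph, run the TextRank iteration over that graph, and write the top-N
keywords to method3_dict.txt.
"""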
import os
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
words = {}  # co-occurrence graph: {word (str): list of neighboring words}
root_path = '..\\resources\\ACL2020'
invalid_word = set(stopwords.words('english'))  # set for O(1) membership tests
# with open(r"D:\小工具程序\pdf2md\output_directory\good_i.mmd", "r", encoding="utf8") as f:
# lines = []
# for i in f.readlines():
# if i.strip():
# lines.append(i.strip())
# else:
# lines.append(" ")
# print("\n".join(lines))


# Map a Penn Treebank POS tag to the WordNet constant expected by the lemmatizer.
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
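
# Illustrative example: pos_tag(['running'])[0][1] is 'VBG', which maps to
# wordnet.VERB; tags with no mapping (e.g. 'IN') return None, and callers
# below fall back to wordnet.NOUN.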


# Add one sentence's lemmas to the graph: filter the tokens, then link every
# pair of words that co-occur inside a sliding window of `windows` tokens.
def add_to_dict(word_list, windows=5):
    valid_word_list = []  # filter first
    for word in word_list:
        word = str(word).lower()
        if is_valid(word):
            valid_word_list.append(word)
    # Build relations window by window.
    if len(valid_word_list) < windows:
        win = valid_word_list
        build_words_from_windows(win)
    else:
        index = 0
        while index + windows <= len(valid_word_list):
            win = valid_word_list[index:index + windows]
            index += 1
            build_words_from_windows(win)
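
# Illustrative example: with windows=3, the filtered list
# ['graph', 'rank', 'model', 'text'] produces the windows
# ['graph', 'rank', 'model'] and ['rank', 'model', 'text'].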


# Record the co-occurrence relations from one window into the global `words` dict.
def build_words_from_windows(win):
    for word in win:
        if word not in words:
            words[word] = []
        for other in win:
            if other == word or other in words[word]:
                continue
            words[word].append(other)
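
# Every pair inside a window is linked in both directions, so `words` is an
# undirected graph stored as an adjacency list.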


# Token filter: reject punctuation/number tokens, stopwords, and short words.
def is_valid(word):
    if re.match(r"[()\-:;,.0-9]+", word) or word in invalid_word:
        return False
    elif len(word) < 4:
        return False
    else:
        return True
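
# Illustrative example: is_valid('2020') and is_valid('the') are False
# (number pattern / stopword), while is_valid('rank') is True.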


# TextRank iteration: weight(w) = (1 - d) + d * sum over neighbors v of
# weight(v) / degree(v), repeated until convergence or max_iter rounds.
def text_rank(d=0.85, max_iter=100):
    min_diff = 0.05
    words_weight = {}  # {word (str): weight (float)}
    for word in words:
        words_weight[word] = 1 / len(words)
    for i in range(max_iter):
        n_words_weight = {}
        max_diff = 0
        for word in words:
            n_words_weight[word] = 1 - d
            for other in words[word]:
                if other == word or len(words[other]) == 0:
                    continue
                n_words_weight[word] += d * words_weight[other] / len(words[other])
            # Track the largest absolute change for the convergence test.
            max_diff = max(abs(n_words_weight[word] - words_weight[word]), max_diff)
        words_weight = n_words_weight
        print('iter', i, 'max diff is', max_diff)
        if max_diff < min_diff:
            print('break with iter', i)
            break
    return words_weight
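
# Illustrative sanity check: on a two-word graph where each word links only to
# the other, both degrees are 1 and the fixed point is a weight of 1.0, since
# (1 - 0.85) + 0.85 * 1.0 / 1 == 1.0.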


# Read one paper-style file, re-join hyphenated line breaks, stop at the
# "References" section, then lemmatize every sentence and feed it into the graph.
def read(path):
    text = ''
    with open(path, "r", encoding='UTF-8') as f:
        # with open(root_path + "\\" + path, "r", encoding='UTF-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("#", "")
            line = line.strip()
            if line == '':
                continue
            elif line[-1] == '-':
                text += line[:-1]  # re-join a word hyphenated across lines
            else:
                text += line + ' '  # keep a space so adjacent lines don't merge words
            if line == "References":
                print('end read', line, " from ", path)
                break
    sens = sent_tokenize(text)
    wnl = WordNetLemmatizer()  # construct once instead of per sentence
    for sentence in sens:
        tokens = word_tokenize(sentence)  # tokenize
        tagged_sent = pos_tag(tokens)  # POS-tag each token
        lemmas_sent = []
        for tag in tagged_sent:
            wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))  # lemmatize
        add_to_dict(lemmas_sent, 5)
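
# Note: sent_tokenize/word_tokenize, pos_tag, stopwords, and the lemmatizer
# require the NLTK data packages 'punkt', 'averaged_perceptron_tagger',
# 'stopwords', and 'wordnet' (install via nltk.download('<name>')).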


def start(topN=20):
    # files_name = os.listdir(root_path)  # batch mode: process a whole directory
    file_path = r"D:\小工具程序\pdf2md\output_directory\good_i.mmd"
    files_name = [file_path]
    for file_name in files_name:
        # if file_name.endswith(".txt"):
        read(file_name)
    words_weight = text_rank()
    tmp = sorted(words_weight.items(), key=lambda x: x[1], reverse=True)
    with open("method3_dict.txt", 'w', encoding="UTF-8") as f:
        # min() guards against graphs with fewer than topN words
        for i in range(min(topN, len(tmp))):
            f.write(tmp[i][0] + ' ' + str(tmp[i][1]) + '\n')
            print(tmp[i])


if __name__ == '__main__':
    start()
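
# Output: method3_dict.txt contains one "word weight" pair per line,
# highest-scoring keywords first.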