keyan/utils.py

import undetected_chromedriver as uc
import time
import random
import json
import matplotlib.pyplot as plt  # data visualization
import jieba  # Chinese word segmentation
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS  # word cloud, color generator, stop words
import numpy as np  # scientific computing
from PIL import Image  # image handling
from bs4 import BeautifulSoup
from lxml import etree


# def get_current_page_result(driver):
#     """Collect every item on the current results page."""
#     result_area = driver.find_element(by="id", value="ModuleSearchResult")
#     current_page_results = result_area.find_elements(by="xpath", value='//tbody/tr')
#
#     names = [r.find_element(by="xpath", value='td[@class="name"]') for r in current_page_results]
#     links = [r.find_element(by="xpath", value='td[@class="name"]/a').get_attribute("href") for r in current_page_results]
#
#     items = get_items(driver, links)
#     return items


def get_items(driver, links):
    """Fetch and parse the detail page behind each link."""
    items = []
    for link in links:
        items.append(get_item(driver, link))
    return items


def get_item(driver, link):
    """Open one paper's detail page and scrape title, authors, affiliations, and abstract."""
    item = {}
    driver.get(link)  # open the paper's detail page
    time.sleep(3 + 3 * random.random())  # wait for the page to load
    # Title
    h1 = driver.find_element(by="xpath", value="//h1")
    item["name"] = h1.text
    # Authors
    authors_area = driver.find_element(by="id", value="authorpart")
    authors = [a.text for a in authors_area.find_elements(by="xpath", value="span/a")]  # .get_attribute("innerHTML")
    item["authors"] = authors
    # Affiliations
    affiliations_area = driver.find_elements(by="xpath", value='//a[@class="author"]')
    affiliations = [affiliation.text for affiliation in affiliations_area]
    item["affiliations"] = affiliations
    # Abstract: if a "more" button is present, click it first to expand the full text
    try:
        more_bn = driver.find_element(by="id", value="ChDivSummaryMore")
        more_bn.click()
        time.sleep(1 + 1 * random.random())  # wait for the expanded text to load
    except Exception:
        more_bn = None
    abstract_area = driver.find_element(by="id", value="ChDivSummary")
    item["abstract"] = abstract_area.text
    return item


def get_links(driver):
    """Collect detail-page links from the current results page via Selenium lookups."""
    result_area = driver.find_element(by="id", value="ModuleSearchResult")
    current_page_results = result_area.find_elements(by="xpath", value='//tbody/tr')
    # names = [r.find_element(by="xpath", value='td[@class="name"]') for r in current_page_results]
    # This lookup keeps raising errors; cause unknown (possibly stale element references).
    links = [r.find_element(by="xpath", value='td[@class="name"]/a').get_attribute("href") for r in current_page_results]
    return links
    # [name_element.find_element(by="xpath", value="a").get_attribute("href") for name_element in names]
    # [name_element.find_element(by="xpath", value="a").text for name_element in names]


def get_links_etree(driver):
    """Collect detail-page links by parsing page_source with lxml instead of Selenium lookups."""
    dom = etree.HTML(driver.page_source)
    links = dom.xpath('//table[@class="result-table-list"]//td[@class="name"]/a/@href')
    return links


def get_news(total_num, keyword):
    """Search CNKI for `keyword` and scrape up to `total_num` results into result.json."""
    driver = uc.Chrome()
    driver.get('https://www.cnki.net/')
    time.sleep(3 + 2 * random.random())  # wait for the page to load
    # Search
    input_button = driver.find_element(by="id", value="txt_SearchText")
    input_button.send_keys(keyword)
    time.sleep(1 + 1 * random.random())  # wait for the input to settle
    search_bn = driver.find_element(by="xpath", value='//input[@class="search-btn"]')
    search_bn.click()
    time.sleep(5 + 3 * random.random())  # wait for the results to load
    # Collect result links, paging forward until we have enough
    links = []
    stop_flag = False
    while not stop_flag:
        link_current_page = get_links_etree(driver)
        links.extend(link_current_page)
        if len(links) < total_num:
            # Go to the next page
            try:
                next_page_btn = driver.find_element(by="xpath", value='//a[contains(text(), "下一页")]')
                next_page_btn.click()
                time.sleep(2 + 2 * random.random())  # wait for the page to load
                # driver.refresh()
                # time.sleep(2 + 2 * random.random())
            except Exception as e:
                print("No next page; returning all results collected so far.", e)
                stop_flag = True
                total_num = len(links)
        else:
            # Stop once we have at least as many links as requested
            stop_flag = True
    links = links[:total_num]
    results = get_items(driver, links)
    with open("result.json", "w", encoding="utf8") as f:
        json.dump(results, f, ensure_ascii=False)  # ensure_ascii=False keeps Chinese text readable in the file
    driver.quit()  # shut the browser down completely (close() only closes the window)
    return results


def get_clouds(word_list):
    """Segment the input texts with jieba and render a word cloud as an image array."""
    text = "".join(word_list)
    wordlist = jieba.lcut(text)  # segment into words
    space_list = ' '.join(wordlist)  # join words with spaces, as WordCloud expects
    # backgroud = np.array(Image.open('test1.jpg'))
    # Note: set.update() returns None, so the original `stopwords=STOPWORDS.update((...))`
    # silently disabled the stop list; union() returns the combined set instead.
    stopwords = STOPWORDS.union(('老年人', "", "", 'in', 'of', 'for'))  # built-in stop words plus custom additions
    wc = WordCloud(width=400, height=300,
                   background_color='white',
                   mode='RGB',
                   # mask=backgroud,  # optional mask: shapes the cloud and lets colors be sampled from the image
                   max_words=200,
                   stopwords=stopwords,
                   font_path=r'C:\Windows\Fonts\STZHONGS.ttf',  # raw string so backslashes are not escape sequences
                   max_font_size=100,
                   relative_scaling=0.6,  # how strongly font size tracks word frequency
                   random_state=50,
                   scale=2
                   ).generate(space_list)
    # image_color = ImageColorGenerator(backgroud)  # sample cloud colors from the mask image
    # wc.recolor(color_func=image_color)
    return wc.to_array()
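

# A minimal usage sketch, assuming only the helpers above: scrape a handful of
# CNKI results for a keyword, then render a word cloud from their abstracts.
# The keyword, result count, and output filename are illustrative assumptions,
# not values taken from the original module.
if __name__ == "__main__":
    results = get_news(total_num=20, keyword="深度学习")  # hypothetical query
    abstracts = [r["abstract"] for r in results]
    cloud = get_clouds(abstracts)
    plt.imshow(cloud)
    plt.axis("off")
    plt.savefig("wordcloud.png", dpi=200, bbox_inches="tight")  # hypothetical output path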