158 lines
5.6 KiB
Python
158 lines
5.6 KiB
Python
|
import undetected_chromedriver as uc
import time
import random
import json
import matplotlib.pyplot as plt  # data visualization
import jieba  # Chinese word segmentation
import wordcloud  # word-cloud rendering
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS  # word cloud, color generator, stop words
import numpy as np  # scientific computing
from PIL import Image  # image handling
from bs4 import BeautifulSoup
from lxml import etree
|
|||
|
|
|||
|
|
|||
|
# def get_current_page_result(driver):
|
|||
|
# """ 采集一页里的所有item """
|
|||
|
# result_area = driver.find_element(by="id", value="ModuleSearchResult")
|
|||
|
# current_page_results = result_area.find_elements(by="xpath", value='//tbody/tr')
|
|||
|
#
|
|||
|
# names = [r.find_element(by="xpath", value='td[@class="name"]') for r in current_page_results]
|
|||
|
# links = [r.find_element(by="xpath", value='td[@class="name"]/a').get_attribute("href") for r in current_page_results]
|
|||
|
#
|
|||
|
# items = get_items(driver, links)
|
|||
|
# return items
|
|||
|
|
|||
|
|
|||
|
def get_items(driver, links):
    """Visit each detail link and scrape its article metadata.

    Args:
        driver: Selenium-compatible WebDriver used for navigation.
        links: iterable of detail-page URLs.

    Returns:
        list of dicts (see get_item), in the same order as `links`.
    """
    # The original used `enumerate` but never read the index; a plain
    # loop over the links is the idiomatic form.
    items = []
    for link in links:
        items.append(get_item(driver, link))
    return items
|
|||
|
|
|||
|
|
|||
|
def get_item(driver, link):
    """Scrape a single CNKI article detail page.

    Args:
        driver: Selenium-compatible WebDriver.
        link: URL of the article detail page.

    Returns:
        dict with keys "name", "authors", "affiliations", "abstract".
    """
    item = {}
    driver.get(link)  # open the article detail page
    time.sleep(3 + 3 * random.random())  # wait for the page to finish loading

    # Title
    h1 = driver.find_element(by="xpath", value="//h1")
    item["name"] = h1.text

    # Authors
    authors_area = driver.find_element(by="id", value="authorpart")
    authors = [a.text for a in authors_area.find_elements(by="xpath", value="span/a")]
    item["authors"] = authors

    # Affiliations
    affiliations_area = driver.find_elements(by="xpath", value='//a[@class="author"]')
    affiliations = [affiliation.text for affiliation in affiliations_area]
    item["affiliations"] = affiliations

    # Abstract: if a "more" button is present, click it first so the full
    # abstract text is rendered before reading it.
    try:
        more_bn = driver.find_element(by="id", value="ChDivSummaryMore")
        more_bn.click()
        time.sleep(1 + 1 * random.random())  # wait for expansion
    except Exception:
        # Button absent on short abstracts — expected, not an error.
        # (Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit.)
        pass

    abstract_area = driver.find_element(by="id", value="ChDivSummary")
    item["abstract"] = abstract_area.text

    return item
|
|||
|
|
|||
|
|
|||
|
def get_links(driver):
    """Collect article detail-page URLs from the current result page.

    Returns:
        list of href strings, one per result row.
    """
    result_area = driver.find_element(by="id", value="ModuleSearchResult")
    # BUG FIX: an XPath starting with '//' ignores the element context and
    # searches the whole document, so rows outside the result table could be
    # matched (the likely cause of the original "always errors" note).
    # Prefixing with '.' keeps the search inside result_area.
    rows = result_area.find_elements(by="xpath", value='.//tbody/tr')
    return [
        row.find_element(by="xpath", value='td[@class="name"]/a').get_attribute("href")
        for row in rows
    ]
|
|||
|
# [name_element.find_element(by="xpath", value="a").get_attribute("href") for name_element in names]
|
|||
|
# [name_element.find_element(by="xpath", value="a").text for name_element in names]
|
|||
|
|
|||
|
|
|||
|
def get_links_etree(driver):
    """Extract result links by parsing the rendered page source with lxml.

    Parses driver.page_source once instead of issuing per-element Selenium
    lookups, then pulls every result-row link in a single XPath query.

    Returns:
        list of href strings from the result table.
    """
    document = etree.HTML(driver.page_source)
    hrefs = document.xpath('//table[@class="result-table-list"]//td[@class="name"]/a/@href')
    return hrefs
|
|||
|
|
|||
|
|
|||
|
def get_news(total_num, keyword):
    """Search CNKI for `keyword` and scrape up to `total_num` article records.

    Opens an undetected-chromedriver Chrome session, runs the search, pages
    through the result list collecting detail links, scrapes each detail
    page, writes everything to result.json, and returns the records.

    Args:
        total_num: maximum number of articles to collect.
        keyword: search term typed into the CNKI search box.

    Returns:
        list of dicts (see get_item) with at most total_num entries.
    """
    driver = uc.Chrome()
    try:
        driver.get('https://www.cnki.net/')
        time.sleep(3 + 2 * random.random())  # wait for the home page to load

        # Type the keyword and trigger the search.
        input_button = driver.find_element(by="id", value="txt_SearchText")
        input_button.send_keys(keyword)
        time.sleep(1 + 1 * random.random())

        search_bn = driver.find_element(by="xpath", value='//input[@class="search-btn"]')
        search_bn.click()
        time.sleep(5 + 3 * random.random())  # wait for the result list

        # Collect detail links, paging until we have enough or run out.
        links = []
        stop_flag = False
        while not stop_flag:
            link_current_page = get_links_etree(driver)
            links.extend(link_current_page)

            if len(links) < total_num:
                # Go to the next result page.
                try:
                    next_page_btn = driver.find_element(
                        by="xpath", value='//a[contains(text(), "下一页")]')
                    next_page_btn.click()
                    time.sleep(2 + 2 * random.random())  # wait for the page
                except Exception as e:
                    # No "next page" button left: keep what we collected.
                    print("没有下一页,返回当前的采集的所有结果", e)
                    stop_flag = True
                    total_num = len(links)
            else:
                # Collected at least total_num links; stop paging.
                stop_flag = True

        links = links[:total_num]

        results = get_items(driver, links)

        # json.dump streams directly to the file; ensure_ascii=False keeps
        # the Chinese text human-readable instead of \uXXXX escapes.
        with open("result.json", "w", encoding="utf8") as f:
            json.dump(results, f, ensure_ascii=False)
    finally:
        # BUG FIX: close() only closes the window and leaks the chromedriver
        # session/process; quit() ends the whole session. The try/finally
        # guarantees cleanup even when scraping raises mid-way.
        driver.quit()

    return results
|
|||
|
|
|||
|
|
|||
|
def get_clouds(word_list):
    """Render a word-cloud image from a list of (Chinese) phrases.

    Args:
        word_list: list of strings to tokenize and visualize.

    Returns:
        numpy array (H x W x 3, RGB) of the rendered word cloud.
    """
    text = ",".join(word_list)
    wordlist = jieba.lcut(text)        # segment Chinese text into words
    space_list = ' '.join(wordlist)    # WordCloud expects space-separated tokens
    # backgroud = np.array(Image.open('test1.jpg'))

    # BUG FIX: the original passed `stopwords=STOPWORDS.update(...)`, but
    # set.update() returns None, so WordCloud fell back to its defaults and
    # the custom stop words were silently ignored (and the shared STOPWORDS
    # global was mutated as a side effect). Build a fresh set instead.
    stop_words = set(STOPWORDS) | {'老年人', "的", "中", 'in', 'of', 'for'}

    wc = WordCloud(width=400, height=300,
                   background_color='white',
                   mode='RGB',
                   # mask=backgroud,  # optional shape mask; colors can be sampled from it
                   max_words=200,
                   stopwords=stop_words,  # built-in stop words plus our additions
                   # raw string avoids accidental backslash escapes in the path
                   font_path=r'C:\Windows\Fonts\STZHONGS.ttf',
                   max_font_size=100,
                   relative_scaling=0.6,  # how strongly word frequency drives font size
                   random_state=50,       # fixed seed for reproducible layout
                   scale=2
                   ).generate(space_list)

    # image_color = ImageColorGenerator(backgroud)  # recolor from the mask image
    # wc.recolor(color_func=image_color)

    return wc.to_array()
|