import undetected_chromedriver as uc
import time
import random
import json
import matplotlib.pyplot as plt  # plotting (kept for downstream/interactive use)
import jieba  # Chinese word segmentation
import wordcloud  # word-cloud rendering
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import numpy as np
from PIL import Image
from bs4 import BeautifulSoup
from lxml import etree


def get_items(driver, links):
    """Scrape metadata for every paper URL in ``links``.

    Args:
        driver: an active (undetected-)chromedriver instance.
        links: iterable of CNKI paper detail-page URLs.

    Returns:
        list[dict]: one dict per paper, as produced by ``get_item``.
    """
    return [get_item(driver, link) for link in links]


def get_item(driver, link):
    """Scrape title, authors, affiliations and abstract from one paper page.

    Navigates ``driver`` to ``link`` and sleeps a randomized delay so the
    page can finish loading (and the crawl looks less bot-like).

    Returns:
        dict with keys ``name``, ``authors``, ``affiliations``, ``abstract``.
    """
    item = {}
    driver.get(link)
    time.sleep(3 + 3 * random.random())  # wait for the detail page to render

    # Title
    item["name"] = driver.find_element(by="xpath", value="//h1").text

    # Authors
    authors_area = driver.find_element(by="id", value="authorpart")
    item["authors"] = [
        a.text for a in authors_area.find_elements(by="xpath", value="span/a")
    ]

    # Affiliations
    item["affiliations"] = [
        a.text
        for a in driver.find_elements(by="xpath", value='//a[@class="author"]')
    ]

    # Abstract: if a "more" expander exists, click it first so the full
    # abstract text is present in the DOM.
    try:
        driver.find_element(by="id", value="ChDivSummaryMore").click()
        time.sleep(1 + 1 * random.random())
    except Exception:
        pass  # no "more" button — abstract is already fully shown
    item["abstract"] = driver.find_element(by="id", value="ChDivSummary").text
    return item


def get_links(driver):
    """Collect paper detail-page URLs from the current result page (selenium).

    NOTE(review): kept for parity with ``get_links_etree``; the original
    author reported intermittent failures with this selenium-based variant.
    """
    result_area = driver.find_element(by="id", value="ModuleSearchResult")
    rows = result_area.find_elements(by="xpath", value="//tbody/tr")
    return [
        row.find_element(by="xpath", value='td[@class="name"]/a').get_attribute("href")
        for row in rows
    ]


def get_links_etree(driver):
    """Collect paper URLs from the current result page by parsing page_source."""
    dom = etree.HTML(driver.page_source)
    return dom.xpath('//table[@class="result-table-list"]//td[@class="name"]/a/@href')


def get_news(total_num, keyword):
    """Search CNKI for ``keyword`` and scrape up to ``total_num`` papers.

    Pages through the result list, collecting detail-page links until
    ``total_num`` links are gathered or no next page exists, then scrapes
    each paper. Results are also dumped to ``result.json``.

    Args:
        total_num: maximum number of papers to scrape.
        keyword: search query string.

    Returns:
        list[dict]: scraped paper records.
    """
    driver = uc.Chrome()
    try:
        driver.get("https://www.cnki.net/")
        time.sleep(3 + 2 * random.random())  # wait for the home page

        # Type the query and submit the search form.
        driver.find_element(by="id", value="txt_SearchText").send_keys(keyword)
        time.sleep(1 + 1 * random.random())
        driver.find_element(by="xpath", value='//input[@class="search-btn"]').click()
        time.sleep(5 + 3 * random.random())  # wait for the result list

        # Page through results until enough links are gathered.
        links = []
        while True:
            links.extend(get_links_etree(driver))
            if len(links) >= total_num:
                break  # collected enough — stop paging
            try:
                next_btn = driver.find_element(
                    by="xpath", value='//a[contains(text(), "下一页")]'
                )
                next_btn.click()
                time.sleep(2 + 2 * random.random())
            except Exception as e:
                # No next page: return whatever has been collected so far.
                print("没有下一页,返回当前的采集的所有结果", e)
                break
        links = links[:total_num]

        results = get_items(driver, links)
    finally:
        # quit() (not close()) ends the whole browser session, so the
        # chromedriver process is released even when scraping fails.
        driver.quit()

    # ensure_ascii=False keeps Chinese text human-readable in the dump
    # (the original json.dumps default escaped every non-ASCII character).
    with open("result.json", "w", encoding="utf8") as f:
        json.dump(results, f, ensure_ascii=False)
    return results


def get_clouds(word_list):
    """Build a word-cloud image from a list of phrases.

    Args:
        word_list: list of strings (e.g. abstracts) to be segmented by jieba.

    Returns:
        numpy.ndarray: the rendered word cloud as an RGB image array.
    """
    text = ",".join(word_list)
    space_list = " ".join(jieba.lcut(text))  # WordCloud expects space-separated tokens

    # BUG FIX: the original passed ``stopwords=STOPWORDS.update((...))``,
    # but set.update() returns None — so WordCloud received stopwords=None
    # (and the global STOPWORDS was mutated as a side effect). Build the
    # stop-word set explicitly, without touching the shared global.
    stop_words = set(STOPWORDS) | {"老年人", "的", "中", "in", "of", "for"}

    wc = WordCloud(
        width=400,
        height=300,
        background_color="white",
        mode="RGB",
        # mask=...  # optionally shape the cloud with an image mask
        max_words=200,
        stopwords=stop_words,
        font_path=r"C:\Windows\Fonts\STZHONGS.ttf",  # raw string: no escape pitfalls
        max_font_size=100,
        relative_scaling=0.6,  # how strongly frequency drives font size
        random_state=50,  # fixed seed for reproducible layout
        scale=2,
    ).generate(space_list)
    return wc.to_array()