# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.action_chains import ActionChains
import time as t
import re
from bs4 import BeautifulSoup
import xlrd
import xlwt
import os
import undetected_chromedriver as uc

# Open the browser and go to CNKI first
driver = uc.Chrome()
# driver.minimize_window()  # minimize the browser window, show only the console
driver.get('https://www.cnki.net/')

keywords = ["对抗攻击"]

# Select the <li> that holds "关键词" (keyword search mode)
# //a[text()='关键词']/..
# a = driver.find_element(by="xpath", value="//a[text()='关键词']/..")
# driver.execute_script("arguments[0].className = 'cur';", a)

# Locate the search input box and run a keyword search
input_button = driver.find_element(by="id", value="txt_SearchText")
input_button.send_keys("对抗攻击")
search_bn = driver.find_element(by="xpath", value='//input[@class="search-btn"]')
search_bn.click()

# Collect the result rows on the current page; the leading "." makes the XPath
# relative to each row instead of searching the whole document
result_area = driver.find_element(by="id", value="ModuleSearchResult")
current_page_results = result_area.find_elements(by="xpath", value='//*[@id="ModuleSearchResult"]//tbody/tr')
names = [r.find_element(by="xpath", value='.//td[@class="name"]') for r in current_page_results]
links = [r.find_element(by="xpath", value='.//td[@class="name"]/a').get_attribute("href") for r in current_page_results]

driver.get(links[0])  # open the first paper's detail page

# Next page: //a[contains(text(), "下一页")]
next_page_btn = driver.find_element(by="xpath", value='//a[contains(text(), "下一页")]')
next_page_btn.click()


def cut(lst, n):
    """Split a list into chunks of n items."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def clear(old_list, new_list):
    """Extract plain text from a list of elements."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list


def clear_jou(old_list, new_list):
    """Extract plain text for journal names."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list


def clear_ab(old_list, new_list):
    """Extract plain text for abstracts."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', '')
        n = n.replace('摘要:', '')
        n = n.replace(' ', '')
        new_list.append(n)
    return new_list


def clear_c(old_list, new_list):
    """Extract plain text for citation counts."""
    for i in old_list:
        n = str(i)
        n = n.replace('\n', '')
        new_list.append(n)  # append the cleaned string, not the raw element
    return new_list


def clear_d(old_list, new_list):
    """Extract plain text for download counts."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        n = int(n)
        new_list.append(n)
    return new_list


def extract(inpath):
    """Read the fund numbers from the Excel list."""
    data = xlrd.open_workbook(inpath, encoding_override='utf-8')
    table = data.sheets()[0]   # first sheet
    nrows = table.nrows        # number of rows
    ncols = table.ncols        # number of columns
    numbers = []
    for i in range(1, nrows):  # row 0 is the header
        alldata = table.row_values(i)   # one full row of the sheet
        result = alldata[4]             # the fifth column holds the fund number
        numbers.append(result)
    return numbers


def save_afile(alls, keywords, file):
    """Save one fund's paper data to an Excel file and its keywords to a txt file."""
    os.chdir(r'F:\图情社科基金项目数据爬取\论文信息')  # switch to the output folder
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, '题目')
    sheet1.write(0, 1, '发表期刊')
    sheet1.write(0, 2, '出版时间')
    sheet1.write(0, 3, '摘要')
    i = 1
    for page_rows in alls:              # every result page
        for data in page_rows:          # every row on the page
            for j in range(len(data)):  # every cell in the row
                sheet1.write(i, j, data[j])
            i = i + 1                   # move down one row
    f.save(file + '.xls')
    # save the keywords as a txt file
    with open(file + '.txt', 'w', encoding='utf-8') as txt:
        for key in keywords:
            txt.write(str(key))
            txt.write('\n')


def get_html(number, count_number):
    """Drive the browser and return the current page source.

    `number` is the fund number to search for; `count_number` tells whether this
    is the first search (0) or a follow-up search from the result page.
    """
    s_2 = '/html/body/div[4]/div/div[2]/div[1]/input[1]'
    s_1 = '//*[@id="txt_SearchText"]'
'//*[@id="txt_SearchText"]' if count_number == 0: element = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div/div[1]/span') # 鼠标悬浮 ActionChains(driver).move_to_element(element).perform() t.sleep(2) driver.find_element_by_link_text(u'基金').click() # 选中为基金检索模式 driver.find_element_by_xpath(s_1).send_keys(str(number)) # 键入基金号 driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/input[2]').click() # 进行搜索 else: driver.find_element_by_xpath(s_2).clear() # 清除内容 driver.find_element_by_xpath(s_2).send_keys(str(number)) # 键入基金号 driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div[1]/input[2]').click() # 进行搜索 t.sleep(2) try: driver.find_element_by_css_selector('#DivDisplayMode > li:nth-child(1)').click() # 选中为详情,如果有问题,需要设置为断点 t.sleep(5) html_now = driver.page_source # 页面源码 print('ok!') except: html_now = '下一个' finally: return html_now def pull(html): """提取一页的论文条目、关键词和当前页面数""" soup = BeautifulSoup(html, 'html.parser') # 解析器:html.parser try: page = soup.select('.countPageMark') # 页面计数 count = page[0].text except: count = 1 title = soup.select('.middle>h6>a') titles = [] # 纯标题 clear(title, titles) journal = soup.select('.middle p.baseinfo span a ') # 期刊名 date = soup.select('.middle p.baseinfo span.date') # 发表时间 journals_o = [] # 取出字符 journals = [] # 最终结果 clear_jou(journal, journals_o) for i in journals_o: if i.isdigit(): # 如果该项为数字 pass else: journals.append(i) dates = [] clear(date, dates) abstract = soup.select('.abstract') # 摘要 abstracts = [] clear_ab(abstract, abstracts) keyword = soup.select('.keywords>a') # 关键词 keywords = [] clear(keyword, keywords) page = [] # 除了关键词的所有信息 for i in range(len(titles)): page.append(titles[i:i + 1] + journals[i:i + 1] + dates[i:i + 1] + abstracts[i:i + 1]) return page, keywords, count def one_n_save(fund, count_number): """保存一个基金号的相关数据""" alls = [] # 一个基金的所有页面 keywords = [] # 一个基金的所有关键词 all, key_words, count = pull(get_html(str(fund), count_number)) # 第一页的数据 count = str(count) count = count.replace('1/', '') alls.append(all) # 存储第一页的数据 keywords.append(key_words) # 存储第一页的关键词 t.sleep(5) # 一个基金的大部分数据,关键词,页数 while True: if 1 < int(count) < 3: # 只有两页 t.sleep(5) try: driver.find_element_by_xpath('//*[@id="Page_next_top"]').click() # 点击翻到第二页 except: driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click() # 点击翻到第二页 t.sleep(5) html_a = driver.page_source # 当前页面源码 all, key_words, count_1 = pull(html_a) alls.append(all) # 存储当页的数据 keywords.append(key_words) break elif int(count) >= 3: # 大于两页 t.sleep(5) try: driver.find_element_by_xpath('//*[@id="Page_next_top"]').click() # 点击翻到第二页 except: driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click() # 点击翻到第二页 t.sleep(5) html_a = driver.page_source # 当前页面源码 all, key_words, count_2 = pull(html_a) alls.append(all) # 存储当页的数据 keywords.append(key_words) for i in range(int(count) - 2): # 翻几次页 t.sleep(5) try: driver.find_element_by_xpath('//*[@id="Page_next_top"]').click() # 点击翻到第二页 except: driver.find_element_by_xpath('/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[4]').click() # 点击翻页 t.sleep(5) html_a = driver.page_source # 当前页面源码 all, key_words, count_go = pull(html_a) alls.append(all) # 存储当页的数据 keywords.append(key_words) break else: break save_afile(alls, keywords, str(fund)) print("成功!") # inpath = '列表.xlsx'#excel文件所在路径 # ns=extract(inpath)#基金号列表 count_number = 0 # 只能存储有论文的 # i = '14BTQ073' # 单个基金号的论文元数据爬取,多个遍历即可 # for i in ns: one_n_save(i, count_number) # 保存这一基金号的 
print(str(i) + '基金号的所有论文基本信息保存完毕!')  # report success for this fund number
# count_number = count_number + 1
driver.quit()  # close the browser
print('Over!')  # all done

# This program can only handle funds that actually have papers.
# The citation count came out wrong: clear_c had a bug.
# The download count sometimes landed in the citation-count column: the citation/download extraction is unreliable.
# Rows with neither a download count nor a citation count were still written to Excel: same root cause.
# Decided to drop citation-count and download-count crawling here.
# Citation counts and download counts are handled in a separate program.
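# Batch-mode sketch, not part of the run above. It only ties together the pieces
# already in this script (extract(), one_n_save(), the commented-out '列表.xlsx'
# list) and assumes that file sits next to the script with fund numbers in its
# fifth column. Uncomment it, and remove the single-fund call above, to crawl
# every fund number in the Excel list.
# ns = extract('列表.xlsx')                       # all fund numbers from the Excel list
# count_number = 0
# for fund in ns:
#     one_n_save(fund, count_number)              # save this fund's papers
#     print(str(fund) + '基金号的所有论文基本信息保存完毕!')
#     count_number = count_number + 1             # later searches reuse the result page's search box
# driver.quit()
# print('Over!')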