# keyan/论文信息爬取(题目、期刊、日期、摘要、关键词)_1.py

# coding='utf-8'
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.action_chains import ActionChains
import time as t
import re
from bs4 import BeautifulSoup
import xlrd
import xlwt
import os
import undetected_chromedriver as uc

# Open the browser and go to CNKI first
driver = uc.Chrome()
# driver.minimize_window()  # minimize the browser window and only show the console
driver.get('https://www.cnki.net/')
keywords = ["对抗攻击"]
# Select the <li> that holds the "关键词" (keyword) tab
# //a[text()='关键词']/..
# a = driver.find_element(by="xpath", value="//a[text()='关键词']/..")
# driver.execute_script("arguments[0].className = 'cur';", a)
# Locate the search input box
input_button = driver.find_element(by="id", value="txt_SearchText")
input_button.send_keys("对抗攻击")
search_bn = driver.find_element(by="xpath", value='//input[@class="search-btn"]')
search_bn.click()
result_area = driver.find_element(by="id", value="ModuleSearchResult")
current_page_results = result_area.find_elements(by="xpath", value='//*[@id="ModuleSearchResult"]//tbody/tr')
names = [r.find_element(by="xpath", value='.//td[@class="name"]') for r in current_page_results]
links = [r.find_element(by="xpath", value='.//td[@class="name"]/a').get_attribute("href") for r in current_page_results]
driver.get(links[0])  # open the detail page of the first paper
# Next page: //a[contains(text(), "下一页")]
next_page_btn = driver.find_element(by="xpath", value='//a[contains(text(), "下一页")]')
next_page_btn.click()
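# A hedged sketch (not part of the original flow): one way to walk every result
# page of the keyword search above and collect the title links, clicking the
# "下一页" link until it no longer appears. That the link disappears on the last
# page is an assumption about CNKI's markup, not something this script verifies.
def collect_all_links():
    collected = []
    while True:
        rows = driver.find_elements(by="xpath", value='//*[@id="ModuleSearchResult"]//tbody/tr')
        collected += [r.find_element(by="xpath", value='.//td[@class="name"]/a').get_attribute("href")
                      for r in rows]
        next_links = driver.find_elements(by="xpath", value='//a[contains(text(), "下一页")]')
        if not next_links:
            break
        next_links[0].click()
        t.sleep(3)  # crude fixed wait, matching the style used elsewhere in this script
    return collected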
def cut(seq, n):
    """Split a list into consecutive sub-lists of at most n items."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]
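# Usage sketch (illustrative only): list(cut([1, 2, 3, 4, 5], 2))
# returns [[1, 2], [3, 4], [5]].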
def clear(old_list, new_list):
    """Extract plain text from a list of soup tags."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list
def clear_jou(old_list, new_list):
    """Extract the plain text of journal names from a list of soup tags."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list
def clear_ab(old_list, new_list):
    """Extract plain abstract text from a list of soup tags."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', '')
        n = n.replace('摘要:', '')
        n = n.replace(' ', '')
        new_list.append(n)
    return new_list
def clear_c(old_list, new_list):
    """Extract plain citation-count text (see the notes at the end of this file)."""
    for i in old_list:
        n = str(i)
        n = n.replace('\n', '')
        new_list.append(n)  # was append(i), which stored the raw tag instead of the cleaned text
    return new_list
def clear_d(old_list, new_list):
    """Extract download counts as integers from a list of soup tags."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        n = int(n)
        new_list.append(n)
    return new_list
def extract(inpath):
    """Read the fund numbers out of the Excel list."""
    data = xlrd.open_workbook(inpath, encoding_override='utf-8')
    table = data.sheets()[0]  # first sheet
    nrows = table.nrows  # number of rows
    ncols = table.ncols  # number of columns
    numbers = []
    for i in range(1, nrows):  # row 0 is the header
        alldata = table.row_values(i)  # all cells of this row
        result = alldata[4]  # column index 4 holds the fund number
        numbers.append(result)
    return numbers
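# Usage sketch, mirroring the commented-out main flow at the bottom of this
# file: ns = extract('列表.xlsx') yields the fund numbers, one per data row of
# the first sheet. (Note that xlrd >= 2.0 no longer opens .xlsx files, so an
# older xlrd or an .xls export is assumed.)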
def save_afile(alls, keywords, file):
    """Save one fund's paper data to an Excel file and its keywords to a txt file."""
    os.chdir(r'F:\图情社科基金项目数据爬取\论文信息')  # switch to the output folder
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, '题目')
    sheet1.write(0, 1, '发表期刊')
    sheet1.write(0, 2, '出版时间')
    sheet1.write(0, 3, '摘要')
    i = 1
    for all in alls:  # every result page
        for data in all:  # every row on the page
            for j in range(len(data)):  # every cell of the row
                sheet1.write(i, j, data[j])  # write the cell
            i = i + 1  # move to the next row
    f.save(file + '.xls')
    # Save the keywords as a txt file
    txt = open(file + '.txt', 'w', encoding='utf-8')
    for key in keywords:
        txt.write(str(key))
        txt.write('\n')
    txt.close()
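# Illustrative note: for the fund 14BTQ073 used below, this produces
# 14BTQ073.xls (columns 题目 / 发表期刊 / 出版时间 / 摘要, one row per article)
# and 14BTQ073.txt with the collected keywords, both inside the folder
# set by os.chdir above.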
def get_html(number, count_number):
    """Drive the browser search for one fund number and return the page source.

    number is the fund number; count_number is the loop counter
    (0 means this is the first search, so fund-search mode must be selected first).
    """
    s_2 = '/html/body/div[4]/div/div[2]/div[1]/input[1]'
    s_1 = '//*[@id="txt_SearchText"]'
    if count_number == 0:
        element = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/div/div[1]/span')  # hover target
        ActionChains(driver).move_to_element(element).perform()
        t.sleep(2)
        driver.find_element(By.LINK_TEXT, u'基金').click()  # switch to fund search mode
        driver.find_element(By.XPATH, s_1).send_keys(str(number))  # type the fund number
        driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/input[2]').click()  # run the search
    else:
        driver.find_element(By.XPATH, s_2).clear()  # clear the previous fund number
        driver.find_element(By.XPATH, s_2).send_keys(str(number))  # type the fund number
        driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div[1]/input[2]').click()  # run the search
    t.sleep(2)
    try:
        driver.find_element(By.CSS_SELECTOR, '#DivDisplayMode > li:nth-child(1)').click()  # switch to detail view; set a breakpoint here if it fails
        t.sleep(5)
        html_now = driver.page_source  # page source
        print('ok!')
    except:
        html_now = '下一个'
    finally:
        return html_now
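# A hedged helper sketch (not in the original script): WebDriverWait is already
# imported above, so an explicit wait could stand in for some of the fixed
# t.sleep() calls. The locator passed by the caller and the 15-second timeout
# are assumptions.
from selenium.webdriver.support import expected_conditions as EC


def wait_for(locator, timeout=15):
    """Block until the element described by locator, e.g. (By.ID, 'ModuleSearchResult'), is present."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator))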
def pull(html):
    """Extract one page's paper entries, keywords and the page counter."""
    soup = BeautifulSoup(html, 'html.parser')  # html.parser backend
    try:
        page = soup.select('.countPageMark')  # page counter
        count = page[0].text
    except:
        count = 1
    title = soup.select('.middle>h6>a')
    titles = []  # cleaned titles
    clear(title, titles)
    journal = soup.select('.middle p.baseinfo span a ')  # journal names
    date = soup.select('.middle p.baseinfo span.date')  # publication dates
    journals_o = []  # raw strings
    journals = []  # final result
    clear_jou(journal, journals_o)
    for i in journals_o:
        if i.isdigit():  # drop purely numeric entries
            pass
        else:
            journals.append(i)
    dates = []
    clear(date, dates)
    abstract = soup.select('.abstract')  # abstracts
    abstracts = []
    clear_ab(abstract, abstracts)
    keyword = soup.select('.keywords>a')  # keywords
    keywords = []
    clear(keyword, keywords)
    page = []  # everything except the keywords
    for i in range(len(titles)):
        page.append(titles[i:i + 1] + journals[i:i + 1] + dates[i:i + 1] + abstracts[i:i + 1])
    return page, keywords, count
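# Illustrative note on the return value: page is a list of
# [title, journal, date, abstract] rows (one per article on the page),
# keywords is the flat list of keyword texts, and count is either the text of
# .countPageMark (e.g. "1/5") or the integer 1 when that element is absent.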
def one_n_save(fund, count_number):
    """Scrape and save all paper data belonging to one fund number."""
    alls = []  # all pages of this fund
    keywords = []  # all keywords of this fund
    all, key_words, count = pull(get_html(str(fund), count_number))  # first page
    count = str(count)
    count = count.replace('1/', '')
    alls.append(all)  # store the first page's data
    keywords.append(key_words)  # store the first page's keywords
    t.sleep(5)
    # Remaining pages of this fund: data, keywords and page count
    while True:
        if 1 < int(count) < 3:  # exactly two pages
            t.sleep(5)
            try:
                driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # go to page 2
            except:
                driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click()  # go to page 2
            t.sleep(5)
            html_a = driver.page_source  # current page source
            all, key_words, count_1 = pull(html_a)
            alls.append(all)  # store this page's data
            keywords.append(key_words)
            break
        elif int(count) >= 3:  # more than two pages
            t.sleep(5)
            try:
                driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # go to page 2
            except:
                driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click()  # go to page 2
            t.sleep(5)
            html_a = driver.page_source  # current page source
            all, key_words, count_2 = pull(html_a)
            alls.append(all)  # store this page's data
            keywords.append(key_words)
            for i in range(int(count) - 2):  # remaining page turns
                t.sleep(5)
                try:
                    driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # next page
                except:
                    driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[4]').click()  # next page
                t.sleep(5)
                html_a = driver.page_source  # current page source
                all, key_words, count_go = pull(html_a)
                alls.append(all)  # store this page's data
                keywords.append(key_words)
            break
        else:
            break
    save_afile(alls, keywords, str(fund))
    print("成功!")
# inpath = '列表.xlsx'  # path of the Excel file
# ns = extract(inpath)  # list of fund numbers
count_number = 0
# Only funds that actually have papers can be stored
i = '14BTQ073'  # scrape the paper metadata of a single fund number; loop over ns for several
# for i in ns:
one_n_save(i, count_number)  # save this fund number's data
print(str(i) + '基金号的所有论文基本信息保存完毕!')  # report success
# count_number = count_number + 1
driver.quit()  # close the browser
print('Over')  # all done
# This program can only handle funds that do have papers.
# There were cases of wrong citation counts -- clear_c was at fault.
# There were cases where the download count ended up in the citation-count column -- the citation/download extraction was at fault.
# There were cases where neither value existed but something was still written to Excel -- same cause as above.
# Decided to drop the scraping of citation and download counts here
# and move them into a separate program.