keyan/论文信息爬取(题目、期刊、日期、摘要、关键词)_1.py
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.action_chains import ActionChains
import time as t
import re
from bs4 import BeautifulSoup
import xlrd
import xlwt
import os
import undetected_chromedriver as uc
# Open the browser and load the CNKI home page
driver = uc.Chrome()
# driver.minimize_window()  # minimize the browser window so only the console window shows
driver.get('https://www.cnki.net/')
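# WebDriverWait is imported above but otherwise unused; a minimal explicit-wait sketch
# (assumes the CNKI home page keeps the search-box id "txt_SearchText"), which is more
# robust than a fixed sleep before touching the page:
from selenium.webdriver.support import expected_conditions as EC
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, "txt_SearchText"))
)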
keywords = ["对抗攻击"]
# Select the <li> that holds the "关键词" (keyword) tab
# //a[text()='关键词']/..
# a = driver.find_element(by="xpath", value="//a[text()='关键词']/..")
# driver.execute_script("arguments[0].className = 'cur';", a)
# Locate the search input and type the query
input_button = driver.find_element(by="id", value="txt_SearchText")
input_button.send_keys("对抗攻击")
search_bn = driver.find_element(by="xpath", value='//input[@class="search-btn"]')
search_bn.click()
result_area = driver.find_element(by="id", value="ModuleSearchResult")
current_page_results = result_area.find_elements(by="xpath", value='//*[@id="ModuleSearchResult"]//tbody/tr')
# scope each lookup to its own row with a relative ('.//') XPath
names = [r.find_element(by="xpath", value='.//td[@class="name"]') for r in current_page_results]
links = [r.find_element(by="xpath", value='.//td[@class="name"]/a').get_attribute("href") for r in current_page_results]
driver.get(links[0])  # open the first paper's detail page
driver.back()  # return to the result list before paging
# next page: //a[contains(text(), "下一页")]
next_page_btn = driver.find_element(by="xpath", value='//a[contains(text(), "下一页")]')
next_page_btn.click()
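# A minimal sketch for visiting every result link on the page rather than only the first
# (hypothetical loop; kept commented out so the script's flow is unchanged):
# for link in links:
#     driver.get(link)              # open the paper's detail page
#     t.sleep(2)                    # crude wait; an explicit wait would be more robust
#     detail_html = driver.page_source
#     driver.back()                 # back to the result list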
def cut(seq, n):
    """Split a sequence into chunks of n items."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]
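# Example: list(cut([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]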
def clear(old_list, new_list):
    """Extract plain text from a list of elements."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list
def clear_jou(old_list, new_list):
    """Extract the plain text of journal names (same logic as clear())."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list
def clear_ab(old_list, new_list):
    """Extract the plain text of abstracts."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', '')
        n = n.replace('摘要:', '')
        n = n.replace('　', '')
        new_list.append(n)
    return new_list
def clear_c(old_list, new_list):
    """Extract the plain text of citation counts."""
    for i in old_list:
        n = str(i)
        n = n.replace('\n', '')
        new_list.append(n)  # append the cleaned string
    return new_list
def clear_d(old_list, new_list):
    """Extract the plain text of download counts."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        n = int(n)
        new_list.append(n)
    return new_list
def extract(inpath):
    """Read the fund numbers out of an Excel file."""
    data = xlrd.open_workbook(inpath, encoding_override='utf-8')
    table = data.sheets()[0]  # first sheet
    nrows = table.nrows  # number of rows
    ncols = table.ncols  # number of columns
    numbers = []
    for i in range(1, nrows):  # row 0 is the header
        alldata = table.row_values(i)  # all values of the current row
        result = alldata[4]  # the fifth column holds the fund number
        numbers.append(result)
    return numbers
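# Note: xlrd 2.x only reads legacy .xls files; if the fund list is the .xlsx file
# referenced near the bottom of this script, a sketch with openpyxl (assumed installed) would be:
# from openpyxl import load_workbook
# wb = load_workbook('列表.xlsx', read_only=True)
# numbers = [row[4] for row in wb.active.iter_rows(min_row=2, values_only=True)]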
def save_afile(alls, keywords, file):
    """Save one fund's paper data into an Excel file plus a keyword txt file."""
    os.chdir(r'F:\图情社科基金项目数据爬取\论文信息')  # switch to the output folder
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, '题目')
    sheet1.write(0, 1, '发表期刊')
    sheet1.write(0, 2, '出版时间')
    sheet1.write(0, 3, '摘要')
    i = 1
    for page in alls:  # every results page
        for row in page:  # every paper record on the page
            for j in range(len(row)):  # every cell of the record
                sheet1.write(i, j, row[j])  # write the cell
            i = i + 1  # move down one row
    f.save(file + '.xls')
    # save the keywords as a txt file
    txt_file = open(file + '.txt', 'w', encoding='utf-8')
    for key in keywords:
        txt_file.write(str(key))
        txt_file.write('\n')
    txt_file.close()
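# Note: the legacy .xls format written by xlwt caps a sheet at 65536 rows and 256 columns,
# which is ample for a single fund's papers.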
def get_html(number, count_number):
    """Drive the browser and return the current page source.
    The first argument is the fund number, the second a counter
    (0 means this is the first search of the session).
    """
    s_2 = '/html/body/div[4]/div/div[2]/div[1]/input[1]'
    s_1 = '//*[@id="txt_SearchText"]'
    if count_number == 0:
        element = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/div/div[1]/span')  # hover target
        ActionChains(driver).move_to_element(element).perform()
        t.sleep(2)
        driver.find_element(By.LINK_TEXT, u'基金').click()  # switch to fund-number search mode
        driver.find_element(By.XPATH, s_1).send_keys(str(number))  # type the fund number
        driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/input[2]').click()  # run the search
    else:
        driver.find_element(By.XPATH, s_2).clear()  # clear the previous query
        driver.find_element(By.XPATH, s_2).send_keys(str(number))  # type the fund number
        driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div[1]/input[2]').click()  # run the search
    t.sleep(2)
    try:
        driver.find_element(By.CSS_SELECTOR, '#DivDisplayMode > li:nth-child(1)').click()  # switch to detail view; set a breakpoint here if this step misbehaves
        t.sleep(5)
        html_now = driver.page_source  # page source
        print('ok!')
    except Exception:
        html_now = '下一个'
    finally:
        return html_now
def pull(html):
    """Extract one page's paper records, keywords and the page counter."""
    soup = BeautifulSoup(html, 'html.parser')  # parser: html.parser
    try:
        page = soup.select('.countPageMark')  # page counter
        count = page[0].text
    except Exception:
        count = 1
    title = soup.select('.middle>h6>a')
    titles = []  # plain titles
    clear(title, titles)
    journal = soup.select('.middle p.baseinfo span a ')  # journal names
    date = soup.select('.middle p.baseinfo span.date')  # publication dates
    journals_o = []  # raw strings
    journals = []  # final result
    clear_jou(journal, journals_o)
    for i in journals_o:
        if i.isdigit():  # skip purely numeric entries
            pass
        else:
            journals.append(i)
    dates = []
    clear(date, dates)
    abstract = soup.select('.abstract')  # abstracts
    abstracts = []
    clear_ab(abstract, abstracts)
    keyword = soup.select('.keywords>a')  # keywords
    keywords = []
    clear(keyword, keywords)
    page = []  # every field except the keywords
    for i in range(len(titles)):
        page.append(titles[i:i + 1] + journals[i:i + 1] + dates[i:i + 1] + abstracts[i:i + 1])
    return page, keywords, count
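# Shape of pull()'s return values (illustrative values only):
# page     -> [['题目A', '期刊B', '2023-01-01', '摘要…'], ...]
# keywords -> ['关键词1', '关键词2', ...]
# count    -> e.g. '1/3' taken from .countPageMark, or 1 when no pager is present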
def one_n_save(fund, count_number):
    """Save all data belonging to one fund number."""
    alls = []  # every page of this fund
    keywords = []  # every keyword of this fund
    all, key_words, count = pull(get_html(str(fund), count_number))  # first page
    count = str(count)
    count = count.replace('1/', '')
    alls.append(all)  # store the first page's data
    keywords.append(key_words)  # store the first page's keywords
    t.sleep(5)
    # the rest of the fund's data, keywords and page count
    while True:
        if 1 < int(count) < 3:  # exactly two pages
            t.sleep(5)
            try:
                driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # go to page 2
            except Exception:
                driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click()  # go to page 2
            t.sleep(5)
            html_a = driver.page_source  # current page source
            all, key_words, count_1 = pull(html_a)
            alls.append(all)  # store this page's data
            keywords.append(key_words)
            break
        elif int(count) >= 3:  # more than two pages
            t.sleep(5)
            try:
                driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # go to page 2
            except Exception:
                driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click()  # go to page 2
            t.sleep(5)
            html_a = driver.page_source  # current page source
            all, key_words, count_2 = pull(html_a)
            alls.append(all)  # store this page's data
            keywords.append(key_words)
            for i in range(int(count) - 2):  # remaining pages to turn
                t.sleep(5)
                try:
                    driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # go to the next page
                except Exception:
                    driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[4]').click()  # turn the page
                t.sleep(5)
                html_a = driver.page_source  # current page source
                all, key_words, count_go = pull(html_a)
                alls.append(all)  # store this page's data
                keywords.append(key_words)
            break
        else:
            break
    save_afile(alls, keywords, str(fund))
    print("Success!")
# inpath = '列表.xlsx'  # path of the Excel file holding the fund numbers
# ns = extract(inpath)  # list of fund numbers
count_number = 0
# only funds that actually have papers can be stored
i = '14BTQ073'  # crawl the paper metadata of a single fund number; loop over a list for several
# for i in ns:
one_n_save(i, count_number)  # save this fund number's data
print('All paper metadata for fund number ' + str(i) + ' has been saved!')  # success message
# count_number = count_number + 1
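# A minimal sketch for crawling every fund number in the Excel list instead of a single one
# (kept commented out; assumes the list file and column layout that extract() expects):
# ns = extract('列表.xlsx')
# for count_number, fund in enumerate(ns):
#     one_n_save(fund, count_number)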
driver.quit()  # close the browser
print('Over')  # all done
# This program only handles funds that actually have papers.
# Citation counts sometimes came out wrong (clear_c was at fault).
# Download counts sometimes landed in the citation-count column (the extraction of both was faulty).
# Rows with neither downloads nor citations were still written to Excel (same root cause).
# Citation counts and download counts are therefore no longer crawled here;
# they are handled in a separate program.