# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.action_chains import ActionChains
import time as t
import re
from bs4 import BeautifulSoup
import xlrd
import xlwt
import os

import undetected_chromedriver as uc

# First open CNKI in the browser
driver = uc.Chrome()
# driver.minimize_window()  # minimize the browser window so only the console shows
driver.get('https://www.cnki.net/')
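
# Note: undetected_chromedriver is used here in place of plain webdriver.Chrome
# so that CNKI's anti-bot checks are less likely to block the automated session.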
keywords = ["对抗攻击"]
|
||
|
||
# 选到“关键词所在的”li
|
||
# //a[text()='关键词']/..
|
||
# a = driver.find_element(by="xpath", value="//a[text()='关键词']/..")
|
||
# driver.execute_script("arguments[0].className = 'cur';", a)
|
||
|
||
# 找到input
|
||
input_button = driver.find_element(by="id", value="txt_SearchText")
|
||
input_button.send_keys("对抗攻击")
|
||
search_bn = driver.find_element(by="xpath", value='//input[@class="search-btn"]')
|
||
search_bn.click()
|
||
|
||
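
# Alternative sketch for the commented-out block above (an assumption, not
# verified against CNKI: the tab <a> may respond to a plain click):
# driver.find_element(by="xpath", value="//a[text()='关键词']").click()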

result_area = driver.find_element(by="id", value="ModuleSearchResult")
current_page_results = result_area.find_elements(by="xpath", value='//*[@id="ModuleSearchResult"]//tbody/tr')

# ".//" keeps each lookup relative to its own row instead of the whole document
names = [r.find_element(by="xpath", value='.//td[@class="name"]') for r in current_page_results]
links = [r.find_element(by="xpath", value='.//td[@class="name"]/a').get_attribute("href") for r in current_page_results]
driver.get(links[0])  # open the first paper's detail page
driver.back()  # return to the result list so the pager is back on screen

# Next page: //a[contains(text(), "下一页")]
next_page_btn = driver.find_element(by="xpath", value='//a[contains(text(), "下一页")]')
next_page_btn.click()
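
# Sketch for visiting every collected result (a hypothetical loop, not part of
# the original flow): open each href in turn, then return to the result list.
# for link in links:
#     driver.get(link)
#     t.sleep(2)
#     driver.back()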


def cut(lst, n):
    """Split a list into sub-lists of at most n items each."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
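
# Illustrative usage of cut() (the script defines it but never calls it here):
# >>> list(cut([1, 2, 3, 4, 5], 2))
# [[1, 2], [3, 4], [5]]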


def clear(old_list, new_list):
    """Clean a list of bs4 tags down to plain text."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        new_list.append(n)
    return new_list


def clear_jou(old_list, new_list):
    """Clean journal tags down to plain text (identical logic to clear)."""
    return clear(old_list, new_list)


def clear_ab(old_list, new_list):
    """Clean abstract tags down to plain text, dropping the "摘要:" label and spaces."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', '')
        n = n.replace('摘要:', '')
        n = n.replace(' ', '')
        new_list.append(n)
    return new_list


def clear_c(old_list, new_list):
    """Clean citation-count tags down to plain text."""
    for i in old_list:
        n = str(i)
        n = n.replace('\n', '')
        new_list.append(n)  # append the cleaned string n, not the raw tag
    return new_list


def clear_d(old_list, new_list):
    """Clean download-count tags down to integers."""
    for i in old_list:
        n = i.text.strip()
        n = n.replace('\n', ' ')
        n = int(n)
        new_list.append(n)
    return new_list


def extract(inpath):
    """Read the fund numbers out of an Excel workbook."""
    data = xlrd.open_workbook(inpath, encoding_override='utf-8')
    table = data.sheets()[0]  # first sheet
    nrows = table.nrows  # number of rows
    numbers = []
    for i in range(1, nrows):  # row 0 is the header
        alldata = table.row_values(i)  # all cell values in this row
        result = alldata[4]  # fund numbers live in the fifth column (index 4)
        numbers.append(result)
    return numbers
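
# Illustrative call, mirroring the commented-out driver code at the bottom of
# the file. Note that xlrd 2.x reads only .xls, so an .xlsx input needs xlrd < 2.0:
# ns = extract('列表.xlsx')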


def save_afile(alls, keywords, file):
    """Save one fund's paper data to an .xls file and its keywords to a .txt file."""
    os.chdir(r'F:\图情社科基金项目数据爬取\论文信息')  # switch into the output folder
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, '题目')  # title
    sheet1.write(0, 1, '发表期刊')  # journal
    sheet1.write(0, 2, '出版时间')  # publication date
    sheet1.write(0, 3, '摘要')  # abstract
    i = 1
    for page in alls:  # every result page
        for data in page:  # every row on that page
            for j in range(len(data)):  # every cell in the row
                sheet1.write(i, j, data[j])  # write the cell
            i = i + 1  # move down one spreadsheet row
    f.save(file + '.xls')
    # Save the keywords as a txt file
    with open(file + '.txt', 'w') as txt:
        for key in keywords:
            txt.write(str(key))
            txt.write('\n')
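
# Illustrative call: alls is a list of pages, each page a list of rows, and
# each row [title, journal, date, abstract] as assembled by pull() below
# (all values here are placeholders):
# save_afile([[['标题', '期刊', '2021-01-01', '摘要']]], [['关键词']], '14BTQ073')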


def get_html(number, count_number):
    """Drive the browser and return the current page source.

    The first argument is the fund number, the second a counter: 0 on the
    first search, non-zero on every later one.
    """
    s_2 = '/html/body/div[4]/div/div[2]/div[1]/input[1]'
    s_1 = '//*[@id="txt_SearchText"]'
    if count_number == 0:
        element = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/div/div[1]/span')  # hover over the search-mode menu
        ActionChains(driver).move_to_element(element).perform()
        t.sleep(2)
        driver.find_element(By.LINK_TEXT, u'基金').click()  # switch to fund-search mode
        driver.find_element(By.XPATH, s_1).send_keys(str(number))  # type the fund number
        driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div/div[1]/input[2]').click()  # run the search
    else:
        driver.find_element(By.XPATH, s_2).clear()  # clear the previous query
        driver.find_element(By.XPATH, s_2).send_keys(str(number))  # type the fund number
        driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div[1]/input[2]').click()  # run the search
    t.sleep(2)
    try:
        driver.find_element(By.CSS_SELECTOR, '#DivDisplayMode > li:nth-child(1)').click()  # switch to detail view; set a breakpoint here if it misbehaves
        t.sleep(5)
        html_now = driver.page_source  # page source
        print('ok!')
    except Exception:
        html_now = '下一个'  # sentinel meaning "move on to the next fund"
    return html_now
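
# Possible refinement (a sketch only, not wired in): WebDriverWait is already
# imported, so the fixed t.sleep() pauses could become explicit waits on the
# result list. expected_conditions is selenium's standard helper (extra import):
# from selenium.webdriver.support import expected_conditions as EC
# WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located((By.ID, "ModuleSearchResult")))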


def pull(html):
    """Extract one page's paper rows, keywords, and the page counter."""
    soup = BeautifulSoup(html, 'html.parser')  # parser: html.parser
    try:
        page = soup.select('.countPageMark')  # page counter, e.g. "1/3"
        count = page[0].text
    except IndexError:
        count = 1

    title = soup.select('.middle>h6>a')
    titles = []  # plain titles
    clear(title, titles)

    journal = soup.select('.middle p.baseinfo span a')  # journal names
    date = soup.select('.middle p.baseinfo span.date')  # publication dates

    journals_o = []  # raw strings
    journals = []  # final result
    clear_jou(journal, journals_o)
    for i in journals_o:
        if not i.isdigit():  # drop purely numeric entries
            journals.append(i)

    dates = []
    clear(date, dates)

    abstract = soup.select('.abstract')  # abstracts
    abstracts = []
    clear_ab(abstract, abstracts)
    keyword = soup.select('.keywords>a')  # keywords
    keywords = []
    clear(keyword, keywords)
    page = []  # all the information except the keywords
    for i in range(len(titles)):
        page.append(titles[i:i + 1] + journals[i:i + 1] + dates[i:i + 1] + abstracts[i:i + 1])
    return page, keywords, count
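
# pull() returns (page, keywords, count): page is a list of rows shaped
# [title, journal, date, abstract], keywords a flat list of keyword strings,
# and count a string such as '1/3' taken from the .countPageMark element.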


def one_n_save(fund, count_number):
    """Scrape and save every page of data for one fund number."""
    alls = []  # every page of this fund
    keywords = []  # every keyword of this fund
    rows, key_words, count = pull(get_html(str(fund), count_number))  # first page
    count = str(count)
    count = count.replace('1/', '')  # "1/3" -> total page count "3"
    alls.append(rows)  # store the first page's data
    keywords.append(key_words)  # store the first page's keywords
    t.sleep(5)
    # Most of one fund's data, keywords, and page count
    while True:
        if 1 < int(count) < 3:  # exactly two pages
            t.sleep(5)
            try:
                driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # turn to page two
            except Exception:
                driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click()  # turn to page two
            t.sleep(5)
            html_a = driver.page_source  # current page source
            rows, key_words, count_1 = pull(html_a)
            alls.append(rows)  # store this page's data
            keywords.append(key_words)
            break
        elif int(count) >= 3:  # more than two pages
            t.sleep(5)
            try:
                driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # turn to page two
            except Exception:
                driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[3]').click()  # turn to page two
            t.sleep(5)
            html_a = driver.page_source  # current page source
            rows, key_words, count_2 = pull(html_a)
            alls.append(rows)  # store this page's data
            keywords.append(key_words)
            for i in range(int(count) - 2):  # remaining page turns
                t.sleep(5)
                try:
                    driver.find_element(By.XPATH, '//*[@id="Page_next_top"]').click()  # turn the page
                except Exception:
                    driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[2]/div[2]/form/div/div[1]/div[1]/span[4]').click()  # turn the page
                t.sleep(5)
                html_a = driver.page_source  # current page source
                rows, key_words, count_go = pull(html_a)
                alls.append(rows)  # store this page's data
                keywords.append(key_words)
            break
        else:
            break
    save_afile(alls, keywords, str(fund))
    print("Success!")


# inpath = '列表.xlsx'  # path of the Excel file holding the fund numbers
# ns = extract(inpath)  # list of fund numbers
count_number = 0
# Only funds that actually have papers can be stored
i = '14BTQ073'  # scrape one fund number's paper metadata; loop over ns for many
# for i in ns:
one_n_save(i, count_number)  # save everything for this fund number
print(str(i) + ': all paper metadata for this fund number saved!')  # success message
# count_number = count_number + 1
driver.quit()  # close the browser
print('Over!')  # all done

# This program can only handle funds that actually have papers automatically
# Citation counts came out wrong -- clear_c has a bug
# Download counts appeared in the citation-count field -- the citation/download extraction is faulty
# Rows with neither a download count nor a citation count were still written to Excel -- same cause as above
# Decided to drop citation-count and download-count scraping
# Citation and download counts are handled in a separate program