# keyan/te_u/paper_down_load/eccv_download.py

from bs4 import BeautifulSoup
import pickle
import os
# route requests through a local proxy (adjust or remove if not needed)
os.environ['http_proxy'] = '127.0.0.1:7890'
os.environ['https_proxy'] = '127.0.0.1:7890'
from tqdm import tqdm
from slugify import slugify
import csv
import time
import urllib.parse
import random
import requests
class Downloader:
    def __init__(self, downloader=None, is_random_step=None):
        # kept for interface compatibility ('IDM', 'Thunder' or None);
        # this implementation always downloads via requests
        self.downloader = downloader
        self.is_random_step = is_random_step

    def download(self, urls=None, save_path=None, time_sleep_in_seconds=None):
        print(urls)
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        content = urlopen_with_retry(url=urls, headers=headers)
        with open(save_path, 'wb') as f:
            f.write(content)
        # pause between requests; Uniform(0.5t, 1.5t) when is_random_step
        if time_sleep_in_seconds:
            t = time_sleep_in_seconds
            if self.is_random_step:
                t = random.uniform(0.5 * t, 1.5 * t)
            time.sleep(t)
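# Minimal usage sketch for Downloader. The arguments mirror the ones passed
# from download_from_csv_i below; the URL and save path here are hypothetical
# examples, not values from the original script.
#
#     d = Downloader(downloader=None, is_random_step=True)
#     d.download(urls='https://www.ecva.net/papers.php',
#                save_path='papers_index.html',
#                time_sleep_in_seconds=5)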
def download_from_csv_i(
postfix=None, save_dir=None, csv_file_path=None, is_download_main_paper=True,
is_download_bib=True, is_download_supplement=True,
time_step_in_seconds=5, total_paper_number=None,
downloader='IDM', is_random_step=True):
"""
    download paper, bibtex and supplement files and save them to
    save_dir/main_paper and save_dir/supplement respectively
    :param postfix: str, postfix that will be appended to each paper's title
    :param save_dir: str, directory where papers and supplemental material
        are saved
    :param csv_file_path: str, full path to the csv file
    :param is_download_main_paper: bool, True to download the main paper
    :param is_download_bib: bool, True to download the bibtex file
    :param is_download_supplement: bool, True to download the supplemental
        material
    :param time_step_in_seconds: int, interval in seconds between two
        download requests
    :param total_paper_number: int, total number of papers to be downloaded
    :param downloader: str, the downloader to use, could be 'IDM' or None,
        default to 'IDM'.
    :param is_random_step: bool, whether to randomly sample the time step
        between two adjacent download requests. If True, the time step is
        sampled from Uniform(0.5t, 1.5t), where t is the given
        time_step_in_seconds. Default: True.
:return: True
"""
downloader = Downloader(
downloader=downloader, is_random_step=is_random_step)
if not os.path.exists(csv_file_path):
raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')
main_save_path = os.path.join(save_dir, 'main_paper')
if is_download_main_paper:
os.makedirs(main_save_path, exist_ok=True)
if is_download_supplement:
supplement_save_path = os.path.join(save_dir, 'supplement')
os.makedirs(supplement_save_path, exist_ok=True)
error_log = []
with open(csv_file_path, newline='') as csvfile:
myreader = csv.DictReader(csvfile, delimiter=',')
pbar = tqdm(myreader, total=total_paper_number)
i = 0
for this_paper in pbar:
is_download_bib &= ('bib' in this_paper)
is_grouped = ('group' in this_paper)
i += 1
# get title
if is_grouped:
group = slugify(this_paper['group'])
title = slugify(this_paper['title'])
if total_paper_number is not None:
pbar.set_description(
f'Downloading {postfix} paper {i} /{total_paper_number}')
else:
pbar.set_description(f'Downloading {postfix} paper {i}')
this_paper_main_path = os.path.join(
main_save_path, f'{title}_{postfix}.pdf')
if is_grouped:
this_paper_main_path = os.path.join(
main_save_path, group, f'{title}_{postfix}.pdf')
if is_download_supplement:
this_paper_supp_path_no_ext = os.path.join(
supplement_save_path, f'{title}_{postfix}_supp.')
if is_grouped:
this_paper_supp_path_no_ext = os.path.join(
supplement_save_path, group, f'{title}_{postfix}_supp.')
if '' != this_paper['supplemental link'] and os.path.exists(
this_paper_main_path) and \
(os.path.exists(
this_paper_supp_path_no_ext + 'zip') or
os.path.exists(
this_paper_supp_path_no_ext + 'pdf')):
continue
elif '' == this_paper['supplemental link'] and \
os.path.exists(this_paper_main_path):
continue
elif os.path.exists(this_paper_main_path):
continue
if 'error' == this_paper['main link']:
error_log.append((title, 'no MAIN link'))
elif '' != this_paper['main link']:
if is_grouped:
if is_download_main_paper:
os.makedirs(os.path.join(main_save_path, group),
exist_ok=True)
if is_download_supplement:
os.makedirs(os.path.join(supplement_save_path, group),
exist_ok=True)
if is_download_main_paper:
try:
# download paper with IDM
if not os.path.exists(this_paper_main_path):
downloader.download(
urls=this_paper['main link'].replace(
' ', '%20'),
save_path=os.path.join(
os.getcwd(), this_paper_main_path),
time_sleep_in_seconds=time_step_in_seconds
)
except Exception as e:
# error_flag = True
print('Error: ' + title + ' - ' + str(e))
error_log.append((title, this_paper['main link'],
'main paper download error', str(e)))
# download supp
if is_download_supplement:
# check whether the supp can be downloaded
if not (os.path.exists(
this_paper_supp_path_no_ext + 'zip') or
os.path.exists(
this_paper_supp_path_no_ext + 'pdf')):
if 'error' == this_paper['supplemental link']:
error_log.append((title, 'no SUPPLEMENTAL link'))
elif '' != this_paper['supplemental link']:
supp_type = \
this_paper['supplemental link'].split('.')[-1]
try:
downloader.download(
urls=this_paper['supplemental link'],
save_path=os.path.join(
os.getcwd(),
this_paper_supp_path_no_ext + supp_type),
time_sleep_in_seconds=time_step_in_seconds
)
except Exception as e:
# error_flag = True
print('Error: ' + title + ' - ' + str(e))
error_log.append((title, this_paper[
'supplemental link'],
'supplement download error',
str(e)))
# download bibtex file
if is_download_bib:
bib_path = this_paper_main_path[:-3] + 'bib'
if not os.path.exists(bib_path):
if 'error' == this_paper['bib']:
error_log.append((title, 'no bibtex link'))
elif '' != this_paper['bib']:
try:
downloader.download(
urls=this_paper['bib'],
save_path=os.path.join(os.getcwd(),
bib_path),
time_sleep_in_seconds=time_step_in_seconds
)
except Exception as e:
# error_flag = True
print('Error: ' + title + ' - ' + str(e))
error_log.append((title, this_paper['bib'],
'bibtex download error',
str(e)))
    # 2. write error log (the log file name is an assumption; the original
    # only printed the message below without writing the collected errors)
    print('write error log')
    with open('download_err_log.txt', 'w') as f:
        f.writelines('; '.join(map(str, log)) + '\n' for log in error_log)
    return True
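# Usage sketch for download_from_csv_i. The paths are hypothetical; the csv
# must contain at least the columns 'title', 'main link' and
# 'supplemental link' (plus optionally 'bib' and 'group').
#
#     download_from_csv_i(
#         postfix='ECCV_2022',
#         save_dir=r'D:\ECCV_2022',
#         csv_file_path=r'D:\csv\ECCV_2022.csv',
#         is_download_supplement=False,
#         time_step_in_seconds=5)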
def get_paper_name_link_from_url(url):
headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
paper_dict = dict()
content = urlopen_with_retry(url=url, headers=headers)
soup = BeautifulSoup(content, 'html5lib')
paper_list_bar = tqdm(soup.find_all(['li'], {'class': 'chapter-item content-type-list__item'}))
for paper in paper_list_bar:
try:
title = slugify(paper.find('div', {'class': 'content-type-list__title'}).text)
link = urllib.parse.urljoin(url, paper.find('div', {'class': 'content-type-list__action'}).a.get('href'))
paper_dict[title] = link
except Exception as e:
print(f'ERROR: {str(e)}')
return paper_dict
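# Sketch: get_paper_name_link_from_url returns a dict mapping slugified paper
# titles to absolute pdf links for one SpringerLink volume page. The volume
# URL below is taken from the ECCV 2018 list further down in this file.
#
#     papers = get_paper_name_link_from_url(
#         'https://link.springer.com/book/10.1007/978-3-030-01246-5')
#     print(len(papers))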
def urlopen_with_retry(url, headers=dict(), retry_time=3, time_out=20,
                       raise_error_if_failed=True):
    """
    load content from url with given headers. Retry if error occurs.
    Args:
        url (str): url.
        headers (dict): request headers. Default: {}.
        retry_time (int): max retry time. Default: 3.
        time_out (int): time out in seconds. Default: 20.
        raise_error_if_failed (bool): whether to raise error if failed.
            Default: True.
    Returns:
        content (bytes|None): url content. None will be returned if failed.
    """
    for r in range(retry_time):
        try:
            res = requests.get(url=url, headers=headers, timeout=time_out)
            res.raise_for_status()
            return res.content
        except requests.exceptions.HTTPError as e:
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.response.status_code)
        except requests.exceptions.RequestException as e:
            print('We failed to reach a server.')
            print('Reason: ', e)
        s = random.randint(3, 7)
        print(f'random sleeping {s} seconds and doing {r + 1}/{retry_time}'
              f'-th retrying...')
        time.sleep(s)
    if raise_error_if_failed:
        raise ValueError(f'Failed to open {url} after trying {retry_time} '
                         f'times!')
    else:
        return None
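# Sketch: fetch a page with the same browser-like headers used elsewhere in
# this file. Returns bytes, or None when raise_error_if_failed is False and
# every retry fails.
#
#     html = urlopen_with_retry(
#         url='https://www.ecva.net/papers.php',
#         headers={'User-Agent': 'Mozilla/5.0'},
#         retry_time=3, time_out=20, raise_error_if_failed=False)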
def save_csv(year):
"""
write ECCV papers' and supplemental material's urls in one csv file
:param year: int
:return: True
"""
project_root_folder = r"D:\py\keyan_qingbao\te_u\paper_down_load"
csv_file_pathname = os.path.join(
project_root_folder, 'csv', f'ECCV_{year}.csv')
with open(csv_file_pathname, 'w', newline='') as csvfile:
fieldnames = ['title', 'main link', 'supplemental link']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
'Gecko/20100101 Firefox/23.0'}
dat_file_pathname = os.path.join(
project_root_folder, 'urls', f'init_url_ECCV_{year}.dat')
if year >= 2018:
init_url = f'https://www.ecva.net/papers.php'
if os.path.exists(dat_file_pathname):
with open(dat_file_pathname, 'rb') as f:
content = pickle.load(f)
else:
content = urlopen_with_retry(url=init_url, headers=headers)
with open(dat_file_pathname, 'wb') as f:
pickle.dump(content, f)
soup = BeautifulSoup(content, 'html5lib')
paper_list_bar = tqdm(soup.find_all(['dt', 'dd']))
paper_index = 0
paper_dict = {'title': '',
'main link': '',
'supplemental link': ''}
for paper in paper_list_bar:
is_new_paper = False
# get title
try:
if 'dt' == paper.name and \
'ptitle' == paper.get('class')[0] and \
year == int(paper.a.get('href').split('_')[1][:4]): # title:
# this_year = int(paper.a.get('href').split('_')[1][:4])
title = slugify(paper.text.strip())
paper_dict['title'] = title
paper_index += 1
paper_list_bar.set_description_str(
f'Downloading paper {paper_index}: {title}')
elif '' != paper_dict['title'] and 'dd' == paper.name:
all_as = paper.find_all('a')
for a in all_as:
if 'pdf' == slugify(a.text.strip()):
main_link = urllib.parse.urljoin(init_url,
a.get('href'))
paper_dict['main link'] = main_link
is_new_paper = True
elif 'supp' == slugify(a.text.strip())[:4]:
supp_link = urllib.parse.urljoin(init_url,
a.get('href'))
paper_dict['supplemental link'] = supp_link
break
except:
pass
if is_new_paper:
writer.writerow(paper_dict)
paper_dict = {'title': '',
'main link': '',
'supplemental link': ''}
else:
init_url = f'http://www.eccv{year}.org/main-conference/'
if os.path.exists(dat_file_pathname):
with open(dat_file_pathname, 'rb') as f:
content = pickle.load(f)
else:
content = urlopen_with_retry(url=init_url, headers=headers)
with open(dat_file_pathname, 'wb') as f:
pickle.dump(content, f)
soup = BeautifulSoup(content, 'html5lib')
paper_list_bar = tqdm(
soup.find('div', {'class': 'entry-content'}).find_all(['p']))
paper_index = 0
paper_dict = {'title': '',
'main link': '',
'supplemental link': ''}
for paper in paper_list_bar:
try:
if len(paper.find_all(['strong'])) and len(
paper.find_all(['a'])) and len(paper.find_all(['img'])):
paper_index += 1
title = slugify(paper.find('strong').text)
paper_dict['title'] = title
paper_list_bar.set_description_str(
f'Downloading paper {paper_index}: {title}')
main_link = paper.find('a').get('href')
paper_dict['main link'] = main_link
writer.writerow(paper_dict)
paper_dict = {'title': '',
'main link': '',
'supplemental link': ''}
except Exception as e:
print(f'ERROR: {str(e)}')
return paper_index
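# Sketch: build the csv index first, then feed the paper count into the
# downloader, mirroring the __main__ block at the bottom of this file.
#
#     n = save_csv(2022)   # writes csv/ECCV_2022.csv under the project root
#     print(f'{n} papers listed for ECCV 2022')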
def download_from_csv(
year, save_dir, is_download_supplement=True, time_step_in_seconds=5,
total_paper_number=None,
is_workshops=False, downloader='IDM'):
"""
    download all ECCV papers and supplement files of the given year, saving
    them in save_dir/main_paper and save_dir/supplement respectively
    :param year: int, ECCV year, such as 2019
    :param save_dir: str, directory where papers and supplemental material
        are saved
    :param is_download_supplement: bool, True to download the supplemental
        material
    :param time_step_in_seconds: int, interval in seconds between two
        download requests
    :param total_paper_number: int, total number of papers to be downloaded
    :param is_workshops: bool, whether to download workshop papers from the
        csv file.
    :param downloader: str, the downloader to use, could be 'IDM' or
        'Thunder', default to 'IDM'
"""
postfix = f'ECCV_{year}'
if is_workshops:
postfix = f'ECCV_WS_{year}'
csv_file_name = f'ECCV_{year}.csv' if not is_workshops else \
f'ECCV_WS_{year}.csv'
project_root_folder = r"D:\py\keyan_qingbao\te_u\paper_down_load"
csv_file_name = os.path.join(project_root_folder, 'csv', csv_file_name)
download_from_csv_i(
postfix=postfix,
save_dir=save_dir,
csv_file_path=csv_file_name,
is_download_supplement=is_download_supplement,
time_step_in_seconds=time_step_in_seconds,
total_paper_number=total_paper_number,
downloader=downloader
)
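# Sketch: the same entry point also covers workshop papers; setting
# is_workshops=True switches to the ECCV_WS_{year} csv and postfix
# (save_dir here is a hypothetical example).
#
#     download_from_csv(2022, save_dir=r'D:\ECCV_WS_2022',
#                       is_download_supplement=False,
#                       time_step_in_seconds=5, is_workshops=True)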
def download_from_springer(
year, save_dir, is_workshops=False, time_sleep_in_seconds=5,
downloader='IDM'):
os.makedirs(save_dir, exist_ok=True)
if 2018 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-030-01246-5',
'https://link.springer.com/book/10.1007/978-3-030-01216-8',
'https://link.springer.com/book/10.1007/978-3-030-01219-9',
'https://link.springer.com/book/10.1007/978-3-030-01225-0',
'https://link.springer.com/book/10.1007/978-3-030-01228-1',
'https://link.springer.com/book/10.1007/978-3-030-01231-1',
'https://link.springer.com/book/10.1007/978-3-030-01234-2',
'https://link.springer.com/book/10.1007/978-3-030-01237-3',
'https://link.springer.com/book/10.1007/978-3-030-01240-3',
'https://link.springer.com/book/10.1007/978-3-030-01249-6',
'https://link.springer.com/book/10.1007/978-3-030-01252-6',
'https://link.springer.com/book/10.1007/978-3-030-01258-8',
'https://link.springer.com/book/10.1007/978-3-030-01261-8',
'https://link.springer.com/book/10.1007/978-3-030-01264-9',
'https://link.springer.com/book/10.1007/978-3-030-01267-0',
'https://link.springer.com/book/10.1007/978-3-030-01270-0'
]
else:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-030-11009-3',
'https://link.springer.com/book/10.1007/978-3-030-11012-3',
'https://link.springer.com/book/10.1007/978-3-030-11015-4',
'https://link.springer.com/book/10.1007/978-3-030-11018-5',
'https://link.springer.com/book/10.1007/978-3-030-11021-5',
'https://link.springer.com/book/10.1007/978-3-030-11024-6'
]
elif 2016 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007%2F978-3-319-46448-0',
'https://link.springer.com/book/10.1007%2F978-3-319-46475-6',
'https://link.springer.com/book/10.1007%2F978-3-319-46487-9',
'https://link.springer.com/book/10.1007%2F978-3-319-46493-0',
'https://link.springer.com/book/10.1007%2F978-3-319-46454-1',
'https://link.springer.com/book/10.1007%2F978-3-319-46466-4',
'https://link.springer.com/book/10.1007%2F978-3-319-46478-7',
'https://link.springer.com/book/10.1007%2F978-3-319-46484-8'
]
else:
urls_list = [
'https://link.springer.com/book/10.1007%2F978-3-319-46604-0',
'https://link.springer.com/book/10.1007%2F978-3-319-48881-3',
'https://link.springer.com/book/10.1007%2F978-3-319-49409-8'
]
elif 2014 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-319-10590-1',
'https://link.springer.com/book/10.1007/978-3-319-10605-2',
'https://link.springer.com/book/10.1007/978-3-319-10578-9',
'https://link.springer.com/book/10.1007/978-3-319-10593-2',
'https://link.springer.com/book/10.1007/978-3-319-10602-1',
'https://link.springer.com/book/10.1007/978-3-319-10599-4',
'https://link.springer.com/book/10.1007/978-3-319-10584-0'
]
else:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-319-16178-5',
'https://link.springer.com/book/10.1007/978-3-319-16181-5',
'https://link.springer.com/book/10.1007/978-3-319-16199-0',
'https://link.springer.com/book/10.1007/978-3-319-16220-1'
]
elif 2012 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-642-33718-5',
'https://link.springer.com/book/10.1007/978-3-642-33709-3',
'https://link.springer.com/book/10.1007/978-3-642-33712-3',
'https://link.springer.com/book/10.1007/978-3-642-33765-9',
'https://link.springer.com/book/10.1007/978-3-642-33715-4',
'https://link.springer.com/book/10.1007/978-3-642-33783-3',
'https://link.springer.com/book/10.1007/978-3-642-33786-4'
]
else:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-642-33863-2',
'https://link.springer.com/book/10.1007/978-3-642-33868-7',
'https://link.springer.com/book/10.1007/978-3-642-33885-4'
]
elif 2010 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-642-15549-9',
'https://link.springer.com/book/10.1007/978-3-642-15552-9',
'https://link.springer.com/book/10.1007/978-3-642-15558-1',
'https://link.springer.com/book/10.1007/978-3-642-15561-1',
'https://link.springer.com/book/10.1007/978-3-642-15555-0',
'https://link.springer.com/book/10.1007/978-3-642-15567-3'
]
else:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-642-35749-7',
'https://link.springer.com/book/10.1007/978-3-642-35740-4'
]
elif 2008 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/978-3-540-88682-2',
'https://link.springer.com/book/10.1007/978-3-540-88688-4',
'https://link.springer.com/book/10.1007/978-3-540-88690-7',
'https://link.springer.com/book/10.1007/978-3-540-88693-8'
]
else:
urls_list = []
elif 2006 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/11744023',
'https://link.springer.com/book/10.1007/11744047',
'https://link.springer.com/book/10.1007/11744078',
'https://link.springer.com/book/10.1007/11744085'
]
else:
urls_list = [
'https://link.springer.com/book/10.1007/11754336'
]
elif 2004 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/b97865',
'https://link.springer.com/book/10.1007/b97866',
'https://link.springer.com/book/10.1007/b97871',
'https://link.springer.com/book/10.1007/b97873'
]
else:
urls_list = [
]
elif 2002 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/3-540-47969-4',
'https://link.springer.com/book/10.1007/3-540-47967-8',
'https://link.springer.com/book/10.1007/3-540-47977-5',
'https://link.springer.com/book/10.1007/3-540-47979-1'
]
else:
urls_list = [
]
elif 2000 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/3-540-45054-8',
'https://link.springer.com/book/10.1007/3-540-45053-X'
]
else:
urls_list = [
]
elif 1998 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/BFb0055655',
'https://link.springer.com/book/10.1007/BFb0054729'
]
else:
urls_list = [
]
elif 1996 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/BFb0015518',
'https://link.springer.com/book/10.1007/3-540-61123-1'
]
else:
urls_list = [
]
elif 1994 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/3-540-57956-7',
'https://link.springer.com/book/10.1007/BFb0028329'
]
else:
urls_list = [
]
elif 1992 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/3-540-55426-2'
]
else:
urls_list = [
]
elif 1990 == year:
if not is_workshops:
urls_list = [
'https://link.springer.com/book/10.1007/BFb0014843'
]
else:
urls_list = [
]
else:
        raise ValueError(f'ECCV {year} is currently not available!')
for url in urls_list:
__download_from_springer(
url, save_dir, year, is_workshops=is_workshops,
time_sleep_in_seconds=time_sleep_in_seconds,
downloader=downloader)
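# Sketch: download a whole ECCV proceedings year directly from SpringerLink.
# Only the years handled above are supported; save_dir is a hypothetical
# example.
#
#     download_from_springer(year=2014, save_dir=r'D:\ECCV_2014',
#                            is_workshops=False, time_sleep_in_seconds=30)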
def __download_from_springer(
url, save_dir, year, is_workshops=False, time_sleep_in_seconds=5,
downloader='IDM'):
    downloader = Downloader(downloader)
    papers_dict = dict()  # stays empty if every retry below fails
    for i in range(3):
        try:
            papers_dict = get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
# total_paper_number = len(papers_dict)
pbar = tqdm(papers_dict.keys())
postfix = f'ECCV_{year}'
if is_workshops:
postfix = f'ECCV_WS_{year}'
for name in pbar:
pbar.set_description(f'Downloading paper {name}')
if not os.path.exists(os.path.join(save_dir, f'{name}_{postfix}.pdf')):
downloader.download(
papers_dict[name],
os.path.join(save_dir, f'{name}_{postfix}.pdf'),
time_sleep_in_seconds)
if __name__ == '__main__':
year = 2022
# total_paper_number = 1645
total_paper_number = save_csv(year)
download_from_csv(year,
save_dir=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}',
is_download_supplement=False,
time_step_in_seconds=5,
total_paper_number=total_paper_number,
is_workshops=False)
# move_main_and_supplement_2_one_directory(
# main_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\main_paper',
# supplement_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\supplement',
# supp_pdf_save_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\main_paper'
# )
# for year in range(2018, 2017, -2):
# # download_from_springer(
# # save_dir=f'F:\\ECCV_{year}',
# # year=year,
# # is_workshops=False, time_sleep_in_seconds=30)
# download_from_springer(
# save_dir=f'F:\\ECCV_WS_{year}',
# year=year,
# is_workshops=True, time_sleep_in_seconds=30)
# pass