"""Crawl ECCV paper lists (ecva.net / Springer) and download the pdf files."""
import csv
import os
import pickle
import random
import sys
import time
import urllib
from urllib.error import URLError, HTTPError

import requests
from bs4 import BeautifulSoup
from slugify import slugify
from tqdm import tqdm

os.environ['http_proxy'] = '127.0.0.1:7890'
os.environ['https_proxy'] = '127.0.0.1:7890'
class Downloader:
    """Simple HTTP downloader used by the crawling helpers.

    The constructor arguments mirror other downloader backends ('IDM', ...)
    used by this project, but this implementation always fetches through
    `urlopen_with_retry` (requests-based).
    """

    def __init__(self, downloader=None, is_random_step=None):
        """
        :param downloader: str|None, requested backend name; stored for
            interface compatibility (it was previously discarded).
        :param is_random_step: bool|None, if truthy, the post-download sleep
            is sampled from Uniform(0.5t, 1.5t) instead of exactly t seconds.
        """
        self.downloader = downloader
        self.is_random_step = is_random_step

    def download(self, urls=None, save_path=None, time_sleep_in_seconds=None):
        """Fetch `urls` and write the raw bytes to `save_path`.

        :param urls: str, url to download.
        :param save_path: str, destination file path.
        :param time_sleep_in_seconds: int|float|None, throttle interval to
            sleep after a successful download (None/0 disables sleeping).
        """
        print(urls)
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        content = urlopen_with_retry(url=urls, headers=headers)
        with open(save_path, 'wb') as f:
            f.write(content)
        # Fix: the throttle parameter used to be accepted but ignored, so
        # requests were fired back-to-back. Honor it here, with the optional
        # randomized step promised by the callers' docstrings.
        if time_sleep_in_seconds:
            t = time_sleep_in_seconds
            if self.is_random_step:
                t = random.uniform(0.5 * t, 1.5 * t)
            time.sleep(t)
def download_from_csv_i(
        postfix=None, save_dir=None, csv_file_path=None, is_download_main_paper=True,
        is_download_bib=True, is_download_supplement=True,
        time_step_in_seconds=5, total_paper_number=None,
        downloader='IDM', is_random_step=True):
    """
    download paper, bibtex and supplement files and save them to
    save_dir/main_paper and save_dir/supplement respectively
    :param postfix: str, postfix that will be added at the end of papers' title
    :param save_dir: str, paper and supplement material's save path
    :param csv_file_path: str, the full path to csv file
    :param is_download_main_paper: bool, True for downloading main paper
    :param is_download_bib: bool, True for downloading bibtex files; only
        honored when the csv actually has a 'bib' column
    :param is_download_supplement: bool, True for downloading supplemental
        material
    :param time_step_in_seconds: int, the interval time between two downloading
        request in seconds
    :param total_paper_number: int, the total number of papers that is going to
        download
    :param downloader: str, the downloader to download, could be 'IDM' or None,
        default to 'IDM'.
    :param is_random_step: bool, whether random sample the time step between two
        adjacent download requests. If True, the time step will be sampled
        from Uniform(0.5t, 1.5t), where t is the given time_step_in_seconds.
        Default: True.
    :return: True
    """
    downloader = Downloader(
        downloader=downloader, is_random_step=is_random_step)
    if not os.path.exists(csv_file_path):
        raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')

    main_save_path = os.path.join(save_dir, 'main_paper')
    if is_download_main_paper:
        os.makedirs(main_save_path, exist_ok=True)
    if is_download_supplement:
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(supplement_save_path, exist_ok=True)

    error_log = []
    with open(csv_file_path, newline='') as csvfile:
        myreader = csv.DictReader(csvfile, delimiter=',')
        pbar = tqdm(myreader, total=total_paper_number)
        i = 0
        for this_paper in pbar:
            # bibtex can only be fetched when the csv provides a 'bib' column
            is_download_bib &= ('bib' in this_paper)
            # an optional 'group' column places files in per-group sub-dirs
            is_grouped = ('group' in this_paper)
            i += 1
            # get title
            if is_grouped:
                group = slugify(this_paper['group'])
            title = slugify(this_paper['title'])
            if total_paper_number is not None:
                pbar.set_description(
                    f'Downloading {postfix} paper {i} /{total_paper_number}')
            else:
                pbar.set_description(f'Downloading {postfix} paper {i}')
            this_paper_main_path = os.path.join(
                main_save_path, f'{title}_{postfix}.pdf')
            if is_grouped:
                this_paper_main_path = os.path.join(
                    main_save_path, group, f'{title}_{postfix}.pdf')
            if is_download_supplement:
                # supplement path is built without extension; 'zip'/'pdf' is
                # appended when checking/saving
                this_paper_supp_path_no_ext = os.path.join(
                    supplement_save_path, f'{title}_{postfix}_supp.')
                if is_grouped:
                    this_paper_supp_path_no_ext = os.path.join(
                        supplement_save_path, group, f'{title}_{postfix}_supp.')
                # skip papers whose main pdf (and supplement, if one exists
                # upstream) are already on disk
                if '' != this_paper['supplemental link'] and os.path.exists(
                        this_paper_main_path) and \
                        (os.path.exists(
                            this_paper_supp_path_no_ext + 'zip') or
                         os.path.exists(
                             this_paper_supp_path_no_ext + 'pdf')):
                    continue
                elif '' == this_paper['supplemental link'] and \
                        os.path.exists(this_paper_main_path):
                    continue
            elif os.path.exists(this_paper_main_path):
                # not downloading supplements: main pdf on disk is enough
                continue
            if 'error' == this_paper['main link']:
                error_log.append((title, 'no MAIN link'))
            elif '' != this_paper['main link']:
                if is_grouped:
                    if is_download_main_paper:
                        os.makedirs(os.path.join(main_save_path, group),
                                    exist_ok=True)
                    if is_download_supplement:
                        os.makedirs(os.path.join(supplement_save_path, group),
                                    exist_ok=True)
                if is_download_main_paper:
                    try:
                        # download paper with IDM
                        if not os.path.exists(this_paper_main_path):
                            downloader.download(
                                urls=this_paper['main link'].replace(
                                    ' ', '%20'),
                                save_path=os.path.join(
                                    os.getcwd(), this_paper_main_path),
                                time_sleep_in_seconds=time_step_in_seconds
                            )
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append((title, this_paper['main link'],
                                          'main paper download error', str(e)))
                # download supp
                if is_download_supplement:
                    # check whether the supp can be downloaded
                    if not (os.path.exists(
                            this_paper_supp_path_no_ext + 'zip') or
                            os.path.exists(
                                this_paper_supp_path_no_ext + 'pdf')):
                        if 'error' == this_paper['supplemental link']:
                            error_log.append((title, 'no SUPPLEMENTAL link'))
                        elif '' != this_paper['supplemental link']:
                            # keep the supplement's original extension
                            supp_type = \
                                this_paper['supplemental link'].split('.')[-1]
                            try:
                                downloader.download(
                                    urls=this_paper['supplemental link'],
                                    save_path=os.path.join(
                                        os.getcwd(),
                                        this_paper_supp_path_no_ext + supp_type),
                                    time_sleep_in_seconds=time_step_in_seconds
                                )
                            except Exception as e:
                                # error_flag = True
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append((title, this_paper[
                                    'supplemental link'],
                                    'supplement download error',
                                    str(e)))
                # download bibtex file
                if is_download_bib:
                    bib_path = this_paper_main_path[:-3] + 'bib'
                    if not os.path.exists(bib_path):
                        if 'error' == this_paper['bib']:
                            error_log.append((title, 'no bibtex link'))
                        elif '' != this_paper['bib']:
                            try:
                                downloader.download(
                                    urls=this_paper['bib'],
                                    save_path=os.path.join(os.getcwd(),
                                                           bib_path),
                                    time_sleep_in_seconds=time_step_in_seconds
                                )
                            except Exception as e:
                                # error_flag = True
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append((title, this_paper['bib'],
                                                  'bibtex download error',
                                                  str(e)))

    # 2. write error log
    # NOTE(review): error_log is collected but only announced here, never
    # persisted to disk — confirm whether writing it out was intended.
    print('write error log')
    return True
def get_paper_name_link_from_url(url):
    """Scrape paper titles and chapter links from one Springer book page.

    :param url: str, a Springer book table-of-contents url.
    :return: dict, slugified paper title -> absolute chapter link.
    """
    request_headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    papers = dict()
    html = urlopen_with_retry(url=url, headers=request_headers)
    soup = BeautifulSoup(html, 'html5lib')
    chapter_bar = tqdm(soup.find_all(['li'], {'class': 'chapter-item content-type-list__item'}))
    for chapter in chapter_bar:
        try:
            name = slugify(chapter.find('div', {'class': 'content-type-list__title'}).text)
            href = chapter.find('div', {'class': 'content-type-list__action'}).a.get('href')
            # resolve relative chapter links against the book url
            papers[name] = urllib.parse.urljoin(url, href)
        except Exception as e:
            # best effort: a malformed list item is reported and skipped
            print(f'ERROR: {str(e)}')
    return papers
def urlopen_with_retry(url, headers=None, retry_time=3, time_out=20,
                       raise_error_if_failed=True):
    """
    load content from url with given headers. Retry if error occurs.

    Args:
        url (str): url.
        headers (dict|None): request headers. Default: None (no extra
            headers; replaces the old mutable ``dict()`` default).
        retry_time (int): max retry time. Default: 3.
        time_out (int): per-request time out in seconds. Default: 20.
        raise_error_if_failed (bool): whether to raise error if failed.
            Default: True.

    Returns:
        content (bytes|None): url content. None will be returned if failed
            and raise_error_if_failed is False.
    """
    if headers is None:
        headers = {}
    for r in range(retry_time):
        try:
            # Fix: the request used to be issued ONCE before the loop (so
            # every "retry" re-read the same response), with no timeout and
            # with urllib-only exception handlers that requests never raises.
            res = requests.get(url=url, headers=headers, timeout=time_out)
            # Treat HTTP error statuses (404, 500, ...) as failures instead
            # of silently returning the error page body.
            res.raise_for_status()
            return res.content
        except requests.exceptions.RequestException as e:
            print('The server couldn\'t fulfill the request.')
            print('Reason: ', e)
            s = random.randint(3, 7)
            print(f'random sleeping {s} seconds and doing {r + 1}/{retry_time}'
                  f'-th retrying...')
            # actually sleep (the old code only announced the sleep)
            time.sleep(s)
    if raise_error_if_failed:
        raise ValueError(f'Failed to open {url} after trying {retry_time} '
                         f'times!')
    else:
        return None
def save_csv(year):
    """
    write ECCV papers' and supplemental material's urls in one csv file
    :param year: int
    :return: int, the number of papers found (paper_index)
    """
    project_root_folder = r"D:\py\keyan_qingbao\te_u\paper_down_load"
    csv_file_pathname = os.path.join(
        project_root_folder, 'csv', f'ECCV_{year}.csv')
    with open(csv_file_pathname, 'w', newline='') as csvfile:
        fieldnames = ['title', 'main link', 'supplemental link']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                'Gecko/20100101 Firefox/23.0'}
        # raw html is cached as a pickle; delete the .dat file to re-fetch
        dat_file_pathname = os.path.join(
            project_root_folder, 'urls', f'init_url_ECCV_{year}.dat')
        if year >= 2018:
            # 2018 onwards the papers are listed on ecva.net
            init_url = f'https://www.ecva.net/papers.php'
            if os.path.exists(dat_file_pathname):
                with open(dat_file_pathname, 'rb') as f:
                    content = pickle.load(f)
            else:
                content = urlopen_with_retry(url=init_url, headers=headers)
                with open(dat_file_pathname, 'wb') as f:
                    pickle.dump(content, f)
            soup = BeautifulSoup(content, 'html5lib')
            # the page alternates <dt> (title) and <dd> (links) elements
            paper_list_bar = tqdm(soup.find_all(['dt', 'dd']))
            paper_index = 0
            paper_dict = {'title': '',
                          'main link': '',
                          'supplemental link': ''}
            for paper in paper_list_bar:
                is_new_paper = False

                # get title
                try:
                    # the year is encoded in the paper href; filter on it
                    if 'dt' == paper.name and \
                            'ptitle' == paper.get('class')[0] and \
                            year == int(paper.a.get('href').split('_')[1][:4]):  # title:
                        # this_year = int(paper.a.get('href').split('_')[1][:4])
                        title = slugify(paper.text.strip())
                        paper_dict['title'] = title
                        paper_index += 1
                        paper_list_bar.set_description_str(
                            f'Downloading paper {paper_index}: {title}')
                    elif '' != paper_dict['title'] and 'dd' == paper.name:
                        # a <dd> following a matched title holds pdf/supp links
                        all_as = paper.find_all('a')
                        for a in all_as:
                            if 'pdf' == slugify(a.text.strip()):
                                main_link = urllib.parse.urljoin(init_url,
                                                                 a.get('href'))
                                paper_dict['main link'] = main_link
                                is_new_paper = True
                            elif 'supp' == slugify(a.text.strip())[:4]:
                                supp_link = urllib.parse.urljoin(init_url,
                                                                 a.get('href'))
                                paper_dict['supplemental link'] = supp_link
                                break
                except:
                    # NOTE(review): bare except silently skips malformed
                    # entries — consider narrowing the exception type.
                    pass
                if is_new_paper:
                    writer.writerow(paper_dict)
                    paper_dict = {'title': '',
                                  'main link': '',
                                  'supplemental link': ''}
        else:
            # older years are scraped from the conference's own site
            init_url = f'http://www.eccv{year}.org/main-conference/'
            if os.path.exists(dat_file_pathname):
                with open(dat_file_pathname, 'rb') as f:
                    content = pickle.load(f)
            else:
                content = urlopen_with_retry(url=init_url, headers=headers)
                with open(dat_file_pathname, 'wb') as f:
                    pickle.dump(content, f)
            soup = BeautifulSoup(content, 'html5lib')
            paper_list_bar = tqdm(
                soup.find('div', {'class': 'entry-content'}).find_all(['p']))
            paper_index = 0
            paper_dict = {'title': '',
                          'main link': '',
                          'supplemental link': ''}
            for paper in paper_list_bar:
                try:
                    # a paper row contains a <strong> title, an <a> link and
                    # an <img>; anything else is skipped
                    if len(paper.find_all(['strong'])) and len(
                            paper.find_all(['a'])) and len(paper.find_all(['img'])):
                        paper_index += 1
                        title = slugify(paper.find('strong').text)
                        paper_dict['title'] = title
                        paper_list_bar.set_description_str(
                            f'Downloading paper {paper_index}: {title}')
                        main_link = paper.find('a').get('href')
                        paper_dict['main link'] = main_link
                        writer.writerow(paper_dict)
                        paper_dict = {'title': '',
                                      'main link': '',
                                      'supplemental link': ''}
                except Exception as e:
                    print(f'ERROR: {str(e)}')
    return paper_index
def download_from_csv(
        year, save_dir, is_download_supplement=True, time_step_in_seconds=5,
        total_paper_number=None,
        is_workshops=False, downloader='IDM'):
    """
    Download all ECCV papers and supplement files of one year, driven by the
    previously generated csv file; results go to save_dir/main_paper and
    save_dir/supplement respectively.
    :param year: int, ECCV year, such 2019
    :param save_dir: str, paper and supplement material's save path
    :param is_download_supplement: bool, True for downloading supplemental
        material
    :param time_step_in_seconds: int, the interval time between two download
        requests in seconds
    :param total_paper_number: int, the total number of papers that is going
        to download
    :param is_workshops: bool, is to download workshops from csv file.
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'
    :return: None
    """
    prefix = 'ECCV_WS' if is_workshops else 'ECCV'
    postfix = f'{prefix}_{year}'
    project_root_folder = r"D:\py\keyan_qingbao\te_u\paper_down_load"
    csv_file_path = os.path.join(
        project_root_folder, 'csv', f'{prefix}_{year}.csv')
    download_from_csv_i(
        postfix=postfix,
        save_dir=save_dir,
        csv_file_path=csv_file_path,
        is_download_supplement=is_download_supplement,
        time_step_in_seconds=time_step_in_seconds,
        total_paper_number=total_paper_number,
        downloader=downloader
    )
def download_from_springer(
        year, save_dir, is_workshops=False, time_sleep_in_seconds=5,
        downloader='IDM'):
    """Download every Springer volume of one ECCV year into save_dir.

    :param year: int, ECCV year (1990-2018, even years).
    :param save_dir: str, directory the pdf files are written to.
    :param is_workshops: bool, True to fetch workshop volumes instead of the
        main-conference volumes (some years have none, which is a no-op).
    :param time_sleep_in_seconds: int, pause between two download requests.
    :param downloader: str, downloader backend name, default 'IDM'.
    :raises ValueError: if no volume list is known for the given year.
    """
    os.makedirs(save_dir, exist_ok=True)

    def book(doi):
        # Build a Springer book url from its DOI suffix.
        return 'https://link.springer.com/book/' + doi

    # (main-conference DOIs, workshop DOIs) per supported year.
    volumes_by_year = {
        2018: (['10.1007/978-3-030-01246-5', '10.1007/978-3-030-01216-8',
                '10.1007/978-3-030-01219-9', '10.1007/978-3-030-01225-0',
                '10.1007/978-3-030-01228-1', '10.1007/978-3-030-01231-1',
                '10.1007/978-3-030-01234-2', '10.1007/978-3-030-01237-3',
                '10.1007/978-3-030-01240-3', '10.1007/978-3-030-01249-6',
                '10.1007/978-3-030-01252-6', '10.1007/978-3-030-01258-8',
                '10.1007/978-3-030-01261-8', '10.1007/978-3-030-01264-9',
                '10.1007/978-3-030-01267-0', '10.1007/978-3-030-01270-0'],
               ['10.1007/978-3-030-11009-3', '10.1007/978-3-030-11012-3',
                '10.1007/978-3-030-11015-4', '10.1007/978-3-030-11018-5',
                '10.1007/978-3-030-11021-5', '10.1007/978-3-030-11024-6']),
        2016: (['10.1007%2F978-3-319-46448-0', '10.1007%2F978-3-319-46475-6',
                '10.1007%2F978-3-319-46487-9', '10.1007%2F978-3-319-46493-0',
                '10.1007%2F978-3-319-46454-1', '10.1007%2F978-3-319-46466-4',
                '10.1007%2F978-3-319-46478-7', '10.1007%2F978-3-319-46484-8'],
               ['10.1007%2F978-3-319-46604-0', '10.1007%2F978-3-319-48881-3',
                '10.1007%2F978-3-319-49409-8']),
        2014: (['10.1007/978-3-319-10590-1', '10.1007/978-3-319-10605-2',
                '10.1007/978-3-319-10578-9', '10.1007/978-3-319-10593-2',
                '10.1007/978-3-319-10602-1', '10.1007/978-3-319-10599-4',
                '10.1007/978-3-319-10584-0'],
               ['10.1007/978-3-319-16178-5', '10.1007/978-3-319-16181-5',
                '10.1007/978-3-319-16199-0', '10.1007/978-3-319-16220-1']),
        2012: (['10.1007/978-3-642-33718-5', '10.1007/978-3-642-33709-3',
                '10.1007/978-3-642-33712-3', '10.1007/978-3-642-33765-9',
                '10.1007/978-3-642-33715-4', '10.1007/978-3-642-33783-3',
                '10.1007/978-3-642-33786-4'],
               ['10.1007/978-3-642-33863-2', '10.1007/978-3-642-33868-7',
                '10.1007/978-3-642-33885-4']),
        2010: (['10.1007/978-3-642-15549-9', '10.1007/978-3-642-15552-9',
                '10.1007/978-3-642-15558-1', '10.1007/978-3-642-15561-1',
                '10.1007/978-3-642-15555-0', '10.1007/978-3-642-15567-3'],
               ['10.1007/978-3-642-35749-7', '10.1007/978-3-642-35740-4']),
        2008: (['10.1007/978-3-540-88682-2', '10.1007/978-3-540-88688-4',
                '10.1007/978-3-540-88690-7', '10.1007/978-3-540-88693-8'],
               []),
        2006: (['10.1007/11744023', '10.1007/11744047',
                '10.1007/11744078', '10.1007/11744085'],
               ['10.1007/11754336']),
        2004: (['10.1007/b97865', '10.1007/b97866',
                '10.1007/b97871', '10.1007/b97873'],
               []),
        2002: (['10.1007/3-540-47969-4', '10.1007/3-540-47967-8',
                '10.1007/3-540-47977-5', '10.1007/3-540-47979-1'],
               []),
        2000: (['10.1007/3-540-45054-8', '10.1007/3-540-45053-X'], []),
        1998: (['10.1007/BFb0055655', '10.1007/BFb0054729'], []),
        1996: (['10.1007/BFb0015518', '10.1007/3-540-61123-1'], []),
        1994: (['10.1007/3-540-57956-7', '10.1007/BFb0028329'], []),
        1992: (['10.1007/3-540-55426-2'], []),
        1990: (['10.1007/BFb0014843'], []),
    }
    if year not in volumes_by_year:
        raise ValueError(f'ECCV {year} is current not available!')
    main_dois, workshop_dois = volumes_by_year[year]
    selected_dois = workshop_dois if is_workshops else main_dois
    for doi in selected_dois:
        __download_from_springer(
            book(doi), save_dir, year, is_workshops=is_workshops,
            time_sleep_in_seconds=time_sleep_in_seconds,
            downloader=downloader)
def __download_from_springer(
        url, save_dir, year, is_workshops=False, time_sleep_in_seconds=5,
        downloader='IDM'):
    """Download every paper listed on one Springer book page.

    :param url: str, Springer book url (one conference/workshop volume).
    :param save_dir: str, directory the pdf files are written to.
    :param year: int, ECCV year, used in the saved file names.
    :param is_workshops: bool, True if the volume is a workshop volume.
    :param time_sleep_in_seconds: int, pause between two download requests.
    :param downloader: str, downloader backend name passed to Downloader.
    :raises ValueError: if the paper list cannot be fetched after 3 attempts.
    """
    downloader = Downloader(downloader)
    papers_dict = None
    # retry the table-of-contents scrape up to 3 times
    for i in range(3):
        try:
            papers_dict = get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
    if papers_dict is None:
        # Fix: previously execution fell through with papers_dict unbound,
        # crashing below with a confusing NameError.
        raise ValueError(f'Failed to get paper list from {url}!')
    # total_paper_number = len(papers_dict)
    pbar = tqdm(papers_dict.keys())
    postfix = f'ECCV_{year}'
    if is_workshops:
        postfix = f'ECCV_WS_{year}'

    for name in pbar:
        pbar.set_description(f'Downloading paper {name}')
        save_file_path = os.path.join(save_dir, f'{name}_{postfix}.pdf')
        # skip papers already on disk
        if not os.path.exists(save_file_path):
            downloader.download(
                papers_dict[name],
                save_file_path,
                time_sleep_in_seconds)
if __name__ == '__main__':
    year = 2022
    # total_paper_number = 1645
    # 1. scrape ecva.net and write the csv of paper urls; the return value is
    #    the number of papers found
    total_paper_number = save_csv(year)
    # 2. download every main paper listed in the csv (no supplements)
    download_from_csv(year,
                      save_dir=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}',
                      is_download_supplement=False,
                      time_step_in_seconds=5,
                      total_paper_number=total_paper_number,
                      is_workshops=False)
    # move_main_and_supplement_2_one_directory(
    #     main_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\main_paper',
    #     supplement_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\supplement',
    #     supp_pdf_save_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\main_paper'
    # )
    # for year in range(2018, 2017, -2):
    #     # download_from_springer(
    #     #     save_dir=f'F:\\ECCV_{year}',
    #     #     year=year,
    #     #     is_workshops=False, time_sleep_in_seconds=30)
    #     download_from_springer(
    #         save_dir=f'F:\\ECCV_WS_{year}',
    #         year=year,
    #         is_workshops=True, time_sleep_in_seconds=30)
    #     pass