import csv
import os
import pickle
import random
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup
from slugify import slugify
from tqdm import tqdm

# route all requests through a local proxy (adjust or remove if not needed)
os.environ['http_proxy'] = '127.0.0.1:7890'
os.environ['https_proxy'] = '127.0.0.1:7890'


class Downloader:
    """Minimal HTTP downloader.

    The `downloader` argument is kept for compatibility with the original
    'IDM'/'Thunder' options, but only the plain requests-based download is
    implemented here.
    """

    def __init__(self, downloader=None, is_random_step=None):
        self.downloader = downloader
        self.is_random_step = is_random_step

    def download(self, urls=None, save_path=None, time_sleep_in_seconds=None):
        print(urls)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                          'Gecko/20100101 Firefox/23.0'}
        content = urlopen_with_retry(url=urls, headers=headers)
        with open(save_path, 'wb') as f:
            f.write(content)
        # sleep between two adjacent requests; if is_random_step is set, the
        # step is sampled from Uniform(0.5t, 1.5t) as described in
        # download_from_csv_i's docstring
        if time_sleep_in_seconds:
            t = time_sleep_in_seconds
            if self.is_random_step:
                t = random.uniform(0.5 * t, 1.5 * t)
            time.sleep(t)
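
# Illustrative usage sketch (not executed anywhere in this script): grab one
# file with the plain-requests fallback. The URL and file name below are
# placeholders, not real paper links.
#
#     d = Downloader(downloader=None, is_random_step=True)
#     d.download(
#         urls='https://example.com/some_paper.pdf',
#         save_path='some_paper.pdf',
#         time_sleep_in_seconds=5)
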
def download_from_csv_i(
        postfix=None, save_dir=None, csv_file_path=None,
        is_download_main_paper=True, is_download_bib=True,
        is_download_supplement=True, time_step_in_seconds=5,
        total_paper_number=None, downloader='IDM', is_random_step=True):
    """
    download paper, bibtex and supplement files and save them to
    save_dir/main_paper and save_dir/supplement respectively
    :param postfix: str, postfix that will be added at the end of papers' title
    :param save_dir: str, paper and supplement material's save path
    :param csv_file_path: str, the full path to csv file
    :param is_download_main_paper: bool, True for downloading main paper
    :param is_download_bib: bool, True for downloading bibtex file
    :param is_download_supplement: bool, True for downloading supplemental
        material
    :param time_step_in_seconds: int, the interval time between two
        downloading requests in seconds
    :param total_paper_number: int, the total number of papers to download
    :param downloader: str, the downloader to use, could be 'IDM' or None,
        default to 'IDM'.
    :param is_random_step: bool, whether to randomly sample the time step
        between two adjacent download requests. If True, the time step will
        be sampled from Uniform(0.5t, 1.5t), where t is the given
        time_step_in_seconds. Default: True.
    :return: True
    """
    downloader = Downloader(
        downloader=downloader, is_random_step=is_random_step)
    if not os.path.exists(csv_file_path):
        raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')

    main_save_path = os.path.join(save_dir, 'main_paper')
    if is_download_main_paper:
        os.makedirs(main_save_path, exist_ok=True)
    if is_download_supplement:
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(supplement_save_path, exist_ok=True)

    error_log = []
    with open(csv_file_path, newline='') as csvfile:
        myreader = csv.DictReader(csvfile, delimiter=',')
        pbar = tqdm(myreader, total=total_paper_number)
        i = 0
        for this_paper in pbar:
            is_download_bib &= ('bib' in this_paper)
            is_grouped = ('group' in this_paper)
            i += 1
            # get title
            if is_grouped:
                group = slugify(this_paper['group'])
            title = slugify(this_paper['title'])
            if total_paper_number is not None:
                pbar.set_description(
                    f'Downloading {postfix} paper {i}/{total_paper_number}')
            else:
                pbar.set_description(f'Downloading {postfix} paper {i}')

            this_paper_main_path = os.path.join(
                main_save_path, f'{title}_{postfix}.pdf')
            if is_grouped:
                this_paper_main_path = os.path.join(
                    main_save_path, group, f'{title}_{postfix}.pdf')
            if is_download_supplement:
                this_paper_supp_path_no_ext = os.path.join(
                    supplement_save_path, f'{title}_{postfix}_supp.')
                if is_grouped:
                    this_paper_supp_path_no_ext = os.path.join(
                        supplement_save_path, group,
                        f'{title}_{postfix}_supp.')
                # skip papers whose main paper and supplement already exist
                if '' != this_paper['supplemental link'] and \
                        os.path.exists(this_paper_main_path) and \
                        (os.path.exists(this_paper_supp_path_no_ext + 'zip') or
                         os.path.exists(this_paper_supp_path_no_ext + 'pdf')):
                    continue
                elif '' == this_paper['supplemental link'] and \
                        os.path.exists(this_paper_main_path):
                    continue
            elif os.path.exists(this_paper_main_path):
                continue

            if 'error' == this_paper['main link']:
                error_log.append((title, 'no MAIN link'))
            elif '' != this_paper['main link']:
                if is_grouped:
                    if is_download_main_paper:
                        os.makedirs(
                            os.path.join(main_save_path, group),
                            exist_ok=True)
                    if is_download_supplement:
                        os.makedirs(
                            os.path.join(supplement_save_path, group),
                            exist_ok=True)
                if is_download_main_paper:
                    try:
                        # download main paper
                        if not os.path.exists(this_paper_main_path):
                            downloader.download(
                                urls=this_paper['main link'].replace(
                                    ' ', '%20'),
                                save_path=os.path.join(
                                    os.getcwd(), this_paper_main_path),
                                time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append(
                            (title, this_paper['main link'],
                             'main paper download error', str(e)))
                # download supplement
                if is_download_supplement:
                    # check whether the supp has already been downloaded
                    if not (os.path.exists(
                            this_paper_supp_path_no_ext + 'zip') or
                            os.path.exists(
                                this_paper_supp_path_no_ext + 'pdf')):
                        if 'error' == this_paper['supplemental link']:
                            error_log.append((title, 'no SUPPLEMENTAL link'))
                        elif '' != this_paper['supplemental link']:
                            supp_type = \
                                this_paper['supplemental link'].split('.')[-1]
                            try:
                                downloader.download(
                                    urls=this_paper['supplemental link'],
                                    save_path=os.path.join(
                                        os.getcwd(),
                                        this_paper_supp_path_no_ext +
                                        supp_type),
                                    time_sleep_in_seconds=time_step_in_seconds)
                            except Exception as e:
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append(
                                    (title, this_paper['supplemental link'],
                                     'supplement download error', str(e)))
                # download bibtex file
                if is_download_bib:
                    bib_path = this_paper_main_path[:-3] + 'bib'
                    if not os.path.exists(bib_path):
                        if 'error' == this_paper['bib']:
                            error_log.append((title, 'no bibtex link'))
                        elif '' != this_paper['bib']:
                            try:
                                downloader.download(
                                    urls=this_paper['bib'],
                                    save_path=os.path.join(
                                        os.getcwd(), bib_path),
                                    time_sleep_in_seconds=time_step_in_seconds)
                            except Exception as e:
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append(
                                    (title, this_paper['bib'],
                                     'bibtex download error', str(e)))
    # 2. write error log
    print('write error log')
    # the original snippet only printed a notice here; dump the collected
    # errors next to the downloads so failed papers can be retried later
    # (the file name below is chosen here, not prescribed by the source)
    with open(os.path.join(save_dir, 'download_err_log.txt'), 'w') as f:
        for log_entry in error_log:
            f.write(' | '.join(log_entry) + '\n')
    return True
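
# The csv file consumed above is expected to look roughly like the one written
# by save_csv() below: a header row with the columns 'title', 'main link' and
# 'supplemental link' (plus optional 'bib' and 'group' columns, which are only
# used when present). An empty string means "no link"; the literal string
# 'error' means the link could not be resolved. Example rows (made-up values):
#
#     title,main link,supplemental link
#     some-paper-title,https://www.ecva.net/papers/.../some_paper.pdf,
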
def get_paper_name_link_from_url(url):
    """Parse a SpringerLink book page, return {slugified title: pdf link}."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                      'Gecko/20100101 Firefox/23.0'}
    paper_dict = dict()

    content = urlopen_with_retry(url=url, headers=headers)
    soup = BeautifulSoup(content, 'html5lib')
    paper_list_bar = tqdm(soup.find_all(
        ['li'], {'class': 'chapter-item content-type-list__item'}))
    for paper in paper_list_bar:
        try:
            title = slugify(paper.find(
                'div', {'class': 'content-type-list__title'}).text)
            link = urllib.parse.urljoin(url, paper.find(
                'div', {'class': 'content-type-list__action'}).a.get('href'))
            paper_dict[title] = link
        except Exception as e:
            print(f'ERROR: {str(e)}')
    return paper_dict


def urlopen_with_retry(url, headers=None, retry_time=3, time_out=20,
                       raise_error_if_failed=True):
    """
    load content from url with given headers. Retry if an error occurs.
    Args:
        url (str): url.
        headers (dict): request headers. Default: None (no extra headers).
        retry_time (int): max retry time. Default: 3.
        time_out (int): timeout in seconds. Default: 20.
        raise_error_if_failed (bool): whether to raise an error if all
            retries failed. Default: True.
    Returns:
        content (bytes | None): url content. None will be returned if failed.
    """
    if headers is None:
        headers = {}
    for r in range(retry_time):
        try:
            # issue the request inside the loop so that every retry actually
            # re-sends it; the original version requested once before looping
            res = requests.get(url=url, headers=headers, timeout=time_out)
            res.raise_for_status()
            return res.content
        except requests.exceptions.HTTPError as e:
            print("The server couldn't fulfill the request.")
            print('Error code: ', e.response.status_code)
        except requests.exceptions.RequestException as e:
            print('We failed to reach a server.')
            print('Reason: ', e)
        s = random.randint(3, 7)
        print(f'random sleeping {s} seconds and doing {r + 1}/{retry_time}'
              f'-th retrying...')
        time.sleep(s)
    if raise_error_if_failed:
        raise ValueError(f'Failed to open {url} after trying {retry_time} '
                         f'times!')
    else:
        return None
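
# Illustrative sketch: urlopen_with_retry() is the single entry point for all
# HTTP fetches in this script; callers pass browser-like headers and get the
# raw response bytes back. The URL below is a placeholder.
#
#     html = urlopen_with_retry(
#         url='https://example.com/some_listing_page',
#         headers={'User-Agent': 'Mozilla/5.0'})
#     soup = BeautifulSoup(html, 'html5lib')
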
def save_csv(year):
    """
    write ECCV papers' and supplemental material's urls in one csv file
    :param year: int
    :return: int, number of papers found
    """
    project_root_folder = r"D:\py\keyan_qingbao\te_u\paper_down_load"
    csv_file_pathname = os.path.join(
        project_root_folder, 'csv', f'ECCV_{year}.csv')
    # make sure the output folders exist before writing into them
    os.makedirs(os.path.dirname(csv_file_pathname), exist_ok=True)
    with open(csv_file_pathname, 'w', newline='') as csvfile:
        fieldnames = ['title', 'main link', 'supplemental link']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                          'Gecko/20100101 Firefox/23.0'}
        dat_file_pathname = os.path.join(
            project_root_folder, 'urls', f'init_url_ECCV_{year}.dat')
        os.makedirs(os.path.dirname(dat_file_pathname), exist_ok=True)
        if year >= 2018:
            init_url = 'https://www.ecva.net/papers.php'
            if os.path.exists(dat_file_pathname):
                # reuse the cached listing page
                with open(dat_file_pathname, 'rb') as f:
                    content = pickle.load(f)
            else:
                content = urlopen_with_retry(url=init_url, headers=headers)
                with open(dat_file_pathname, 'wb') as f:
                    pickle.dump(content, f)
            soup = BeautifulSoup(content, 'html5lib')
            paper_list_bar = tqdm(soup.find_all(['dt', 'dd']))
            paper_index = 0
            paper_dict = {'title': '',
                          'main link': '',
                          'supplemental link': ''}
            for paper in paper_list_bar:
                is_new_paper = False
                # get title
                try:
                    if 'dt' == paper.name and \
                            'ptitle' == paper.get('class')[0] and \
                            year == int(paper.a.get('href').split('_')[1][:4]):
                        # title
                        title = slugify(paper.text.strip())
                        paper_dict['title'] = title
                        paper_index += 1
                        paper_list_bar.set_description_str(
                            f'Downloading paper {paper_index}: {title}')
                    elif '' != paper_dict['title'] and 'dd' == paper.name:
                        all_as = paper.find_all('a')
                        for a in all_as:
                            if 'pdf' == slugify(a.text.strip()):
                                main_link = urllib.parse.urljoin(
                                    init_url, a.get('href'))
                                paper_dict['main link'] = main_link
                                is_new_paper = True
                            elif 'supp' == slugify(a.text.strip())[:4]:
                                supp_link = urllib.parse.urljoin(
                                    init_url, a.get('href'))
                                paper_dict['supplemental link'] = supp_link
                                break
                except:
                    pass
                if is_new_paper:
                    writer.writerow(paper_dict)
                    paper_dict = {'title': '',
                                  'main link': '',
                                  'supplemental link': ''}
        else:
            init_url = f'http://www.eccv{year}.org/main-conference/'
            if os.path.exists(dat_file_pathname):
                with open(dat_file_pathname, 'rb') as f:
                    content = pickle.load(f)
            else:
                content = urlopen_with_retry(url=init_url, headers=headers)
                with open(dat_file_pathname, 'wb') as f:
                    pickle.dump(content, f)
            soup = BeautifulSoup(content, 'html5lib')
            paper_list_bar = tqdm(
                soup.find('div', {'class': 'entry-content'}).find_all(['p']))
            paper_index = 0
            paper_dict = {'title': '',
                          'main link': '',
                          'supplemental link': ''}
            for paper in paper_list_bar:
                try:
                    if len(paper.find_all(['strong'])) and \
                            len(paper.find_all(['a'])) and \
                            len(paper.find_all(['img'])):
                        paper_index += 1
                        title = slugify(paper.find('strong').text)
                        paper_dict['title'] = title
                        paper_list_bar.set_description_str(
                            f'Downloading paper {paper_index}: {title}')
                        main_link = paper.find('a').get('href')
                        paper_dict['main link'] = main_link
                        writer.writerow(paper_dict)
                        paper_dict = {'title': '',
                                      'main link': '',
                                      'supplemental link': ''}
                except Exception as e:
                    print(f'ERROR: {str(e)}')
    return paper_index
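
# For reference, the ecva.net listing parsed above is a flat series of
# <dt>/<dd> pairs; the structure the loop relies on is sketched below
# (illustrative, simplified markup, not a verbatim copy of the page):
#
#     <dt class="ptitle"><a href="papers/eccv_2022/...">Paper Title</a></dt>
#     <dd><a href="...pdf">pdf</a> <a href="...supp.pdf">supplementary
#         material</a></dd>
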
def download_from_csv(
        year, save_dir, is_download_supplement=True, time_step_in_seconds=5,
        total_paper_number=None, is_workshops=False, downloader='IDM'):
    """
    download all ECCV paper and supplement files of the given year and store
    them in save_dir/main_paper and save_dir/supplement respectively
    :param year: int, ECCV year, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param is_download_supplement: bool, True for downloading supplemental
        material
    :param time_step_in_seconds: int, the interval time between two download
        requests in seconds
    :param total_paper_number: int, the total number of papers to download
    :param is_workshops: bool, whether to download workshop papers from the
        csv file.
    :param downloader: str, the downloader to use, could be 'IDM' or
        'Thunder', default to 'IDM'
    :return: True
    """
    postfix = f'ECCV_{year}'
    if is_workshops:
        postfix = f'ECCV_WS_{year}'
    csv_file_name = f'ECCV_{year}.csv' if not is_workshops else \
        f'ECCV_WS_{year}.csv'
    project_root_folder = r"D:\py\keyan_qingbao\te_u\paper_down_load"
    csv_file_name = os.path.join(project_root_folder, 'csv', csv_file_name)
    download_from_csv_i(
        postfix=postfix,
        save_dir=save_dir,
        csv_file_path=csv_file_name,
        is_download_supplement=is_download_supplement,
        time_step_in_seconds=time_step_in_seconds,
        total_paper_number=total_paper_number,
        downloader=downloader
    )
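
# Illustrative call (the path below is a placeholder): downloading ECCV
# workshop papers uses the same csv pipeline, only with is_workshops=True so
# that the ECCV_WS_{year}.csv index and the _ECCV_WS_{year} postfix are used.
#
#     download_from_csv(
#         2022, save_dir=r'D:\ECCV_WS_2022',
#         is_download_supplement=False,
#         time_step_in_seconds=5,
#         is_workshops=True)
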
def download_from_springer(
        year, save_dir, is_workshops=False, time_sleep_in_seconds=5,
        downloader='IDM'):
    """Download ECCV papers of the given year from the SpringerLink books."""
    os.makedirs(save_dir, exist_ok=True)
    if 2018 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-030-01246-5',
                'https://link.springer.com/book/10.1007/978-3-030-01216-8',
                'https://link.springer.com/book/10.1007/978-3-030-01219-9',
                'https://link.springer.com/book/10.1007/978-3-030-01225-0',
                'https://link.springer.com/book/10.1007/978-3-030-01228-1',
                'https://link.springer.com/book/10.1007/978-3-030-01231-1',
                'https://link.springer.com/book/10.1007/978-3-030-01234-2',
                'https://link.springer.com/book/10.1007/978-3-030-01237-3',
                'https://link.springer.com/book/10.1007/978-3-030-01240-3',
                'https://link.springer.com/book/10.1007/978-3-030-01249-6',
                'https://link.springer.com/book/10.1007/978-3-030-01252-6',
                'https://link.springer.com/book/10.1007/978-3-030-01258-8',
                'https://link.springer.com/book/10.1007/978-3-030-01261-8',
                'https://link.springer.com/book/10.1007/978-3-030-01264-9',
                'https://link.springer.com/book/10.1007/978-3-030-01267-0',
                'https://link.springer.com/book/10.1007/978-3-030-01270-0'
            ]
        else:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-030-11009-3',
                'https://link.springer.com/book/10.1007/978-3-030-11012-3',
                'https://link.springer.com/book/10.1007/978-3-030-11015-4',
                'https://link.springer.com/book/10.1007/978-3-030-11018-5',
                'https://link.springer.com/book/10.1007/978-3-030-11021-5',
                'https://link.springer.com/book/10.1007/978-3-030-11024-6'
            ]
    elif 2016 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007%2F978-3-319-46448-0',
                'https://link.springer.com/book/10.1007%2F978-3-319-46475-6',
                'https://link.springer.com/book/10.1007%2F978-3-319-46487-9',
                'https://link.springer.com/book/10.1007%2F978-3-319-46493-0',
                'https://link.springer.com/book/10.1007%2F978-3-319-46454-1',
                'https://link.springer.com/book/10.1007%2F978-3-319-46466-4',
                'https://link.springer.com/book/10.1007%2F978-3-319-46478-7',
                'https://link.springer.com/book/10.1007%2F978-3-319-46484-8'
            ]
        else:
            urls_list = [
                'https://link.springer.com/book/10.1007%2F978-3-319-46604-0',
                'https://link.springer.com/book/10.1007%2F978-3-319-48881-3',
                'https://link.springer.com/book/10.1007%2F978-3-319-49409-8'
            ]
    elif 2014 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-319-10590-1',
                'https://link.springer.com/book/10.1007/978-3-319-10605-2',
                'https://link.springer.com/book/10.1007/978-3-319-10578-9',
                'https://link.springer.com/book/10.1007/978-3-319-10593-2',
                'https://link.springer.com/book/10.1007/978-3-319-10602-1',
                'https://link.springer.com/book/10.1007/978-3-319-10599-4',
                'https://link.springer.com/book/10.1007/978-3-319-10584-0'
            ]
        else:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-319-16178-5',
                'https://link.springer.com/book/10.1007/978-3-319-16181-5',
                'https://link.springer.com/book/10.1007/978-3-319-16199-0',
                'https://link.springer.com/book/10.1007/978-3-319-16220-1'
            ]
    elif 2012 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-642-33718-5',
                'https://link.springer.com/book/10.1007/978-3-642-33709-3',
                'https://link.springer.com/book/10.1007/978-3-642-33712-3',
                'https://link.springer.com/book/10.1007/978-3-642-33765-9',
                'https://link.springer.com/book/10.1007/978-3-642-33715-4',
                'https://link.springer.com/book/10.1007/978-3-642-33783-3',
                'https://link.springer.com/book/10.1007/978-3-642-33786-4'
            ]
        else:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-642-33863-2',
                'https://link.springer.com/book/10.1007/978-3-642-33868-7',
                'https://link.springer.com/book/10.1007/978-3-642-33885-4'
            ]
    elif 2010 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-642-15549-9',
                'https://link.springer.com/book/10.1007/978-3-642-15552-9',
                'https://link.springer.com/book/10.1007/978-3-642-15558-1',
                'https://link.springer.com/book/10.1007/978-3-642-15561-1',
                'https://link.springer.com/book/10.1007/978-3-642-15555-0',
                'https://link.springer.com/book/10.1007/978-3-642-15567-3'
            ]
        else:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-642-35749-7',
                'https://link.springer.com/book/10.1007/978-3-642-35740-4'
            ]
    elif 2008 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/978-3-540-88682-2',
                'https://link.springer.com/book/10.1007/978-3-540-88688-4',
                'https://link.springer.com/book/10.1007/978-3-540-88690-7',
                'https://link.springer.com/book/10.1007/978-3-540-88693-8'
            ]
        else:
            urls_list = []
    elif 2006 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/11744023',
                'https://link.springer.com/book/10.1007/11744047',
                'https://link.springer.com/book/10.1007/11744078',
                'https://link.springer.com/book/10.1007/11744085'
            ]
        else:
            urls_list = [
                'https://link.springer.com/book/10.1007/11754336'
            ]
    elif 2004 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/b97865',
                'https://link.springer.com/book/10.1007/b97866',
                'https://link.springer.com/book/10.1007/b97871',
                'https://link.springer.com/book/10.1007/b97873'
            ]
        else:
            urls_list = []
    elif 2002 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/3-540-47969-4',
                'https://link.springer.com/book/10.1007/3-540-47967-8',
                'https://link.springer.com/book/10.1007/3-540-47977-5',
                'https://link.springer.com/book/10.1007/3-540-47979-1'
            ]
        else:
            urls_list = []
    elif 2000 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/3-540-45054-8',
                'https://link.springer.com/book/10.1007/3-540-45053-X'
            ]
        else:
            urls_list = []
    elif 1998 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/BFb0055655',
                'https://link.springer.com/book/10.1007/BFb0054729'
            ]
        else:
            urls_list = []
    elif 1996 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/BFb0015518',
                'https://link.springer.com/book/10.1007/3-540-61123-1'
            ]
        else:
            urls_list = []
    elif 1994 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/3-540-57956-7',
                'https://link.springer.com/book/10.1007/BFb0028329'
            ]
        else:
            urls_list = []
    elif 1992 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/3-540-55426-2'
            ]
        else:
            urls_list = []
    elif 1990 == year:
        if not is_workshops:
            urls_list = [
                'https://link.springer.com/book/10.1007/BFb0014843'
            ]
        else:
            urls_list = []
    else:
        raise ValueError(f'ECCV {year} is currently not available!')
    for url in urls_list:
        __download_from_springer(
            url, save_dir, year, is_workshops=is_workshops,
            time_sleep_in_seconds=time_sleep_in_seconds,
            downloader=downloader)
def __download_from_springer(
        url, save_dir, year, is_workshops=False, time_sleep_in_seconds=5,
        downloader='IDM'):
    downloader = Downloader(downloader)
    papers_dict = dict()  # fall back to an empty dict if all retries fail
    for i in range(3):
        try:
            papers_dict = get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
    # total_paper_number = len(papers_dict)
    pbar = tqdm(papers_dict.keys())
    postfix = f'ECCV_{year}'
    if is_workshops:
        postfix = f'ECCV_WS_{year}'
    for name in pbar:
        pbar.set_description(f'Downloading paper {name}')
        if not os.path.exists(os.path.join(save_dir, f'{name}_{postfix}.pdf')):
            downloader.download(
                papers_dict[name],
                os.path.join(save_dir, f'{name}_{postfix}.pdf'),
                time_sleep_in_seconds)


if __name__ == '__main__':
    year = 2022
    # total_paper_number = 1645
    total_paper_number = save_csv(year)
    download_from_csv(
        year,
        save_dir=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}',
        is_download_supplement=False,
        time_step_in_seconds=5,
        total_paper_number=total_paper_number,
        is_workshops=False)
    # move_main_and_supplement_2_one_directory(
    #     main_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\main_paper',
    #     supplement_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\supplement',
    #     supp_pdf_save_path=fr'D:\py\keyan_qingbao\te_u\paper_down_load\ECCV_{year}\main_paper'
    # )
    # for year in range(2018, 2017, -2):
    #     # download_from_springer(
    #     #     save_dir=f'F:\\ECCV_{year}',
    #     #     year=year,
    #     #     is_workshops=False, time_sleep_in_seconds=30)
    #     download_from_springer(
    #         save_dir=f'F:\\ECCV_WS_{year}',
    #         year=year,
    #         is_workshops=True, time_sleep_in_seconds=30)
    #     pass