import requests
import pickle
import time

from lib import utils
from lib.parser import dom_node, simple_parser

import socket
import socks

# Optionally route all traffic through a local SOCKS5 proxy.
use_proxy = False
if use_proxy:
    SOCKS5_PROXY_HOST = '127.0.0.1'
    SOCKS5_PROXY_PORT = 1080
    default_socket = socket.socket  # keep a reference to the original socket class
    socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
    socket.socket = socks.socksocket


class arxiv_paper():
    # A single arXiv paper: its id plus a dict of metadata
    # (title, authors, comments, subjects, abstract).
    def __init__(self, arxiv_id=None, paper_info=None):
        self.arxiv_id = arxiv_id
        self.info = paper_info

    def add_author(self, author):
        self.info['authors'].append(author)

    def title(self):
        return self.info['title']

    def describe(self):
        # Build a plain-text summary of the paper.
        information = ''
        information += 'ID: {0} (https://arxiv.org/abs/{0})\n'.format(self.arxiv_id)
        for key in self.info:
            if self.info[key] is not None:
                info = utils.formal_text(self.info[key])
                information += ('\t' + key + ':' + str(info) + '\n')
        return information

    def show(self):
        print(self.describe())

    def to_html(self):
        # Render the paper as a small custom-tag DOM tree and serialize it to HTML.
        dom_tree = dom_node(name='paper-section')
        paper_title = None
        paper_link = None
        paper_authors = None
        paper_comments = None
        paper_subjects = None
        paper_abstract = None
        for key in self.info:
            if self.info[key] is not None:
                if key == 'title':
                    paper_title = dom_node('paper-title')
                    link_attr = {'href': 'https://arxiv.org/abs/{0}'.format(self.arxiv_id)}
                    link_node = dom_node('a', link_attr)
                    link_node.data = self.info[key]
                    paper_title.add_child(link_node)
                    paper_link = dom_node('paper-pdf-link')
                    pdf_link_attr = {'href': 'https://arxiv.org/pdf/{0}'.format(self.arxiv_id)}
                    pdf_link = dom_node('a', pdf_link_attr)
                    pdf_link.data = '{0} | [pdf]'.format(self.arxiv_id)
                    paper_link.add_child(pdf_link)
                elif key == 'authors':
                    paper_authors = dom_node('paper-authors')
                    authors_string = ''
                    for author in self.info[key]:
                        authors_string += author + ', '
                    authors_string = authors_string[:-2]
                    paper_authors.data = authors_string
                elif key == 'comments':
                    paper_comments = dom_node('paper-comments')
                    paper_comments.data = self.info[key]
                elif key == 'subjects':
                    paper_subjects = dom_node('paper-subjects')
                    paper_subjects.data = self.info[key]
                elif key == 'abstract':
                    paper_abstract = dom_node('paper-abstract')
                    paper_abstract.data = self.info[key]
        dom_tree.add_child(paper_title)
        dom_tree.add_child(paper_link)
        dom_tree.add_child(paper_authors)
        dom_tree.add_child(paper_abstract)
        dom_tree.add_child(paper_comments)
        dom_tree.add_child(paper_subjects)
        html = dom_tree.to_string()
        return html

    def download_abstract(self, forcemode=False):
        # Fetch the abstract from the paper's abs page unless it is already cached.
        if not forcemode:
            if self.info['abstract'] is not None:
                # print('skipping download abstract since already downloaded')
                return
        r = requests.get('https://arxiv.org/abs/' + self.arxiv_id)
        parser = simple_parser()
        parser.feed(r.text)
        tree = parser.root
        meta_nodes = tree.search('meta')
        for meta_node in meta_nodes:
            meta_attr = meta_node.attributes
            if 'property' in meta_attr:
                if meta_attr['property'] == 'og:description':
                    self.info['abstract'] = utils.formal_text(meta_attr['content'])
                    return


class arxiv_list_parser():
    # Parses an arXiv listing page (e.g. /list/<topic>/recent) into arxiv_paper objects.
    def __init__(self, html_page):
        self.html_page = html_page
        self.parser = simple_parser()
        self.parser.feed(html_page)
        self.tree = self.parser.root

    def get_arxiv_id(self, dt_node):
        # The <dt> element holds the abstract link whose last path segment is the arXiv id.
        if len(dt_node.childs) == 0:
            return None
        else:
            arxiv_id = dt_node.childs[1].childs[0].attributes['href']
            arxiv_id = arxiv_id.split('/')[-1]
            return arxiv_id

    def get_paper_info(self, dd_node):
        # The <dd> element holds title, authors, comments and subjects.
        title = None
        authors = []
        comments = None
        subjects = None
        if len(dd_node.childs) == 0:
            return None
        else:
            elements = dd_node.childs[0].childs
            for element in elements:
                if 'class' in element.attributes:
                    element_class = element.attributes['class']
                    if element_class == 'list-title mathjax':
                        title = utils.formal_text(element.data)
                    elif element_class == 'list-authors':
                        for child in element.childs:
                            if child.name == 'a':
                                authors.append(utils.formal_text(child.data))
                    elif element_class == 'list-comments mathjax':
                        comments = utils.formal_text(element.data)
                    elif element_class == 'list-subjects':
                        subjects = utils.formal_text(element.data)
            paper_info = {
                'title': title,
                'authors': authors,
                'comments': comments,
                'subjects': subjects,
                'abstract': None
            }
            return paper_info

    def get_papers(self):
        # Pair each <dt> (id) with its <dd> (metadata) and build arxiv_paper objects.
        dts = self.tree.search('dt')
        dds = self.tree.search('dd')
        papers = []
        for dt, dd in zip(dts, dds):
            arxiv_id = self.get_arxiv_id(dt)
            if arxiv_id is None:
                continue
            paper_info = self.get_paper_info(dd)
            if paper_info is None:
                continue
            paper = arxiv_paper(arxiv_id, paper_info)
            papers.append(paper)
        return papers

    def get_paper_num(self):
        # The <small> node contains text like "total of N entries"; take the first number.
        totally_paper_node = self.tree.search('small')[0].data
        total_num_split = totally_paper_node.split(' ')
        num_total = 0
        for split in total_num_split:
            if split.isdigit():
                num_total = int(split)
                break
        return num_total

    def get_recent_info(self):
        # get each day's start index and day name
        day_name = []
        day_start = []
        li_nodes = self.tree.search('ul')[0].childs
        for li in li_nodes:
            link = li.childs[0].attributes['href']
            start = None
            if link.find('#item') != -1:
                start = link.split('#')[-1][4:]
            else:
                start = link.split('=')[-2].split('&')[0]
            day_name.append(li.childs[0].data)
            day_start.append(int(start))
        # get total paper num
        num_total = self.get_paper_num()
        # get each day's paper count
        num_days = len(day_start)
        day_num = []
        for i in range(num_days):
            if i < num_days - 1:
                day_num.append(day_start[i + 1] - day_start[i])
            else:
                day_num.append(num_total - day_start[i])
        # generate final info
        recent_papers_info = {}
        for day, start, num in zip(day_name, day_start, day_num):
            current_day_info = {}
            current_day_info['start'] = start
            current_day_info['num'] = num
            recent_papers_info[day] = current_day_info
        return recent_papers_info


class arxiv_spider():
    # Fetches arXiv papers for a topic via listing pages, the export API and the RSS feed.
    def __init__(self, topic, arxiv_url='https://arxiv.org'):
        self.link = arxiv_url
        self.topic = topic
        self.base_url = self.link + '/list/' + self.topic

    def get_yearly_papers(self, year, log=False):
        yearly_url = self.base_url + '/' + year
        if log:
            print('visiting url [{0}] for basic information'.format(yearly_url))
        r = requests.get(yearly_url)
        list_parser = arxiv_list_parser(r.text)
        total_num = list_parser.get_paper_num()
        print('Total Number for this year:', total_num)
        yearly_url_all = yearly_url + '?skip={0}&show={1}'.format(0, total_num)
        if log:
            print('visiting url [{0}] for all papers'.format(yearly_url_all))
        r = requests.get(yearly_url_all)
        list_parser = arxiv_list_parser(r.text)
        yearly_papers = list_parser.get_papers()
        return yearly_papers

    # papers:
    # papers = {
    #     'key is day string': [content is a list of arxiv_paper class]
    # }
    def get_papers_on_search_list(self, search_url, log=True):
        # Query the arXiv export API and turn each <entry> into an arxiv_paper.
        if log:
            print('visiting url [{0}] for today papers.'.format(search_url))
        search_content = requests.get(search_url)
        search_content = search_content.text
        parser = simple_parser()
        parser.feed(search_content)
        tree = parser.root
        paper_nodes = tree.search('entry')
        print('num_searched_nodes:', len(paper_nodes))
        papers = []
        for node in paper_nodes:
            arxiv_id = node.search('id')[0].data.split('/')[-1]
            title = node.search('title')[0].data
            author_nodes = node.search('name')
            authors = [item.data for item in author_nodes]
            category_nodes = node.search('category')
            categories = [item.attributes['term'] for item in category_nodes]
            subjects = ''
            for cat in categories:
                subjects += cat + ','
            subjects = subjects[:-1]
            comments_node = node.search('arxiv:comment')
            if len(comments_node) == 0:
                comments = ''
            else:
                comments = comments_node[0].data
            abstract = node.search('summary')[0].data
            title = utils.formal_text(title)
            subjects = utils.formal_text(subjects)
            comments = utils.formal_text(comments)
            abstract = utils.formal_text(abstract)
            paper_info = {
                'title': title,
                'authors': authors,
                'comments': comments,
                'subjects': subjects,
                'abstract': abstract
            }
            paper = arxiv_paper(arxiv_id, paper_info)
            papers.append(paper)
        return papers

    def get_papers_by_ids(self, ids, log=True):
        # The export API is queried in batches of 10 ids (ceiling division).
        num_groups = (len(ids) + 9) // 10
        if log:
            print('splitting into {0} groups.'.format(num_groups))
        papers = []
        for i in range(num_groups):
            this_batch = ids[i * 10:(i + 1) * 10]
            id_list = ''
            for paper_id in this_batch:
                id_list += paper_id + ','
            id_list = id_list[:-1]
            search_url = 'http://export.arxiv.org/api/query?id_list=' + id_list
            batch_papers = self.get_papers_on_search_list(search_url, log)
            papers += batch_papers
        return papers

    def get_today_ids(self, log=True):
        # Read today's paper ids from the topic's RSS feed.
        rss_url = 'http://export.arxiv.org/rss/{0}'.format(self.topic)
        if log:
            print('visiting url [{0}] for today papers id.'.format(rss_url))
        rss_content = requests.get(rss_url)
        rss_content = rss_content.text
        parser = simple_parser()
        parser.feed(rss_content)
        rss = parser.root
        id_nodes = rss.search('rdf:li')
        paper_ids = []
        for node in id_nodes:
            paper_link = node.attributes['rdf:resource']
            paper_id = paper_link.split('/')[-1]
            paper_ids.append(paper_id)
        print('num_paper_ids:', len(paper_ids))
        return paper_ids

    def get_today_paper(self, return_day_name=False, log=True):
        today_ids = self.get_today_ids(log)
        papers = self.get_papers_by_ids(today_ids)
        print('num of papers:', len(papers))
        return papers

    def get_today_paper_backup(self, return_day_name=False):
        # Fallback path that scrapes the /recent listing instead of the RSS feed.
        papers = self.get_recent_papers(recent_days=[1])
        today = None
        paper = None
        for day in papers:
            today = day
            paper = papers[day]
        if return_day_name:
            return paper, today
        else:
            return paper

    def get_recent_papers(self, recent_days=[1, 2, 3, 4, 5], log=False):
        recent_url = self.base_url + '/recent'
        if log:
            print('visiting url [{0}] for basic information'.format(recent_url))
        r = requests.get(recent_url)
        list_parser = arxiv_list_parser(r.text)
        recent_papers_info = list_parser.get_recent_info()
        print('paper info:', recent_papers_info)
        day_id = 1
        papers = {}
        for day in recent_papers_info:
            if day_id in recent_days:
                today_start = recent_papers_info[day]['start']
                today_num = recent_papers_info[day]['num']
                page_url = '/pastweek?skip={0}&show={1}'.format(today_start, today_num)
                day_url = self.base_url + page_url
                if log:
                    print('visiting url [{0}] for paper on day {1}'.format(day_url, day))
                r = requests.get(day_url)
                list_parser = arxiv_list_parser(r.text)
                today_papers = list_parser.get_papers()
                papers[day] = today_papers
            day_id += 1
        return papers
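

# A minimal, hypothetical usage sketch (not part of the original module): the
# 'cs.CV' topic is an assumption chosen for illustration; any arXiv listing
# name (e.g. 'cs.CL', 'math.CO') would work the same way with the classes above.
if __name__ == '__main__':
    spider = arxiv_spider('cs.CV')
    # Today's submissions, fetched via the RSS feed and the export API.
    todays_papers = spider.get_today_paper()
    for paper in todays_papers:
        paper.show()
    # The same papers rendered as HTML fragments.
    for paper in todays_papers:
        print(paper.to_html())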