# ArxivRobot/arxiv_spider.py

import requests
import pickle
import time
from lib import utils
from lib.parser import dom_node, simple_parser
import socket
import socks
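# Optional SOCKS5 proxy support: when use_proxy is enabled, socket.socket is
# monkey-patched so every connection made by requests goes through the proxy.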
use_proxy = False
if use_proxy:
SOCKS5_PROXY_HOST = '127.0.0.1'
SOCKS5_PROXY_PORT = 1080
default_socket = socket.socket
socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
socket.socket = socks.socksocket
class arxiv_paper():
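    """A single arXiv paper: its id plus an info dict with the keys
    title, authors, comments, subjects, and abstract."""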
def __init__(self, arxiv_id = None, paper_info = None):
self.arxiv_id = arxiv_id
self.info = paper_info
def add_author(self, author):
        self.info['authors'].append(author)
def title(self):
return self.info['title']
def describe(self):
information = ''
information += 'ID: {0} (https://arxiv.org/abs/{0})\n'.format(self.arxiv_id)
for key in self.info:
if self.info[key] is not None:
info = utils.formal_text(self.info[key])
information += ('\t' + key + ':' + str(info) + '\n')
return information
def show(self):
print(self.describe())
def to_html(self):
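        """Render this paper as a <paper-section> dom_node tree (title link,
        pdf link, authors, abstract, comments, subjects) and return the HTML."""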
dom_tree = dom_node(name = 'paper-section')
paper_title = None
paper_link = None
paper_authors = None
paper_comments = None
paper_subjects = None
paper_abstract = None
for key in self.info:
if self.info[key] is not None:
if key == 'title':
paper_title = dom_node('paper-title')
link_attr = {'href':'https://arxiv.org/abs/{0}'.format(self.arxiv_id)}
link_node = dom_node('a', link_attr)
link_node.data = self.info[key]
paper_title.add_child(link_node)
paper_link = dom_node('paper-pdf-link')
pdf_link_attr = {'href':'https://arxiv.org/pdf/{0}'.format(self.arxiv_id)}
pdf_link = dom_node('a', pdf_link_attr)
pdf_link.data = '{0} | [pdf]'.format(self.arxiv_id)
paper_link.add_child(pdf_link)
elif key == 'authors':
paper_authors = dom_node('paper-authors')
                    paper_authors.data = ', '.join(self.info[key])
elif key == 'comments':
paper_comments = dom_node('paper-comments')
paper_comments.data = self.info[key]
elif key == 'subjects':
paper_subjects = dom_node('paper-subjects')
paper_subjects.data = self.info[key]
elif key == 'abstract':
paper_abstract = dom_node('paper-abstract')
paper_abstract.data = self.info[key]
dom_tree.add_child(paper_title)
dom_tree.add_child(paper_link)
dom_tree.add_child(paper_authors)
dom_tree.add_child(paper_abstract)
dom_tree.add_child(paper_comments)
dom_tree.add_child(paper_subjects)
html = dom_tree.to_string()
return html
def download_abstract(self, forcemode=False):
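        """Scrape the abstract from the paper's /abs page via the og:description
        meta tag; skipped if already downloaded, unless forcemode is set."""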
if not forcemode:
if self.info['abstract'] is not None:
# print('skipping download abstract since already downloaded')
                return
r = requests.get('https://arxiv.org/abs/' + self.arxiv_id)
parser = simple_parser()
parser.feed(r.text)
tree = parser.root
meta_nodes = tree.search('meta')
for meta_node in meta_nodes:
meta_attr = meta_node.attributes
if 'property' in meta_attr:
if meta_attr['property'] == 'og:description':
self.info['abstract'] = utils.formal_text(meta_attr['content'])
                    return
class arxiv_list_parser():
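    """Parses an arxiv.org /list HTML page (its <dt>/<dd> entries and the
    day-navigation links) into arxiv_paper objects and counts."""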
def __init__(self, html_page):
self.html_page = html_page
self.parser = simple_parser()
self.parser.feed(html_page)
self.tree = self.parser.root
def get_arxiv_id(self, dt_node):
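        """Extract the arXiv id from a <dt> listing node (the href of its
        abstract link); returns None if the node has no children."""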
if len(dt_node.childs) == 0:
return None
else:
arxiv_id = dt_node.childs[1].childs[0].attributes['href']
arxiv_id = arxiv_id.split('/')[-1]
return arxiv_id
def get_paper_info(self, dd_node):
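        """Build an info dict (title/authors/comments/subjects) from a <dd>
        listing node; the abstract stays None until download_abstract() runs."""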
title = None
authors = []
comments = None
subjects = None
if len(dd_node.childs) == 0:
return None
else:
elements = dd_node.childs[0].childs
for element in elements:
if 'class' in element.attributes:
element_class = element.attributes['class']
if element_class == 'list-title mathjax':
title = utils.formal_text(element.data)
elif element_class == 'list-authors':
for child in element.childs:
if child.name == 'a':
authors.append(utils.formal_text(child.data))
elif element_class == 'list-comments mathjax':
comments = utils.formal_text(element.data)
elif element_class == 'list-subjects':
subjects = utils.formal_text(element.data)
paper_info = {
'title':title,
'authors':authors,
'comments':comments,
'subjects':subjects,
'abstract':None
}
return paper_info
def get_papers(self):
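        """Pair each <dt> (id) with its <dd> (metadata) and return the
        resulting arxiv_paper list, skipping malformed entries."""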
dts = self.tree.search('dt')
dds = self.tree.search('dd')
papers = []
for dt, dd in zip(dts, dds):
arxiv_id = self.get_arxiv_id(dt)
            if arxiv_id is None:
                continue
            paper_info = self.get_paper_info(dd)
            if paper_info is None:
                continue
paper = arxiv_paper(arxiv_id, paper_info)
papers.append(paper)
return papers
def get_paper_num(self):
        total_paper_text = self.tree.search('small')[0].data
        total_num_split = total_paper_text.split(' ')
num_total = 0
for split in total_num_split:
if split.isdigit():
num_total = int(split)
                break
return num_total
def get_recent_info(self):
# get each day start id and day_name
day_name = []
day_start = []
li_nodes = self.tree.search('ul')[0].childs
for li in li_nodes:
link = li.childs[0].attributes['href']
start = None
if link.find('#item') != -1:
start = link.split('#')[-1][4:]
else:
start = link.split('=')[-2].split('&')[0]
day_name.append(li.childs[0].data)
day_start.append(int(start))
# get total paper num
num_total = self.get_paper_num()
# get each day num.
num_days = len(day_start)
day_num = []
for i in range(num_days):
if i < num_days - 1:
day_num.append(day_start[i+1] - day_start[i])
else:
day_num.append(num_total - day_start[i])
# generate final info.
recent_papers_info = {}
for day, start, num in zip(day_name, day_start, day_num):
current_day_info = {}
current_day_info['start'] = start
current_day_info['num'] = num
recent_papers_info[day] = current_day_info
return recent_papers_info
class arxiv_spider():
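    """Crawler for one arXiv topic (e.g. 'cs.CV'): scrapes the /list pages,
    the export API, and the RSS feed."""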
def __init__(self, topic, arxiv_url = 'https://arxiv.org'):
self.link = arxiv_url
self.topic = topic
self.base_url = self.link + '/list/' + self.topic
def get_yearly_papers(self, year, log=False):
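        """Fetch all papers listed under <base_url>/<year>: one request to read
        the total count, then one request with show=<total> for the full list."""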
yearly_url = self.base_url + '/' + year
if log:
print('visiting url [{0}] for basic information'.format(yearly_url))
r = requests.get(yearly_url)
list_parser = arxiv_list_parser(r.text)
total_num = list_parser.get_paper_num()
print('Total Number for this year:', total_num)
yearly_url_all = yearly_url + '?skip={0}&show={1}'.format(0, total_num)
if log:
print('visiting url [{0}] for all papers'.format(yearly_url_all))
r = requests.get(yearly_url_all)
list_parser = arxiv_list_parser(r.text)
yearly_papers = list_parser.get_papers()
return yearly_papers
    # Return format of get_recent_papers / get_today_paper_backup:
    # papers = {
    #     '<day name string>': [list of arxiv_paper objects],
    # }
def get_papers_on_search_list(self, search_url, log=True):
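        """Parse an export-API Atom response (one <entry> per paper) into
        arxiv_paper objects; abstracts are filled from the <summary> field."""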
if log:
print('visiting url [{0}] for today papers.'.format(search_url))
search_content = requests.get(search_url)
search_content = search_content.text
parser = simple_parser()
parser.feed(search_content)
tree = parser.root
paper_nodes = tree.search('entry')
print('num_searched_nodes:', len(paper_nodes))
papers = []
for node in paper_nodes:
arxiv_id = node.search('id')[0].data.split('/')[-1]
title = node.search('title')[0].data
author_nodes = node.search('name')
authors = [item.data for item in author_nodes]
category_nodes = node.search('category')
categories = [item.attributes['term'] for item in category_nodes]
            subjects = ','.join(categories)
            comment_nodes = node.search('arxiv:comment')
            if len(comment_nodes) == 0:
                comments = ''
            else:
                comments = comment_nodes[0].data
abstract = node.search('summary')[0].data
title = utils.formal_text(title)
subjects = utils.formal_text(subjects)
comments = utils.formal_text(comments)
abstract = utils.formal_text(abstract)
paper_info = {
'title':title,
'authors':authors,
'comments':comments,
'subjects':subjects,
'abstract':abstract
}
paper = arxiv_paper(arxiv_id, paper_info)
papers.append(paper)
return papers
def get_papers_by_ids(self, ids, log=True):
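        """Query the export API for the given ids in batches of 10 (a batch
        size chosen here to keep query URLs short, not an API limit)."""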
        num_groups = (len(ids) + 9) // 10  # ceiling division: ids are queried in batches of 10
        if log:
            print('splitting into {0} groups.'.format(num_groups))
papers = []
for i in range(num_groups):
this_batch = ids[i * 10:(i+1)*10]
            id_list = ','.join(this_batch)
search_url = 'http://export.arxiv.org/api/query?id_list=' + id_list
batch_papers = self.get_papers_on_search_list(search_url, log)
papers += batch_papers
return papers
def get_today_ids(self, log=True):
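        """Collect today's paper ids from the topic's RSS feed (the
        rdf:resource links of its rdf:li nodes)."""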
rss_url = 'http://export.arxiv.org/rss/{0}'.format(self.topic)
if log:
print('visiting url [{0}] for today papers id.'.format(rss_url))
rss_content = requests.get(rss_url)
rss_content = rss_content.text
parser = simple_parser()
parser.feed(rss_content)
rss = parser.root
id_nodes = rss.search('rdf:li')
paper_ids = []
for node in id_nodes:
paper_link = node.attributes['rdf:resource']
paper_id = paper_link.split('/')[-1]
paper_ids.append(paper_id)
print('num_paper_ids:', len(paper_ids))
return paper_ids
def get_today_paper(self, return_day_name=False, log=True):
today_ids = self.get_today_ids(log)
papers = self.get_papers_by_ids(today_ids)
print('num of papers:', len(papers))
return papers
def get_today_paper_backup(self, return_day_name=False):
papers = self.get_recent_papers(recent_days=[1])
today = None
paper = None
for day in papers:
today = day
paper = papers[day]
if return_day_name:
return paper, today
else:
return paper
def get_recent_papers(self, recent_days=[1, 2, 3, 4, 5], log=False):
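        """Fetch the /recent listing; recent_days selects days by 1-based
        position (1 = the most recent listing day). Returns {day: [papers]}."""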
recent_url = self.base_url + '/recent'
if log:
print('visiting url [{0}] for basic information'.format(recent_url))
r = requests.get(recent_url)
list_parser = arxiv_list_parser(r.text)
recent_papers_info = list_parser.get_recent_info()
print('paper info:', recent_papers_info)
day_id = 1
papers = {}
for day in recent_papers_info:
if day_id in recent_days:
today_start = recent_papers_info[day]['start']
today_num = recent_papers_info[day]['num']
page_url = '/pastweek?skip={0}&show={1}'.format(today_start, today_num)
day_url = self.base_url + page_url
if log:
print('visiting url [{0}] for paper on day {1}'.format(day_url, day))
r = requests.get(day_url)
list_parser = arxiv_list_parser(r.text)
today_papers = list_parser.get_papers()
papers[day] = today_papers
day_id += 1
return papers
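

# A minimal usage sketch, assuming this module is run directly; 'cs.CV' is an
# example topic, nothing in this module mandates it.
if __name__ == '__main__':
    spider = arxiv_spider('cs.CV')
    todays_papers = spider.get_today_paper()
    for paper in todays_papers[:3]:
        paper.show()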