86 lines
2.6 KiB
Python
86 lines
2.6 KiB
Python
import arxiv_spider
|
|
import os
|
|
import time
|
|
from lib import utils
|
|
|
|
# cache tree:
|
|
# cache_root
|
|
# - topic-caches
|
|
# - feed_$(day).arxiv_daily_feed
|
|
# - feed_year_$(year).arxiv_feed
|
|
|
|
class arxiv_bot:
    """Collect the daily arXiv feed for a set of topics, with an on-disk cache.

    One ``arxiv_spider`` is created per topic, and each topic gets its own
    sub-directory under ``cache_dir``. A day's feed for a topic is stored in
    that sub-directory as ``feed_<day>.arxiv_daily_feed``, where ``<day>``
    is the string returned by ``utils.str_day()``.
    """

    def __init__(self, topics, cache_dir='./cache', arxiv_site='https://arxiv.org', log=False):
        """Create the cache directory and register the initial topics.

        Args:
            topics: Iterable of topic names (a single string is accepted too).
            cache_dir: Root directory of the on-disk cache; created if missing.
            arxiv_site: Base URL handed to every spider.
            log: When true, print progress messages.
        """
        self.log = log
        self.site = arxiv_site
        self.topics = []
        self.spiders = {}
        self.cache_dir = cache_dir
        self.topic_caches = {}
        # In-memory copy of the latest feed and the day it was built for.
        # They must exist before get_interested_paper() reads them.
        self.today_feed = None
        self.today = None
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(self.cache_dir, exist_ok=True)
        self.update_topics(topics)

    def update_topics(self, topics):
        """Register every topic in ``topics`` that is not already known.

        For each new topic this creates its cache directory (if missing) and
        its spider. Topics that are already registered are left untouched.
        """
        if isinstance(topics, str):
            # A bare string would otherwise be iterated character by character.
            topics = [topics]
        for topic in topics:
            if topic in self.topics:
                continue
            if self.log:
                print('Adding topic {0}.'.format(topic))
            topic_cache = os.path.join(self.cache_dir, topic)
            if not os.path.isdir(topic_cache):
                if self.log:
                    print('creating topic dir:', topic_cache)
                os.makedirs(topic_cache, exist_ok=True)
            spider = arxiv_spider.arxiv_spider(topic, self.site)
            # Record the topic only once everything it needs exists, so a
            # failure above does not leave a half-registered topic behind.
            self.topic_caches[topic] = topic_cache
            self.spiders[topic] = spider
            self.topics.append(topic)
            # The in-memory feed has no entry for the new topic; force a
            # rebuild on the next get_interested_paper() call.
            self.today_feed = None

    # load feed if it is already downloaded. If not, use spiders to get today's feed.
    def get_today_feed(self):
        """Return ``{topic: feed}`` for today, for every registered topic.

        A topic's feed is loaded from its cache file when that file exists.
        Otherwise it is fetched through the topic's spider, the abstract of
        every paper is downloaded, and the result is saved to the cache file.
        """
        today_feed = {}
        # The file name depends only on the day, so build it once.
        today_feed_name = 'feed_' + utils.str_day() + '.arxiv_daily_feed'
        for topic in self.topics:
            today_feed_path = os.path.join(self.topic_caches[topic], today_feed_name)
            if os.path.exists(today_feed_path):
                topic_feed = utils.load_python_object(today_feed_path)
            else:
                topic_feed = self.spiders[topic].get_today_paper()
                print('Fetching topic {0} papers...'.format(topic))
                for paper in self._iter_papers(topic_feed):
                    if self.log:
                        print('download abstract for paper', paper.info['title'])
                    paper.download_abstract()
                utils.save_python_object(topic_feed, today_feed_path)
            today_feed[topic] = topic_feed
        return today_feed

    def get_interested_paper(self, topic, keywords):
        """Split today's papers of ``topic`` by how they match ``keywords``.

        The in-memory feed is rebuilt when it is missing or was built for a
        different day.

        Args:
            topic: A registered topic name.
            keywords: Iterable of keywords (a single string is accepted too);
                matching is case-insensitive substring search.

        Returns:
            ``(strong, weak)``: papers with a keyword in the title, and papers
            with a keyword only in the abstract.

        Raises:
            KeyError: If ``topic`` is not part of the feed.
        """
        today = utils.str_day()
        # Compare by value: identity ("is not") on strings is unreliable.
        if self.today_feed is None or today != self.today:
            self.today_feed = self.get_today_feed()
            self.today = today
            print('Updating daily feed.')

        topic_papers = self._iter_papers(self.today_feed[topic])
        return self._match_papers(topic_papers, keywords)

    @staticmethod
    def _iter_papers(topic_feed):
        """Return the papers of one topic feed as a flat list.

        NOTE(review): the feed shape returned by the spider is not visible
        here. A mapping is treated as ``{day: [papers]}`` and flattened in key
        order; anything else is treated as an iterable of papers. Confirm
        against ``arxiv_spider.get_today_paper``.
        """
        if isinstance(topic_feed, dict):
            papers = []
            for day in topic_feed:
                papers += topic_feed[day]
            return papers
        return list(topic_feed)

    @staticmethod
    def _match_papers(papers, keywords):
        """Classify ``papers`` into ``(strong, weak)`` keyword matches."""
        if isinstance(keywords, str):
            keywords = [keywords]
        # Titles and abstracts are lowercased, so keywords must be as well.
        needles = [keyword.lower() for keyword in keywords]
        strong = []
        weak = []
        if not needles:
            return strong, weak
        for paper in papers:
            title = paper.info['title'].lower()
            if any(needle in title for needle in needles):
                strong.append(paper)
                continue
            abstract = paper.info['abstract'].lower()
            if any(needle in abstract for needle in needles):
                weak.append(paper)
        return strong, weak
|