import arxiv_spider
import os
import time
from lib import utils

# Cache tree:
#   cache_root/
#     <topic>/                          one directory per tracked topic
#       feed_<day>.arxiv_daily_feed     daily feed written by get_today_feed()
#       feed_year_<year>.arxiv_feed     yearly feed (not written by the code below)


class arxiv_bot():
    """Track a set of arXiv topics and pick out papers matching keywords.

    One spider (``arxiv_spider.arxiv_spider``) is created per topic. Daily
    feeds are serialized under ``cache_dir/<topic>/`` so that a feed is
    fetched at most once per topic per day.
    """

    def __init__(self, topics, cache_dir='./cache', arxiv_site='https://arxiv.org', log=False):
        """Create the cache root and register the initial topics.

        Args:
            topics: iterable of topic names; each becomes a cache
                sub-directory and gets its own spider.
            cache_dir: root directory of the on-disk feed cache.
            arxiv_site: base URL handed to every spider.
            log: when true, print progress messages.
        """
        self.log = log
        self.site = arxiv_site
        self.topics = []
        self.spiders = {}
        self.cache_dir = cache_dir
        self.topic_caches = {}
        # In-memory copy of today's feed and the day it was built for.
        # Both are read by get_interested_paper(), so they must exist
        # before the first call.
        self.today_feed = None
        self.today = None

        os.makedirs(self.cache_dir, exist_ok=True)
        self.update_topics(topics)

    def update_topics(self, topics):
        """Register every topic in ``topics`` that is not tracked yet.

        For each new topic this records its cache directory (creating it
        when missing) and builds a spider for it. Topics that are already
        tracked are left untouched.
        """
        for topic in topics:
            if topic in self.topics:
                continue

            self.topics.append(topic)
            if self.log:
                print('Adding topic {0}.'.format(topic))

            topic_cache = os.path.join(self.cache_dir, topic)
            self.topic_caches[topic] = topic_cache
            if not os.path.isdir(topic_cache):
                if self.log:
                    print('creating topic dir:', topic_cache)
                # exist_ok guards against the directory appearing between
                # the check above and this call.
                os.makedirs(topic_cache, exist_ok=True)

            self.spiders[topic] = arxiv_spider.arxiv_spider(topic, self.site)
            # The in-memory feed has no entry for the new topic; drop it so
            # the next get_interested_paper() call rebuilds it.
            self.today_feed = None

    @staticmethod
    def _iter_papers(topic_feed):
        """Return the papers of one topic feed as a flat list.

        The two consumers of a topic feed in this class originally disagreed
        on its shape (a plain sequence of papers vs. a ``{day: [papers]}``
        mapping). The spider's actual return type is not visible here, so
        both shapes are accepted.
        NOTE(review): confirm against arxiv_spider.get_today_paper() and
        simplify once the shape is known.
        """
        if not topic_feed:
            return []
        if isinstance(topic_feed, dict):
            papers = []
            for day_papers in topic_feed.values():
                papers.extend(day_papers)
            return papers
        return list(topic_feed)

    def get_today_feed(self):
        """Return ``{topic: feed}`` for today, for every tracked topic.

        A feed already cached on disk for today is loaded as-is. Otherwise
        the topic's spider fetches it, the abstract of every paper is
        downloaded, and the result is saved to the cache.
        """
        today_feed = {}
        # presumably a string identifying the current day; it is only used
        # here to build the cache file name.
        today = utils.str_day()
        feed_name = 'feed_' + today + '.arxiv_daily_feed'

        for topic in self.topics:
            feed_path = os.path.join(self.topic_caches[topic], feed_name)

            if os.path.exists(feed_path):
                topic_feed = utils.load_python_object(feed_path)
            else:
                print('Fetching topic {0} papers...'.format(topic))
                topic_feed = self.spiders[topic].get_today_paper()
                for paper in self._iter_papers(topic_feed):
                    if self.log:
                        print('download abstract for paper', paper.info['title'])
                    paper.download_abstract()
                # Saved only after all abstracts are in, so a cached feed
                # is always complete.
                utils.save_python_object(topic_feed, feed_path)

            today_feed[topic] = topic_feed

        return today_feed

    def get_interested_paper(self, topic, keywords):
        """Split today's papers of ``topic`` by how they match ``keywords``.

        Matching is a case-insensitive substring search.

        Args:
            topic: a tracked topic name.
            keywords: iterable of keyword strings.

        Returns:
            ``(strong, weak)``: papers with a keyword in the title, and
            papers with a keyword only in the abstract.

        Raises:
            KeyError: if ``topic`` is not a tracked topic.
        """
        today = utils.str_day()
        # Compare by value: two equal day strings are not guaranteed to be
        # the same object.
        if self.today_feed is None or today != self.today:
            self.today_feed = self.get_today_feed()
            self.today = today
            print('Updating daily feed.')

        # Titles and abstracts are lower-cased below, so the keywords must
        # be as well or mixed-case keywords could never match.
        needles = [keyword.lower() for keyword in keywords]

        strong = []
        weak = []
        for paper in self._iter_papers(self.today_feed[topic]):
            title = paper.info['title'].lower()
            if any(needle in title for needle in needles):
                strong.append(paper)
                continue

            abstract = paper.info['abstract'].lower()
            if any(needle in abstract for needle in needles):
                weak.append(paper)

        return strong, weak