diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fbd985f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.pyc
+__pycache__/
+cache/
+feeds/
+config/email_session.xml
+config/subscriber.xml
+config.py
\ No newline at end of file
diff --git a/README.md b/README.md
index eec0658..52836d4 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,22 @@
-# ArxivRobot
+# What is This?
+This is a simple, naive arXiv robot. It fetches today's updated papers from arXiv in a specified topic, filters the papers by given keywords, and sends the result to a given email address.
+# What do I need to do to run this code?
+
+## package requirements:
+You only need to install croniter: ```pip install croniter``` will do it.
+
+## configuration:
+
+1. create config.py, a sample is given in config-examples.py
+2. create config/subscriber.xml, a sample is also given in /config/subscriber_example.xml
+
+## run this code.
+```python main.py```
+
+If everything goes okay, you will see a shell interface, type help for more information.
+
+# PS
+I am not a great coder and not good at writing documentation and comments. If you run into any trouble, feel free to open an issue and I will try my best to fix the problem.
+
+The code was pushed in a hurry; I will add a document explaining this code when I have free time.
diff --git a/analysis_paper.py b/analysis_paper.py
new file mode 100644
index 0000000..e8a5f9f
--- /dev/null
+++ b/analysis_paper.py
@@ -0,0 +1,65 @@
+from arxiv_spider import arxiv_paper
+import utils
+import numpy as np
+
+authors = {}
+
+years = ['2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']
+
+for year in years:
+ print('Analysising year:', year)
+ papers = utils.load_python_object('./feeds/' + year)
+ for paper in papers:
+ author_this_paper = paper.info['authors']
+ for author in author_this_paper:
+ author = utils.delete_n(author)
+ if author in authors:
+ authors[author] += 1
+ else:
+ authors[author] = 1
+
+freq = []
+names = []
+for author in authors:
+ freq.append(authors[author])
+ names.append(author)
+
+freq = np.asarray(freq, dtype=np.int32)
+
+freq_sort = np.argsort(freq)
+
+num_authors = len(names)
+
+for i in range(num_authors):
+ aid = freq_sort[num_authors - i - 1]
+ print('Name: {0} | papers: {1}'.format(names[aid], freq[aid]).encode('utf-8'))
+
+
+# keywords = {}
+
+# years = ['2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']
+
+# for year in years:
+# print('Analysising year:', year)
+# papers = utils.load_python_object('./feeds/' + year)
+# for paper in papers:
+# keyword_this_paper = paper.info['title'].split(' ')
+# for keyword in keyword_this_paper:
+# keyword = utils.delete_n(keyword).lower()
+# if keyword in keywords:
+# keywords[keyword] += 1
+# else:
+# keywords[keyword] = 1
+
+# freq = []
+# names = []
+# for keyword in keywords:
+# freq.append(keywords[keyword])
+# names.append(keyword)
+# freq = np.asarray(freq, dtype=np.int32)
+# freq_sort = np.argsort(freq)
+# num_keywords = len(names)
+
+# for i in range(num_keywords):
+# aid = freq_sort[num_keywords - i - 1]
+# print('Keyword: {0} | papers: {1}'.format(names[aid], freq[aid]).encode('utf-8'))
\ No newline at end of file
diff --git a/arxiv_bot.py b/arxiv_bot.py
new file mode 100644
index 0000000..b545c8f
--- /dev/null
+++ b/arxiv_bot.py
@@ -0,0 +1,85 @@
+import arxiv_spider
+import os
+import time
+from lib import utils
+
+# cache tree:
+# cache_root
+# - topic-caches
+# - feed_$(time).arxiv_feed
+# - feed_year_$(year).arxiv_feed
+
+class arxiv_bot():
+ def __init__(self, topics, cache_dir='./cache', arxiv_site='https://arxiv.org', log=False):
+ self.log = log
+ self.site = arxiv_site
+ self.topics = []
+ self.spiders = {}
+ self.cache_dir = cache_dir
+ self.topic_caches = {}
+ if not os.path.isdir(self.cache_dir):
+ os.makedirs(self.cache_dir)
+ self.update_topics(topics)
+
+ def update_topics(self, topics):
+ for topic in topics:
+ if topic not in self.topics:
+ self.topics.append(topic)
+ if self.log:
+ print('Adding topic {0}.'.format(topic))
+ topic_cache = os.path.join(self.cache_dir, topic)
+ self.topic_caches[topic] = topic_cache
+ if not os.path.isdir(topic_cache):
+ if self.log:
+ print('creating topic dir:', topic_cache)
+ os.makedirs(topic_cache)
+ self.spiders[topic] = arxiv_spider.arxiv_spider(topic, self.site)
+
+ # load feed if it is already downloaded. If not, use spiders to get today's feed.
+ def get_today_feed(self):
+ today_feed = {}
+ today = utils.str_day()
+ for topic in self.topics:
+ today_feed_name = 'feed_' + today + '.arxiv_daily_feed'
+ today_feed_path = os.path.join(self.cache_dir, topic, today_feed_name)
+ cache_dir = self.topic_caches[topic]
+ topic_feed = None
+ if os.path.exists(today_feed_path):
+ topic_feed = utils.load_python_object(today_feed_path)
+ else:
+ topic_feed = self.spiders[topic].get_today_paper()
+ print('Fetching topic {0} papers...'.format(topic))
+ for paper in topic_feed:
+ if self.log:
+ print('download abstract for paper', paper.info['title'])
+ paper.download_abstract()
+ utils.save_python_object(topic_feed, today_feed_path)
+ today_feed[topic] = topic_feed
+ return today_feed
+
+ def get_interested_paper(self, topic, keywords):
+ if self.today_feed is None or utils.str_day() is not self.today:
+ self.today_feed = self.get_today_feed()
+ self.today = utils.str_day()
+ print('Updating daily feed.')
+
+ topic_feed = self.today_feed[topic]
+ topic_papers = []
+ for day in topic_feed:
+ topic_papers += topic_feed[day]
+ strong = []
+ weak = []
+ for paper in topic_papers:
+ strong_match = False
+ weak_match = False
+ for keyword in keywords:
+ if paper.info['title'].lower().find(keyword) != -1:
+ strong_match = True
+ break
+ elif paper.info['abstract'].lower().find(keyword) != -1:
+ weak_match = True
+ if strong_match:
+ strong.append(paper)
+ elif weak_match:
+ weak.append(paper)
+ return strong, weak
diff --git a/arxiv_service.py b/arxiv_service.py
new file mode 100644
index 0000000..c2af06d
--- /dev/null
+++ b/arxiv_service.py
@@ -0,0 +1,31 @@
+import arxiv_bot
+import feeds
+import email_sender
+import time
+import subscriber_utils
+import utils
+from croniter import croniter
+import threading
+
+class test_service():
+ def __init__(self, name):
+ self.name = name
+ pass
+
+ def do(self):
+ print('Job {0} run!'.format(self.name))
+
+class mail_service():
+ def __init__(self, emailer):
+ self.emailer = emailer
+
+ def do(self):
+ self.emailer.send_daily_email()
+
+class reload_subscriber():
+ def __init__(self, subscriber_mgr):
+ self.mgr = subscriber_mgr
+
+ def do(self):
+ self.mgr.load()
+
diff --git a/arxiv_spider.py b/arxiv_spider.py
new file mode 100644
index 0000000..e1ae13d
--- /dev/null
+++ b/arxiv_spider.py
@@ -0,0 +1,369 @@
+import requests
+import pickle
+import time
+from lib import utils
+from lib.parser import dom_node, simple_parser
+
+import socket
+import socks
+
+use_proxy = False
+if use_proxy:
+ SOCKS5_PROXY_HOST = '127.0.0.1'
+ SOCKS5_PROXY_PORT = 1080
+ default_socket = socket.socket
+ socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
+ socket.socket = socks.socksocket
+
+class arxiv_paper():
+ def __init__(self, arxiv_id = None, paper_info = None):
+ self.arxiv_id = arxiv_id
+ self.info = paper_info
+
+ def add_author(self, author):
+ self.info['authors'].append(authors)
+
+ def title(self):
+ return self.info['title']
+
+
+ def describe(self):
+ information = ''
+ information += 'ID: {0} (https://arxiv.org/abs/{0})\n'.format(self.arxiv_id)
+ for key in self.info:
+ if self.info[key] is not None:
+ info = utils.formal_text(self.info[key])
+ information += ('\t' + key + ':' + str(info) + '\n')
+ return information
+
+ def show(self):
+ print(self.describe())
+
+ def to_html(self):
+ dom_tree = dom_node(name = 'paper-section')
+ paper_title = None
+ paper_link = None
+ paper_authors = None
+ paper_comments = None
+ paper_subjects = None
+ paper_abstract = None
+ for key in self.info:
+ if self.info[key] is not None:
+ if key == 'title':
+ paper_title = dom_node('paper-title')
+ link_attr = {'href':'https://arxiv.org/abs/{0}'.format(self.arxiv_id)}
+ link_node = dom_node('a', link_attr)
+ link_node.data = self.info[key]
+ paper_title.add_child(link_node)
+ paper_link = dom_node('paper-pdf-link')
+ pdf_link_attr = {'href':'https://arxiv.org/pdf/{0}'.format(self.arxiv_id)}
+ pdf_link = dom_node('a', pdf_link_attr)
+ pdf_link.data = '{0} | [pdf]'.format(self.arxiv_id)
+ paper_link.add_child(pdf_link)
+
+ elif key == 'authors':
+ paper_authors = dom_node('paper-authors')
+ authors_string = ''
+ for author in self.info[key]:
+ authors_string += author + ', '
+ authors_string = authors_string[:-2]
+ paper_authors.data = authors_string
+
+ elif key == 'comments':
+ paper_comments = dom_node('paper-comments')
+ paper_comments.data = self.info[key]
+
+ elif key == 'subjects':
+ paper_subjects = dom_node('paper-subjects')
+ paper_subjects.data = self.info[key]
+
+ elif key == 'abstract':
+ paper_abstract = dom_node('paper-abstract')
+ paper_abstract.data = self.info[key]
+ dom_tree.add_child(paper_title)
+ dom_tree.add_child(paper_link)
+ dom_tree.add_child(paper_authors)
+ dom_tree.add_child(paper_abstract)
+ dom_tree.add_child(paper_comments)
+ dom_tree.add_child(paper_subjects)
+ html = dom_tree.to_string()
+ return html
+
+ def download_abstract(self, forcemode=False):
+ if not forcemode:
+ if self.info['abstract'] is not None:
+ # print('skipping download abstract since already downloaded')
+ return;
+ r = requests.get('https://arxiv.org/abs/' + self.arxiv_id)
+ parser = simple_parser()
+ parser.feed(r.text)
+ tree = parser.root
+ meta_nodes = tree.search('meta')
+ for meta_node in meta_nodes:
+ meta_attr = meta_node.attributes
+ if 'property' in meta_attr:
+ if meta_attr['property'] == 'og:description':
+ self.info['abstract'] = utils.formal_text(meta_attr['content'])
+ return;
+
+class arxiv_list_parser():
+ def __init__(self, html_page):
+ self.html_page = html_page
+ self.parser = simple_parser()
+ self.parser.feed(html_page)
+ self.tree = self.parser.root
+
+ def get_arxiv_id(self, dt_node):
+ if len(dt_node.childs) == 0:
+ return None
+ else:
+ arxiv_id = dt_node.childs[1].childs[0].attributes['href']
+ arxiv_id = arxiv_id.split('/')[-1]
+ return arxiv_id
+
+ def get_paper_info(self, dd_node):
+ title = None
+ authors = []
+ comments = None
+ subjects = None
+ if len(dd_node.childs) == 0:
+ return None
+ else:
+ elements = dd_node.childs[0].childs
+ for element in elements:
+ if 'class' in element.attributes:
+ element_class = element.attributes['class']
+ if element_class == 'list-title mathjax':
+ title = utils.formal_text(element.data)
+ elif element_class == 'list-authors':
+ for child in element.childs:
+ if child.name == 'a':
+ authors.append(utils.formal_text(child.data))
+ elif element_class == 'list-comments mathjax':
+ comments = utils.formal_text(element.data)
+ elif element_class == 'list-subjects':
+ subjects = utils.formal_text(element.data)
+ paper_info = {
+ 'title':title,
+ 'authors':authors,
+ 'comments':comments,
+ 'subjects':subjects,
+ 'abstract':None
+ }
+ return paper_info
+
+ def get_papers(self):
+ dts = self.tree.search('dt')
+ dds = self.tree.search('dd')
+ papers = []
+ for dt, dd in zip(dts, dds):
+ arxiv_id = self.get_arxiv_id(dt)
+ if arxiv_id == None:
+ continue;
+ paper_info = self.get_paper_info(dd)
+ if paper_info == None:
+ continue;
+ paper = arxiv_paper(arxiv_id, paper_info)
+ papers.append(paper)
+ return papers
+
+ def get_paper_num(self):
+ totally_paper_node = self.tree.search('small')[0].data
+ total_num_split = totally_paper_node.split(' ')
+ num_total = 0
+ for split in total_num_split:
+ if split.isdigit():
+ num_total = int(split)
+ break;
+ return num_total
+
+ def get_recent_info(self):
+ # get each day start id and day_name
+ day_name = []
+ day_start = []
+ li_nodes = self.tree.search('ul')[0].childs
+ for li in li_nodes:
+ link = li.childs[0].attributes['href']
+ start = None
+ if link.find('#item') != -1:
+ start = link.split('#')[-1][4:]
+ else:
+ start = link.split('=')[-2].split('&')[0]
+ day_name.append(li.childs[0].data)
+ day_start.append(int(start))
+ # get total paper num
+ num_total = self.get_paper_num()
+ # get each day num.
+ num_days = len(day_start)
+ day_num = []
+ for i in range(num_days):
+ if i < num_days - 1:
+ day_num.append(day_start[i+1] - day_start[i])
+ else:
+ day_num.append(num_total - day_start[i])
+
+ # generate final info.
+ recent_papers_info = {}
+ for day, start, num in zip(day_name, day_start, day_num):
+ current_day_info = {}
+ current_day_info['start'] = start
+ current_day_info['num'] = num
+ recent_papers_info[day] = current_day_info
+ return recent_papers_info
+
+class arxiv_spider():
+ def __init__(self, topic, arxiv_url = 'https://arxiv.org'):
+ self.link = arxiv_url
+ self.topic = topic
+ self.base_url = self.link + '/list/' + self.topic
+
+
+ def get_yearly_papers(self, year, log=False):
+ yearly_url = self.base_url + '/' + year
+ if log:
+ print('visiting url [{0}] for basic information'.format(yearly_url))
+ r = requests.get(yearly_url)
+ list_parser = arxiv_list_parser(r.text)
+ total_num = list_parser.get_paper_num()
+ print('Total Number for this year:', total_num)
+ yearly_url_all = yearly_url + '?skip={0}&show={1}'.format(0, total_num)
+ if log:
+ print('visiting url [{0}] for all papers'.format(yearly_url_all))
+ r = requests.get(yearly_url_all)
+ list_parser = arxiv_list_parser(r.text)
+ yearly_papers = list_parser.get_papers()
+ return yearly_papers
+
+ # papers:
+ # papers = {
+ # 'key is day string': [content is a list of arxiv_paper class]
+ # }
+
+ def get_papers_on_search_list(self, search_url, log=True):
+ if log:
+ print('visiting url [{0}] for today papers.'.format(search_url))
+ search_content = requests.get(search_url)
+ search_content = search_content.text
+ parser = simple_parser()
+ parser.feed(search_content)
+ tree = parser.root
+ paper_nodes = tree.search('entry')
+ print('num_searched_nodes:', len(paper_nodes))
+ papers = []
+ for node in paper_nodes:
+ arxiv_id = node.search('id')[0].data.split('/')[-1]
+ title = node.search('title')[0].data
+ author_nodes = node.search('name')
+ authors = [item.data for item in author_nodes]
+ category_nodes = node.search('category')
+ categories = [item.attributes['term'] for item in category_nodes]
+ subjects = ''
+ for cat in categories:
+ subjects += cat + ','
+ subjects = subjects[:-1]
+ comments_node = node.search('arxiv:comment')
+ if len(comments_node) == 0:
+ comments = ''
+ else:
+ comments = node.search('arxiv:comment')[0].data
+ abstract = node.search('summary')[0].data
+
+ title = utils.formal_text(title)
+ subjects = utils.formal_text(subjects)
+ comments = utils.formal_text(comments)
+ abstract = utils.formal_text(abstract)
+
+
+ paper_info = {
+ 'title':title,
+ 'authors':authors,
+ 'comments':comments,
+ 'subjects':subjects,
+ 'abstract':abstract
+ }
+
+ paper = arxiv_paper(arxiv_id, paper_info)
+ papers.append(paper)
+ return papers
+
+ def get_papers_by_ids(self, ids, log=True):
+ num_groups = int((len(ids) + 9.1)/10)
+ if log:
+ print('spliting into {0} groups.'.format(num_groups))
+ papers = []
+ for i in range(num_groups):
+ this_batch = ids[i * 10:(i+1)*10]
+ id_list = ''
+ for paper_id in this_batch:
+ id_list += paper_id + ','
+ id_list = id_list[:-1]
+ search_url = 'http://export.arxiv.org/api/query?id_list=' + id_list
+ batch_papers = self.get_papers_on_search_list(search_url, log)
+ papers += batch_papers
+ return papers
+
+
+ def get_today_ids(self, log=True):
+ rss_url = 'http://export.arxiv.org/rss/{0}'.format(self.topic)
+ if log:
+ print('visiting url [{0}] for today papers id.'.format(rss_url))
+ rss_content = requests.get(rss_url)
+ rss_content = rss_content.text
+ parser = simple_parser()
+ parser.feed(rss_content)
+ rss = parser.root
+ id_nodes = rss.search('rdf:li')
+ paper_ids = []
+ for node in id_nodes:
+ paper_link = node.attributes['rdf:resource']
+ paper_id = paper_link.split('/')[-1]
+ paper_ids.append(paper_id)
+ print('num_paper_ids:', len(paper_ids))
+ return paper_ids
+
+ def get_today_paper(self, return_day_name=False, log=True):
+ today_ids = self.get_today_ids(log)
+ papers = self.get_papers_by_ids(today_ids)
+ print('num of papers:', len(papers))
+ return papers
+
+
+
+ def get_today_paper_backup(self, return_day_name=False):
+ papers = self.get_recent_papers(recent_days=[1])
+ today = None
+ paper = None
+ for day in papers:
+ today = day
+ paper = papers[day]
+ if return_day_name:
+ return paper, today
+ else:
+ return paper
+
+
+ def get_recent_papers(self, recent_days=[1, 2, 3, 4, 5], log=False):
+ recent_url = self.base_url + '/recent'
+ if log:
+ print('visiting url [{0}] for basic information'.format(recent_url))
+ r = requests.get(recent_url)
+ list_parser = arxiv_list_parser(r.text)
+ recent_papers_info = list_parser.get_recent_info()
+ print('paper info:', recent_papers_info)
+
+ day_id = 1
+ papers = {}
+ for day in recent_papers_info:
+ if day_id in recent_days:
+ today_start = recent_papers_info[day]['start']
+ today_num = recent_papers_info[day]['num']
+ page_url = '/pastweek?skip={0}&show={1}'.format(today_start, today_num)
+ day_url = self.base_url + page_url
+ if log:
+ print('visiting url [{0}] for paper on day {1}'.format(day_url, day))
+ r = requests.get(day_url)
+ list_parser = arxiv_list_parser(r.text)
+ today_papers = list_parser.get_papers()
+ papers[day] = today_papers
+ day_id += 1
+ return papers
diff --git a/config-examples.py b/config-examples.py
new file mode 100644
index 0000000..e617471
--- /dev/null
+++ b/config-examples.py
@@ -0,0 +1,9 @@
+
+#### email related config ####
+username = 'email@email.com' # send email using this account
+password = 'yourpassword' # your email login password
+sender_name = 'ArxivRobot' # the name of your robot
+replyto = 'yourmail@mail.com' # all reply emails will be forwarded to this email address
+
+smtp_ssl_addr = 'smtp.smtp.com'
+# smtp server, only ssl supported. you can support more by editing function send_mail in email_sender.py
diff --git a/config/style.css b/config/style.css
new file mode 100644
index 0000000..62e1cdc
--- /dev/null
+++ b/config/style.css
@@ -0,0 +1,109 @@
+
+
+
\ No newline at end of file
diff --git a/config/subscriber_example.xml b/config/subscriber_example.xml
new file mode 100644
index 0000000..09a65e9
--- /dev/null
+++ b/config/subscriber_example.xml
@@ -0,0 +1,27 @@
+
+ name1
+ mail1@mail.com
+
+ cs.CV
+ cs.LG
+ stat.ML
+
+
+ keyword1
+ keyword2
+
+
+
+
+ name2
+ mail2@mail.com
+
+ cs.CV
+ cs.LG
+ stat.ML
+
+
+ keyword1
+ keyword2
+
+
diff --git a/download_html.py b/download_html.py
new file mode 100644
index 0000000..d2a02c2
--- /dev/null
+++ b/download_html.py
@@ -0,0 +1,10 @@
+import requests
+from html.parser import HTMLParser
+
+# r = requests.get('https://arxiv.org/list/cs.CV/recent')
+r = requests.get('https://arxiv.org/list/cs.CV/recent')
+# r = requests.get('http://xxx.itp.ac.cn/list/cs.CV/recent')
+# r = requests.get('https://arxiv.org/list/cs.CV/pastweek?skip=25&show=25')
+
+# print(r.status_code)
+print(r.text)
diff --git a/email_sender.py b/email_sender.py
new file mode 100644
index 0000000..a98afe5
--- /dev/null
+++ b/email_sender.py
@@ -0,0 +1,138 @@
+from lib.parser import dom_node, simple_parser
+from lib import parser
+from lib import utils
+import os
+import config
+
+import smtplib
+import email
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.mime.image import MIMEImage
+from email.mime.base import MIMEBase
+from email.mime.application import MIMEApplication
+from email.header import Header
+from email import generator
+def send_mail(reciver, title, content):
+
+ # with open('email.html', 'w', encoding="utf-8") as f:
+ # f.write(content)
+ # 20:24
+
+ username = config.username
+ password = config.password
+ replyto = config.replyto
+ msg = MIMEMultipart('alternative')
+ msg['Subject'] = Header(title)
+ msg['From'] = '%s <%s>' % (Header(config.sender_name), username)
+ msg['To'] = reciver
+ msg['Reply-to'] = replyto
+ msg['Message-id'] = email.utils.make_msgid()
+ msg['Date'] = email.utils.formatdate()
+ texthtml = MIMEText(content, _subtype='html', _charset='UTF-8')
+ msg.attach(texthtml)
+
+ # with open('email.eml', 'w') as outfile:
+ # gen = generator.Generator(outfile)
+ # gen.flatten(msg)
+
+ try:
+ client = smtplib.SMTP_SSL(config.smtp_ssl_addr)
+ # client.connect('smtpdm-ap-southeast-1.aliyun.com', 80)
+ client.set_debuglevel(0)
+ client.login(username, password)
+ client.sendmail(username, reciver, msg.as_string())
+
+ client.quit()
+ print ('Email send to {0} success!'.format(reciver))
+ return True
+ except smtplib.SMTPConnectError as e:
+ print ('Connection Error:', e.smtp_code, e.smtp_error)
+ except smtplib.SMTPAuthenticationError as e:
+ print ('Authentication Error:', e.smtp_code, e.smtp_error)
+ except smtplib.SMTPSenderRefused as e:
+ print ('Sender Refused:', e.smtp_code, e.smtp_error)
+ except smtplib.SMTPRecipientsRefused as e:
+ print ('SMTPRecipients Refused:', e.smtp_code, e.smtp_error)
+ except smtplib.SMTPDataError as e:
+ print ('Data Error:', e.smtp_code, e.smtp_error)
+ except smtplib.SMTPException as e:
+ print ('SMTPException:', e.message)
+ except Exception as e:
+ print ('Unknown error:', str(e))
+ return True
+
+class arxiv_emailer():
+ def __init__(self, arxiv_bot, feeds_generator, session_file = './config/email_session.xml', debug=False):
+ self.debug = debug
+ self.email_info = dom_node()
+ self.session_file = session_file
+ self.sessions = None
+ if self.session_file is not None:
+ self.load_session()
+
+ self.bot = arxiv_bot
+ self.feeds = feeds_generator
+
+ def send_daily_email(self):
+ emails = self.feeds.generate_daily_emails()
+ today = utils.str_day()
+ for name in emails:
+ email = emails[name]
+ send = False
+ if name not in self.sessions:
+ print('New user found!')
+ self.sessions[name] = {}
+ self.sessions[name]['last-send'] = today
+ send = True
+
+ if self.sessions[name]['last-send'] != today:
+ send = True
+
+ if send:
+ print('Sending email to user {0} [{1}]'.format(name, email['reciver']))
+ print('reciver:', email['reciver'])
+ print('title:', email['title'])
+ print('content:', len(email['content']))
+ success = False
+ if not self.debug:
+ success = send_mail(email['reciver'], email['title'], email['content'])
+ if success:
+ self.sessions[name]['last-send'] = today
+ self.save_session()
+ else:
+ print('skipping user {0} since already sent!'.format(name))
+
+ def load_session(self, session_file=None):
+ if session_file is None:
+ session_file = self.session_file
+ tree = None
+ with open(session_file, 'r') as f:
+ xml = f.read()
+ xmlparser = simple_parser()
+ xmlparser.feed(xml)
+ tree = xmlparser.root
+ sessions = parser.dom2dict(tree)
+ if 'root' in sessions:
+ sessions = sessions['root']
+ else:
+ sessions = {}
+ self.sessions = sessions
+ print(self.sessions)
+ return sessions
+
+ def save_session(self, session_file=None):
+ if session_file is None:
+ session_file = self.session_file
+ if session_file is None:
+ return None
+ xml = parser.dict2dom(self.sessions).to_string()
+ with open(session_file, 'w') as f:
+ f.write(xml)
+ return xml
+
+if __name__ == '__main__':
+ emailer = arxiv_emailer(None, None, None)
+ emailer.send_daily_email()
+ print(emailer.load_session())
+ print(emailer.save_session())
diff --git a/feeds.py b/feeds.py
new file mode 100644
index 0000000..4e3c450
--- /dev/null
+++ b/feeds.py
@@ -0,0 +1,103 @@
+from lib.parser import dom_node
+from lib import utils
+
+
+class feed_manager():
+ def __init__(self, submgr, arxivbot, style='./config/style.css'):
+ self.style_path = style
+ self.style = ''
+ self.bot = arxivbot
+ self.submgr = submgr
+ self.update_style()
+
+ def update_style(self, path = None):
+ if path is None:
+ path = self.style_path
+ print('loading style from:', path)
+ with open(path, 'r') as f:
+ self.style = f.read()
+ self.style += '\n'
+
+ def fetch_today_feed(self):
+ self.today_feed = self.bot.get_today_feed()
+
+ def filter_papers_for_user(self, subscriber):
+ strong_papers = []
+ weak_papers = []
+ keywords = subscriber['keywords']
+ papers = []
+ for topic in subscriber['topics']:
+ if topic in self.today_feed:
+ papers += self.today_feed[topic]
+ else:
+ print('Warning: topic {0} is subscribed but not downloaded!'.format(topic))
+ known_ids = []
+ unique_papers = []
+ for paper in papers:
+ paper_id = paper.arxiv_id
+ if paper_id not in known_ids:
+ unique_papers.append(paper)
+ known_ids.append(paper_id)
+ print('removing {0} repeated papers.'.format(len(papers) - len(unique_papers)))
+ papers = unique_papers
+ for paper in papers:
+ strong = False
+ weak = False
+ for keyword in keywords:
+ if paper.info['title'].lower().find(keyword) != -1:
+ strong = True
+ break;
+ elif paper.info['abstract'].lower().find(keyword) != -1:
+ weak = True
+ if strong:
+ strong_papers.append(paper)
+ elif weak:
+ weak_papers.append(paper)
+ return strong_papers, weak_papers
+
+ def generate_group_feed(self, paper_groups):
+ group_html = ''
+ for key in paper_groups:
+ header = dom_node('paper-group')
+ header.data = key
+ group_html += header.to_string() + '\n'
+ for paper in paper_groups[key]:
+ group_html += paper.to_html() + '\n'
+ return group_html
+
+ def generate_daily_feed_by_matched_paper(self, strong_interested, weak_interested):
+ feeds = {}
+ if len(strong_interested) > 0:
+ feeds['Strong Interested Paper'] = strong_interested
+ if len(weak_interested) > 0:
+ feeds['Weak Interested Paper'] = weak_interested
+ xml_feed = self.generate_group_feed(feeds)
+ return xml_feed
+
+ def generate_daily_email_by_matched_paper(self, strong_interested, weak_interested):
+ xml_feed = self.generate_daily_feed_by_matched_paper(strong_interested, weak_interested)
+ email_content = ''
+ if xml_feed != '':
+ email_content = self.style + xml_feed
+ return email_content
+
+ def generate_daily_emails(self):
+ self.fetch_today_feed()
+ emails = {}
+ # email is a dict, containing title, reciver and content.
+ today = utils.str_day()
+ for name in self.submgr.subscribers:
+ subscriber = self.submgr.subscribers[name]
+ strong, weak = self.filter_papers_for_user(subscriber)
+ content = self.generate_daily_email_by_matched_paper(strong, weak)
+ reciver = subscriber['email']
+ if content == '':
+ print('Skipping user {0} [{1}] since no paper matched.'.format(name, reciver))
+ continue;
+ title = "Your Interested Paper On Arxiv Today ({0})".format(today)
+ email = {}
+ email['reciver'] = reciver
+ email['title'] = title
+ email['content'] = content
+ emails[name] = email
+ return emails
diff --git a/lib/console.py b/lib/console.py
new file mode 100644
index 0000000..3ee7203
--- /dev/null
+++ b/lib/console.py
@@ -0,0 +1,304 @@
+from . import utils
+import os
+import traceback
+
+class console():
+ def __init__(self, name='base'):
+ self.name = name
+ self.hint = '$ '
+ self.exit_cmd = ['exit', 'quit', 'bye']
+ self.exit_info = 'Bye~'
+ self.commands = {}
+ self.alias = {}
+ self.warn_level = 4
+ self.exit_flag = False
+ self.debug = True
+ self.platform = utils.detect_platform()
+ self.is_child = False
+ self.father = None
+
+ self.regist_internal_command()
+
+ def get_hint(self):
+ if self.platform == 'Linux':
+ hint = '\033[0;33m({0})\033[0;31m{1}\033[0m'.format(self.name, self.hint)
+ else:
+ hint = '({0}){1}'.format(self.name, self.hint)
+ return hint
+
+ def regist_internal_command(self):
+ self.regist(
+ 'help',
+ action=self.command_help,
+ alias=['h'],
+ help_info='display this help info.',
+ kind='sys'
+ )
+ self.regist(
+ 'exit',
+ action=self.command_exit_console,
+ alias=['quit','bye'],
+ help_info='exit current console.',
+ kind='sys'
+ )
+ self.regist(
+ 'cls',
+ action=self.command_clear_screen,
+ alias=['clear', 'clc'],
+ help_info='clear screen.',
+ kind='sys'
+ )
+ self.regist(
+ 'alias',
+ action=self.command_alias,
+ help_info='display alias info or create new alias.',
+ kind='sys'
+ )
+ self.regist(
+ 'os',
+ action=self.command_os,
+ help_info='run a system command.',
+ kind='sys'
+ )
+
+
+ def translate_command(self, command):
+ while command in self.alias and command not in self.commands:
+ command = self.alias[command]
+ return command
+
+ def find_equal_command(self, command, ret_type = str, ignored = []):
+ finished = []
+ new = []
+
+ cmds = [command]
+ while len(finished) != len(cmds):
+ # find child
+ if command in self.alias:
+ if self.alias[command] not in cmds:
+ cmds.append(self.alias[command])
+ # find fathers
+ for al in self.alias:
+ if self.alias[al] == command:
+ if al not in cmds:
+ cmds.append(al)
+ # found finished.
+ finished.append(command)
+ for cmd in cmds:
+ if cmd not in finished:
+ command = cmd
+
+
+ if ret_type is str:
+ finished = utils.list2csv(finished)
+ return finished
+
+
+
+ def get_alias(self, command, ret_type=str):
+ alias = []
+ for al in self.alias:
+ if self.alias[al] == command:
+ alias.append(al)
+
+ if ret_type is str:
+ alias = utils.list2csv(alias)
+
+ return alias
+
+ def command_exist(self, command):
+ if command in self.commands or command in self.alias:
+ return True
+ else:
+ return False
+
+ def add_alias(self, command, alias):
+ if self.command_exist(alias):
+ if warn_level >= 3:
+ print('Alias {0} will not be added since already used'.format(al))
+ else:
+ self.alias[alias] = command
+
+ # kind: standard or shared
+ # standard: help info will be displayed
+ # shared: help info will not be displayed in sub command.
+ def regist(self, command, action, alias=None, help_info='no help provided.', kind='standard'):
+ if type(action) == console:
+ action.is_child = True
+ action.father = self
+ exist = self.command_exist(command)
+ if exist:
+ if self.warn_level >=3:
+ print('Command {0} will not be added sinece already exist.'.format(command))
+ return
+
+ if type(alias) is list:
+ for al in alias:
+ self.add_alias(command, al)
+ elif type(alias) is str:
+ self.add_alias(command, alias)
+ elif alias is None:
+ pass
+ else:
+ if self.warn_level > 3:
+ print('Unknown alias type, no alias will be added.')
+ self.commands[command] = {}
+ self.commands[command]['action'] = action
+ self.commands[command]['help'] = help_info
+ self.commands[command]['kind'] = kind
+
+ def handle_command(self, command, args):
+ if command in self.commands:
+ act = self.commands[command]['action']
+ try:
+ act(args)
+ except KeyboardInterrupt:
+ pass
+ except:
+ print('Exception occured while processing command \"{0} {1}\".'.format(command, args))
+ print('More information are shown below.\n', traceback.format_exc())
+ else:
+ print('Unknown command \"{0}\"'.format(command))
+
+ # seperate command and its args.
+ def parse_command(self, string):
+ string += ' '
+ length = len(string)
+ command_end = 0
+ parse_start = False
+ for i in range(length):
+ blank = utils.is_blank(string[i])
+ if not blank:
+ parse_start=True
+ if parse_start and blank:
+ command_end = i
+ break
+
+ command = string[:command_end]
+ command = utils.remove_blank_in_endpoint(command)
+ args = utils.remove_blank_in_endpoint(string[command_end:])
+ return command, args
+
+ def parse(self, string):
+ command, args = self.parse_command(string)
+ exitsted_commands = []
+ while command in self.alias:
+ if command not in exitsted_commands:
+ exitsted_commands.append(command)
+ command = self.alias[command]
+ string = command + ' ' + args
+ command, args = self.parse_command(string)
+ else:
+ break
+
+ return command, args
+
+
+ def show_help_info(self, command, prefix, indent, depth=0):
+ command = self.translate_command(command)
+ action = self.commands[command]['action']
+ kind = self.commands[command]['kind']
+ if kind == 'sys' and depth > 0:
+ return
+ alias = self.get_alias(command, ret_type=str)
+ if alias != '':
+ print('{0}{1}({2}):'.format(prefix, command, alias))
+ else:
+ print('{0}{1}:'.format(prefix, command))
+ print('{0}{1}{2}'.format(prefix, indent, self.commands[command]['help']))
+ if type(action) == console:
+ action.command_help('', prefix=prefix+indent, indent=indent, depth=depth+1)
+
+ def debug_log(self, command, args):
+ if self.debug:
+ print('command:[{0}] args:[{1}]'.format(command, args))
+
+ def command_exit_console(self, args):
+ if not self.is_child:
+ print(self.exit_info)
+ self.exit_flag = True
+
+ def command_clear_screen(self, args):
+ if self.platform == 'Windows':
+ os.system('cls')
+ elif self.platform == 'Linux':
+ os.system('clear')
+ return False
+
+ def command_help(self, args, prefix = '', indent=' ', depth=0):
+ command, args = self.parse_command(args)
+ if command is not "":
+ if self.command_exist(command):
+ self.show_help_info(command, prefix, indent, depth)
+ else:
+ print('Unknown command \"{0}\"'.format(command))
+ else:
+ for command in self.commands:
+ self.show_help_info(command, prefix, indent, depth)
+
+ def command_alias(self, args):
+ alias_parse = args.split('=')
+ if len(alias_parse) == 2:
+ alias = utils.remove_blank_in_endpoint(alias_parse[0])
+ command = utils.remove_blank_in_endpoint(alias_parse[1])
+ if command is not '':
+ self.alias[alias]=command
+ else:
+ del self.alias[alias]
+ elif args == '':
+ for alias in self.alias:
+ print('{0}={1}'.format(alias, self.alias[alias]))
+ elif len(alias_parse) == 1:
+ if args in self.alias:
+ print('{0}={1}'.format(args, self.alias[args]))
+ equal_alias = self.find_equal_command(args)
+ if equal_alias != '':
+ print('Hint: {0} are all equivalent.'.format(equal_alias))
+ elif args in self.commands:
+ als = self.get_alias(args, ret_type=str)
+ if als == '':
+ print('command {0} has no alias.'.format(args))
+ else:
+ print('command {0} is aliased as {1}'.format(args, als))
+ equal_alias = self.find_equal_command(args)
+ if equal_alias != '' and equal_alias != args:
+ print('Hint: {0} are all equivalent.'.format(equal_alias))
+ else:
+ print('No alias \"{0}\" found.'.format(args))
+ else:
+ print('Syntax error, command not understood.')
+
+ def command_os(self, args):
+ if args == '':
+ print('please specify os command')
+ else:
+ os.system(args)
+
+ def execute(self, string):
+ command, args = self.parse(string)
+ if command is not "":
+ self.handle_command(command, args)
+
+ def __call__(self, args):
+ if args != '':
+ self.execute(args)
+ else:
+ self.exit_flag=False
+ self.interactive()
+
+ def interactive(self):
+ while not self.exit_flag:
+ try:
+ input_str = input(self.get_hint())
+ self.execute(input_str)
+ except(KeyboardInterrupt):
+ print('')
+
+
+if __name__ == '__main__':
+ con = console()
+ con_sub = console()
+ con_sub_sub = console()
+ con_sub.regist('test_subsubcommand', con_sub_sub, alias='tss', help_info='A sub command.')
+ con.regist('test_subcommand', con_sub, alias='ts', help_info='A sub command.')
+ con.interactive()
\ No newline at end of file
diff --git a/lib/parallel.py b/lib/parallel.py
new file mode 100644
index 0000000..1b1fe74
--- /dev/null
+++ b/lib/parallel.py
@@ -0,0 +1,127 @@
+import threading
+import queue
+import time
+
class Job():
    """A unit of work: a callable plus its arguments and, after run(),
    its result (in .results)."""

    def __init__(self, func, args=None, kwargs=None, name=None):
        # None sentinels instead of mutable defaults: the original
        # shared []/{} defaults would be mutated across Job instances.
        self.id = None
        self.name = 'job' if name is None else name
        self.func = func
        self.args = [] if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.results = None

    def run(self):
        """Execute the callable and store its return value in .results."""
        self.results = self.func(*self.args, **self.kwargs)

    def set_name(self, name):
        self.name = name

    def set_id(self, jid):
        self.id = jid

    def __call__(self):
        self.run()
+
class Worker(threading.Thread):
    """Daemon thread that drains jobs from work_queue, runs them, and
    moves each finished job to finished_queue."""

    def __init__(self, work_queue, finished_queue):
        super(Worker, self).__init__()
        self.queue = work_queue
        self.finished = finished_queue
        self.terminate = False
        self.daemon = True

    def stop(self):
        """Ask the thread to exit after its current get() times out."""
        self.terminate = True

    def run(self):
        # The 1s get() timeout lets the loop notice self.terminate.
        # (KeyboardInterrupt is only delivered to the main thread, so the
        # original handler for it here was dead code.)
        while not self.terminate:
            try:
                task = self.queue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                task.run()
            except Exception as exc:
                # BUG FIX: a raising job used to kill the worker thread.
                print('job {0} failed: {1}'.format(getattr(task, 'name', '?'), exc))
            finally:
                # BUG FIX: task_done() must run even when the job raised,
                # otherwise Queue.join() in ParallelHost.stop('soft')
                # would block forever.
                self.queue.task_done()
                self.finished.put(task)
+
class ParallelHost():
    """A small thread pool: submit jobs, retrieve their results by the
    integer job id returned at submission."""

    def __init__(self, num_threads=8):
        self.num_threads = num_threads
        self.workers = []
        self.tasks = queue.Queue()      # pending jobs
        self.results = queue.Queue()    # finished jobs (raw)
        self.rets = {}                  # job id -> result, already collected
        self.id = 0                     # monotonically increasing job id
        for _ in range(self.num_threads):
            worker = Worker(self.tasks, self.results)
            self.workers.append(worker)
        for worker in self.workers:
            worker.start()

    def __del__(self):
        self.stop('kill')

    # soft stop: wait until all job done
    # hard stop: stop even with unfinished job
    # kill stop: whatever the thread is doing, exit.
    def stop(self, mode='soft'):
        print('Trying to stop.')
        if mode == 'soft':
            self.tasks.join()
            print('All job finished.')
        for worker in self.workers:
            worker.stop()
            if mode == 'kill':
                worker.join(0.01)

    def commit(self, job):
        """Assign the next id to *job*, enqueue it, return the id."""
        self.id += 1
        job.set_id(self.id)
        self.tasks.put(job)
        return self.id

    def add_job(self, func, args=None, kwargs=None, name=None):
        """Build a Job around *func* and commit it.

        None sentinels replace the original shared mutable []/{} defaults.
        """
        job = Job(func,
                  [] if args is None else args,
                  {} if kwargs is None else kwargs,
                  name)
        return self.commit(job)

    def collect_all(self):
        """Drain the finished queue into the .rets cache."""
        while not self.results.empty():
            task = self.results.get()
            self.rets[task.id] = task.results

    def get_result(self, jid, block=False):
        """Return (and forget) the result of job *jid*.

        With block=True, wait until the job finishes; otherwise return
        None when the result is not available yet.
        """
        if jid in self.rets:
            ret = self.rets[jid]
            del self.rets[jid]
            return ret
        while True:
            if self.results.empty() and not block:
                break
            task = self.results.get()
            # BUG FIX: a finished Job stores its id in .id (via set_id);
            # the original read the nonexistent .jid and raised
            # AttributeError on the first call.
            if task.id == jid:
                return task.results
            self.rets[task.id] = task.results

    def clear_results(self):
        """Throw away all pending and cached results."""
        while not self.results.empty():
            self.results.get()
        self.rets = {}
+
if __name__ == '__main__':
    # Manual demo: fan 10 slow print jobs out over the pool.
    host = ParallelHost()

    def loop_print(info, num):
        for i in range(num):
            print(info + ':' + str(i))
            time.sleep(1)

    for i in range(10):
        host.add_job(loop_print, ["loop_print_{0}".format(i), 5])

    # BUG FIX: ParallelHost has no terminate() method; stop('kill') is
    # the documented way to shut the pool down immediately.
    host.stop('kill')
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..d6b1504
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,151 @@
+from html.parser import HTMLParser
+from . import utils
+
def dict_to_arrtibute_string(attributes):
    """Serialize {name: value} into 'name="value";name2="value2";'.

    (The function name keeps its historical misspelling: callers use it.)
    str.join builds the result in one pass instead of quadratic +=.
    """
    return ''.join('{0}="{1}";'.format(key, attributes[key]) for key in attributes)
+
def attribute_string_to_dict(attrs):
    """Convert html.parser's [(name, value), ...] attribute list to a dict.

    dict() over a pair list is the built-in equivalent of the original
    hand-rolled loop (later duplicates win in both versions).
    """
    return dict(attrs)
+
+
class dom_node():
    """One node of a minimal DOM tree: a tag name, attributes, child
    nodes, optional text data and a back-link to the parent."""

    def __init__(self, name=None, attributes=None):
        self.name = name if name is not None else 'Node'
        # None sentinel: the original `attributes={}` default dict was
        # shared by every node created without explicit attributes.
        self.attributes = {} if attributes is None else attributes
        self.childs = []
        self.data = None
        self.father = None

    def add_child(self, child):
        """Append *child* and set its parent pointer."""
        if child is not None:
            child.father = self
            self.childs.append(child)

    def to_string(self, prefix='', indent=' '):
        """Pretty-print this subtree as XML-like text."""
        string = prefix + '<' + self.name
        if self.attributes:
            string += ' ' + dict_to_arrtibute_string(self.attributes)
        string += '>\n'

        for child in self.childs:
            string += child.to_string(prefix=prefix + indent, indent=indent)

        if self.data is not None:
            string += prefix + indent + self.data + '\n'

        # BUG FIX: the original emitted 'name>' as the closing tag, which
        # is not well-formed and cannot be read back by this module's own
        # parser; a proper closing tag is '</name>'.
        string += prefix + '</{0}>\n'.format(self.name)

        return string

    def has_child(self, name):
        """True if any direct child is tagged *name*."""
        return any(child.name == name for child in self.childs)

    def search(self, name):
        """Depth-first search; *name* may be a tag or a list of tags.
        Returns every matching node, this one included."""
        targets = name if isinstance(name, list) else [name]
        founded_node = []
        if self.name in targets:
            founded_node.append(self)
        for child in self.childs:
            founded_node += child.search(name)
        return founded_node
+
def dict2dom(d, root_name='root'):
    """Recursively convert a (possibly nested) dict into a dom_node tree.

    dict values recurse; list values become repeated <li> children;
    anything else is stored as the child's text data.
    """
    node = dom_node(root_name)
    for key in d:
        elem = d[key]
        child_node = dom_node(name=str(key))
        # isinstance replaces the original `type(x) is` checks: it is the
        # idiomatic test and also accepts dict/list subclasses.
        if isinstance(elem, dict):
            child_node = dict2dom(elem, root_name=str(key))
        elif isinstance(elem, list):
            for subelem in elem:
                if isinstance(subelem, dict):
                    sub_node = dict2dom(subelem, root_name='li')
                else:
                    sub_node = dom_node('li')
                    sub_node.data = str(subelem)
                child_node.add_child(sub_node)
        else:
            child_node.data = str(elem)
        node.add_child(child_node)
    return node
+
# if a dom node has data only, then it's {'name':'data'}
# if a dom node has childs, then it's {'name':{}}
# if a dom node has data as well as childs, data will be ignored.
# if a dom has multi child with same name, it will be stored as list.
def dom2dict(dom, replace_li = True):
    """Convert a dom_node tree back into nested dicts/lists, following
    the mapping rules in the comments above.

    With replace_li=True a child dict whose only key is 'li' is collapsed
    to the value it wraps, undoing dict2dom's list encoding.
    """
    dictionary = {}
    for child in dom.childs:
        name = child.name
        content = None
        if len(child.childs) != 0:
            # inner node: recurse (any text data on it is dropped)
            content = dom2dict(child, replace_li)
        else:
            # leaf node: normalize its text (None -> '')
            content = child.data
            if content is None:
                content = ''
            content = utils.clean_text(content)
        if name in dictionary:
            # repeated tag: promote the single value to a list, then append
            if type(dictionary[name]) is not list:
                previous = dictionary[name]
                dictionary[name] = [previous, content]
            else:
                dictionary[name].append(content)
        else:
            dictionary[name] = content

    if replace_li:
        # collapse {'li': ...} wrappers produced by dict2dom's list encoding
        # (assigning to an existing key while iterating is safe: the dict
        # does not change size)
        for key in dictionary:
            item = dictionary[key]
            if type(item) is dict:
                li = None
                if len(item.keys()) == 1:
                    for subkey in item:
                        if subkey == 'li':
                            li = item[subkey]
                if li is not None:
                    dictionary[key] = li
    return dictionary
+
class simple_parser(HTMLParser):
    """HTMLParser subclass that builds a dom_node tree; after feed(),
    the result hangs off self.root."""

    def __init__(self):
        super(simple_parser, self).__init__()
        self.root = dom_node('root')
        self.current_node = self.root

    def handle_starttag(self, tag, attrs):
        attrs_dict = attribute_string_to_dict(attrs)
        this_node = dom_node(tag, attrs_dict)
        self.current_node.add_child(this_node)
        self.current_node = this_node

    def handle_endtag(self, tag):
        # Robustness: a stray closing tag at root level would set
        # current_node to None (root.father) and crash the next handler;
        # stay at the root instead.
        if self.current_node.father is not None:
            self.current_node = self.current_node.father

    def handle_data(self, data):
        # Text may arrive in several chunks (e.g. around entities), so
        # accumulate instead of overwriting.
        if self.current_node.data is None:
            self.current_node.data = data
        else:
            self.current_node.data += data
\ No newline at end of file
diff --git a/lib/screen.py b/lib/screen.py
new file mode 100644
index 0000000..63e6191
--- /dev/null
+++ b/lib/screen.py
@@ -0,0 +1,19 @@
+import sys
+
class VirtualScreen():
    """An in-memory log sink: collects written lines and can replay the
    most recent ones to a real stream."""

    def __init__(self, max_history=1000):
        self.max_history = max_history
        self.contents = []

    def write(self, message):
        """Append one entry to the log.

        BUG FIX: the original never enforced max_history, so the buffer
        grew without bound; keep only the newest max_history entries.
        """
        self.contents.append(message)
        if len(self.contents) > self.max_history:
            del self.contents[:len(self.contents) - self.max_history]

    def last(self, line=10, output=sys.stdout):
        """Write the last *line* entries to *output*, one per line."""
        start_line = max(len(self.contents) - line, 0)
        # loop variable renamed: the original shadowed the `line` parameter
        for entry in self.contents[start_line:]:
            output.write(entry)
            output.write('\n')
\ No newline at end of file
diff --git a/lib/service.py b/lib/service.py
new file mode 100644
index 0000000..3b5f4f8
--- /dev/null
+++ b/lib/service.py
@@ -0,0 +1,244 @@
+import time
+import sys
+import shlex
+import argparse
+
+from croniter import croniter
+from . import utils
+from . import parallel
+from . import console
+from . import screen
+from . import utils
+
class service():
    """A periodically-run action: a callable plus a cron schedule.

    With managed_output=True the manager's output stream is prepended to
    the action's positional arguments.
    """

    def __init__(self, action, args=None, kwargs=None, cron='* * * * *', managed_output=False, name='service'):
        self.name = name
        self.action = action
        self.managed_output = managed_output
        # None sentinels instead of the original shared mutable []/{}
        # defaults, which every service without explicit args would share.
        self.args = [] if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.output = sys.stdout
        self.last_result = None
        self.cronexpr = cron
        self.croniter = croniter(self.cronexpr, time.time())
        self.next_time = self.croniter.get_next()   # epoch seconds of next run

    def run(self, daemon=None, dry=False):
        """Execute the action, inline (daemon=None) or on the daemon's
        worker pool; dry=True runs without advancing the schedule."""
        if not dry:
            self.next_time = self.croniter.get_next()

        if self.managed_output:
            new_args = [self.output, *self.args]
        else:
            new_args = self.args
        if daemon is None:
            self.last_result = self.action(*new_args, **self.kwargs)
        else:
            daemon.add_job(self.action, new_args, self.kwargs, self.name)
+
class ServiceManager():
    """Owns a set of cron-scheduled `service` objects and a worker pool,
    running each service when its next_time comes due."""

    def __init__(self, debug=False, output=sys.stdout):
        self.debug = debug
        self.services = {}            # sid -> active service
        self.deleted_services = {}    # sid -> soft-deleted service (recoverable)
        self.protected_service = []   # sids that delete() must refuse
        self.daemon = parallel.ParallelHost()
        self.sid = 0                  # last assigned service id
        self.terminate = False
        self.output = output

        self.set_refresh_time()

    def stop(self):
        # Stop the worker pool first, then let loop() exit on its next pass.
        self.daemon.stop()
        self.terminate = True

    def __del__(self):
        self.stop()

    def log(self, *args, end='\n'):
        """Timestamped write of all *args* to the managed output stream."""
        self.output.write('[{0}]'.format(utils.str_time()))
        for arg in args:
            arg = str(arg)
            self.output.write(arg)
        self.output.write(end)

    def add(self, service, protected=False):
        """Register *service* under a fresh sid and return it; protected
        services cannot be deleted (used for the internal refresh job)."""
        self.sid += 1
        service.output = self.output
        self.services[self.sid] = service
        if protected:
            self.protected_service.append(self.sid)
        return self.sid

    def delete(self, sid):
        """Soft-delete: move the service into the recycle bin."""
        if sid in self.protected_service:
            self.log('Can not delete protected service.')
            return
        if sid in self.services:
            self.deleted_services[sid] = self.services[sid]
            del self.services[sid]
        else:
            self.log('The sid [{0}] do not exist!'.format(sid))

    def recover(self, sid):
        """Undo delete(): move the service back from the recycle bin."""
        if sid in self.deleted_services:
            self.services[sid] = self.deleted_services[sid]
            del self.deleted_services[sid]
        else:
            self.log('The sid [{0}] is not found recycle bin.'.format(sid))

    def set_refresh_time(self, refresh_cron='* * * * *'):
        # A protected no-op service firing every minute: it bounds loop()'s
        # sleep so newly added services are noticed promptly.
        def refresh():
            pass
        refresh_service = service(refresh, cron=refresh_cron, name='refresh')
        self.add(refresh_service, protected = True)

    def get_next(self):
        """Return (sid, epoch time) of the service scheduled soonest.

        Returns (-1, -1) only when no services exist; the constructor's
        refresh service normally guarantees at least one.
        """
        next_sid = -1
        next_time = -1
        for sid in self.services:
            service = self.services[sid]
            if service.next_time < next_time or next_sid < 0:
                next_sid = sid
                next_time = service.next_time
        return next_sid, next_time

    def loop(self):
        """Scheduler main loop: sleep until the next service is due, then
        hand it to the worker pool."""
        while not self.terminate:
            next_sid, next_time = self.get_next()
            service = self.services[next_sid]
            sleep_time = next_time - time.time()
            if sleep_time > 0:
                time.sleep(sleep_time)
            self.log('Running service {0} (SID={1})'.format(service.name, next_sid))
            # re-check: the service may have been deleted while we slept
            if next_sid in self.services:
                service.run(self.daemon)
            else:
                self.log('the sheduled service wiil not run since it is canceled.')


    # mode: background: return immidietly
    # foreground: stuck here.
    def start(self, mode='background'):
        if mode == 'background':
            self.daemon.add_job(self.loop, name='service main loop')
        else:
            self.loop()
+
def get_service_console(manager, name='service'):
    """Build a console exposing *manager* operations as shell commands
    (show/run/info/next/add/delete/recover)."""

    con = console.console(name)

    def parse_sid(args, pool):
        """Return int(args) when it names a service in *pool*, else None.

        Shared by delete/recover/run, which originally triplicated this
        parsing logic inline.
        """
        if args.isdigit() and int(args) in pool:
            return int(args)
        return None

    def command_show(args):
        print('Active services:')
        for sid in manager.services:
            print('SID: {0} | Name: {1}'.format(sid, manager.services[sid].name))
        print('Deleted services:')
        for sid in manager.deleted_services:
            print('SID: {0} | Name: {1}'.format(sid, manager.deleted_services[sid].name))

    def command_add(args):
        parser = argparse.ArgumentParser()
        parser.add_argument('cron', type=str, help='A cron expr')
        parser.add_argument('task', type=str, help='task to run, should be a valid command')
        parser.add_argument('--name', '-n', type=str, default='command service', help='name of the task')
        args = parser.parse_args(shlex.split(args))
        if not croniter.is_valid(args.cron):
            print('Invalid cron expression.')
            # BUG FIX: the original fell through and crashed constructing
            # the service's croniter with the invalid expression.
            return
        service_to_add = service(con.execute, args=[args.task], cron=args.cron, name=args.name)
        manager.add(service_to_add)

    def command_delete(args):
        sid = parse_sid(args, manager.services)
        if sid is not None:
            manager.delete(sid)
        else:
            print('command argument "{0}" is not understood.'.format(args))

    def command_recover(args):
        sid = parse_sid(args, manager.deleted_services)
        if sid is not None:
            manager.recover(sid)
        else:
            print('command argument "{0}" is not understood.'.format(args))

    def command_run(args):
        sid = parse_sid(args, manager.services)
        if sid is not None:
            # dry run: execute now without advancing the schedule
            manager.services[sid].run(dry=True)
        else:
            print('command argument "{0}" is not understood.'.format(args))

    def command_info(args):
        # optional numeric argument: how many log lines to replay (default 10)
        line = int(args) if args.isdigit() else 10
        manager.output.last(line)

    def command_next(args):
        next_sid, next_time = manager.get_next()
        indent = ' '
        info = 'Next Job: {0}'.format(manager.services[next_sid].name)
        info += '\n{0}SID: {1}'.format(indent, next_sid)
        info += '\n{0}Scheduled Running Time: {1}'.format(indent, utils.time2str(next_time))
        info += '\n{0}Remaining Time: {1}s'.format(indent, utils.float2str(next_time - time.time()))
        print(info)

    con.regist('show', command_show, help_info='Show all services.', alias=['ls'])
    con.regist('run', command_run, help_info='Run a service.')
    con.regist('info', command_info, help_info='Display service output log.')
    con.regist('next', command_next, help_info='Next job to run.')
    con.regist('add', command_add, help_info='Register a command as service.')
    con.regist('delete', command_delete, help_info='Delete a service', alias=['del'])
    con.regist('recover', command_recover, help_info='Recover a service.')
    return con
+
+
if __name__ == '__main__':
    # Manual demo of ServiceManager + console wiring.

    # Two managed-output services: each receives the manager's output
    # stream as its first positional argument.
    def func1(output):
        output.write('func1')

    def func2(output):
        output.write('func2')

    def add(a, b):
        print('{0} + {1} = {2}'.format(a, b, a+b))

    # Console adapter: parse "a b" out of the raw argument string.
    def command_add(args):
        numbers = args.split(' ')
        a = float(numbers[0])
        b = float(numbers[1])
        add(a, b)

    # Services log into an in-memory screen; the `info` console command
    # replays it.
    log_screen = screen.VirtualScreen()
    manager = ServiceManager(output=log_screen)
    test1 = service(func1, cron='* * * * *', name='test1', managed_output=True)
    test2 = service(func2, cron='* * * * *', name='test2', managed_output=True)
    manager.add(test1)
    manager.add(test2)
    manager.start('background')

    con = get_service_console(manager)
    master = console.console()
    master.regist('service', con, help_info='service console')
    master.regist('add', command_add, help_info='Add two numbers.')
    master.interactive()
diff --git a/lib/try.py b/lib/try.py
new file mode 100644
index 0000000..64c44b3
--- /dev/null
+++ b/lib/try.py
@@ -0,0 +1,15 @@
def func(a, b, c, time=0, work=1):
    """Demo target: print the positional and keyword arguments."""
    for template, values in (('a:{0} b:{1} c:{2}', (a, b, c)),
                             ('time:{0} work:{1}', (time, work))):
        print(template.format(*values))
+
def funcwrap(func, kargs, kkargs):
    """Call *func*, spreading kargs positionally and kkargs as keywords."""
    positional = list(kargs)
    keywords = dict(kkargs)
    func(*positional, **keywords)
+
+
# Demo payload: positional arguments plus keyword overrides for func().
kargs = [1, 2, 3]
kkargs = {"time": 1234, "work": 1232}

funcwrap(func, kargs, kkargs)
\ No newline at end of file
diff --git a/lib/utils.py b/lib/utils.py
new file mode 100644
index 0000000..733075b
--- /dev/null
+++ b/lib/utils.py
@@ -0,0 +1,139 @@
+import pickle
+import time
+import os
+import re
+import platform
+
def detect_platform():
    """Best-effort OS detection: 'Windows', 'Linux' or 'Unknown'.

    Substring membership (`in`) replaces the original
    `.find(...) != -1` idiom.
    """
    description = platform.platform()
    if 'Windows' in description:
        return 'Windows'
    if 'Linux' in description:
        return 'Linux'
    return 'Unknown'
+
def ensure_dir_exist(directory, show_info=True):
    """Create *directory* if it does not exist.

    BUG FIX: show_info was accepted but ignored; it now actually
    silences the message. makedirs also creates missing parents, which
    the original os.mkdir could not.
    """
    if not os.path.isdir(directory):
        if show_info:
            print('directory', directory, ' not found, creating...')
        os.makedirs(directory)
+
def validateTitle(title):
    """Replace characters illegal in file names (/ \\ : * ? " < > |)
    with spaces and return the sanitized title."""
    illegal = r"[\/\\\:\*\?\"\<\>\|]"
    return re.sub(illegal, " ", title)
+
def list2csv(l):
    """Join items with commas: [1, 'a'] -> '1,a' ('' for an empty list).

    str.join replaces the original quadratic += plus trailing-comma trim.
    """
    return ','.join(str(item) for item in l)
+
def clean_text(string):
    """Normalize text: turn newlines into spaces and collapse runs of
    spaces; None maps to ''.

    Tabs are deliberately left alone (the original split on ' ' only).
    """
    if string is None:
        return ''
    # str.replace removes every occurrence in one call; the original
    # `while '\n' in string` loop around it was redundant.
    parts = string.replace('\n', ' ').split(' ')
    return ' '.join(part for part in parts if part)
+
def clean_split(string, delimiter=' '):
    """Split on *delimiter*, dropping empty fragments.

    BUG FIX: the original filtered with `sub_str is not ''` — an
    identity comparison against a literal that CPython only happens to
    satisfy (and a SyntaxWarning on modern interpreters); truthiness is
    the reliable test.
    """
    return [piece for piece in string.split(delimiter) if piece]
+
def remove_blank_in_endpoint(string):
    """Strip leading and trailing spaces, tabs and newlines.

    str.strip with an explicit character set is equivalent to the
    original hand-rolled two-pointer scan, including the
    all-blank-string -> '' edge case.
    """
    return string.strip(' \t\n')
+
def is_blank(ch):
    """True when *ch* is a space, tab or newline."""
    return ch in (' ', '\t', '\n')
+
def dict_to_arrtibute_string(attributes):
    """Serialize {name: value} into 'name="value";name2="value2";'.

    (Name keeps its historical misspelling: callers use it.) str.join
    builds the result in one pass instead of quadratic +=; kept in sync
    with the identical helper in lib/parser.py.
    """
    return ''.join('{0}="{1}";'.format(key, attributes[key]) for key in attributes)
+
def attribute_string_to_dict(attrs):
    """Convert an [(name, value), ...] attribute list to a dict.

    dict() over a pair list is the built-in equivalent of the original
    loop (later duplicates win in both); kept in sync with lib/parser.py.
    """
    return dict(attrs)
+
def save_python_object(obj, save_path):
    """Pickle *obj* into the binary file at *save_path*."""
    with open(save_path, 'wb') as sink:
        pickle.dump(obj, sink)
+
def load_python_object(path):
    """Unpickle and return the object stored at *path*."""
    with open(path, 'rb') as source:
        obj = pickle.load(source)
    return obj
+
def delete_n(string):
    """Replace every newline in *string* with a space.

    A single str.replace already removes all occurrences; the original
    `while '\n' in string` loop around it was redundant.
    """
    return string.replace('\n', ' ')
+
def remove_additional_blank(string):
    """Collapse runs of spaces into one and drop leading/trailing spaces.

    BUG FIX: the original filtered with `word is not ''`, an identity
    comparison against a literal (SyntaxWarning on modern CPython);
    truthiness is the reliable test. Splitting stays on ' ' only, so
    tabs are preserved exactly as before.
    """
    return ' '.join(word for word in string.split(' ') if word)
+
def formal_text(text):
    """Flatten newlines and squeeze repeated spaces in *text*."""
    return remove_additional_blank(delete_n(text))
+
def float2str(f, precision=2):
    """Render a number as a decimally-truncated string.

    *precision* counts characters kept starting at the decimal point
    (kept for backward compatibility: the default of 2 yields one
    decimal digit, e.g. 3.14159 -> '3.1').

    BUG FIX: input without a '.' (e.g. an int) made find() return -1
    and silently mangled the number; such values are now returned whole.
    """
    text = str(f)
    dot = text.find('.')
    if dot == -1:
        return text
    return text[:dot + precision]
+
+# ========== time realted operation ========== #
+
def str_day():
    """Today's local date formatted as 'YYYY-MM-DD'."""
    return time.strftime("%Y-%m-%d", time.localtime())
+
def time2str(t):
    """Format an epoch timestamp *t* through str_time()."""
    return str_time(time.localtime(int(t)))
+
def str_time(local_time=None):
    """Format a struct_time (default: now) as 'YYYY-MM-DD-HHh-MMm-SSs'.

    BUG FIX: the original format string ended with a stray ')'
    ("...%Ss)") that leaked into every timestamp.
    """
    if local_time is None:
        local_time = time.localtime()
    return time.strftime("%Y-%m-%d-%Hh-%Mm-%Ss", local_time)
+
if __name__ == '__main__':
    # Smoke test: print today's date stamp.
    print(str_day())
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f057d0c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,40 @@
+import arxiv_bot
+import email_sender
+import subscriber_utils
+import feeds
+from lib import utils
+import os
+from lib import service
+from lib.console import console
+from lib import screen
+
# Wire the bot together: subscriptions -> spider -> feed generator -> emailer.
subscribe_manager = subscriber_utils.subscribe_manager()
# subscribe_manager.load()

bot = arxiv_bot.arxiv_bot(subscribe_manager.get_subscribed_topics())
feeds_generator = feeds.feed_manager(subscribe_manager, bot)
emailer = email_sender.arxiv_emailer(bot, feeds_generator, debug=False)

# All service logging goes to an in-memory screen; replay it with the
# service console's `info` command.
log_screen = screen.VirtualScreen()
manager = service.ServiceManager(output=log_screen)

# 04:00 on weekdays (cron fields: min hour day month weekday).
daily_mail_service = service.service(
    emailer.send_daily_email,
    cron='0 4 * * 1-5',
    name = 'send daily email'
)
manager.add(daily_mail_service)

shell = console('ArxivBot')
def command_load(args):
    # `load subscriber` re-reads config/subscriber.xml at runtime.
    if args == 'subscriber':
        subscribe_manager.load()

shell.regist('load', command_load, help_info='load config. (only subscriber supported till now)')
service_shell = service.get_service_console(manager, 'ServiceManager')
shell.regist('service', service_shell, help_info='service mamager')

# cron time:
# min hour day month week
# Scheduler runs in the background; the shell owns the foreground.
manager.start()
shell.interactive()
diff --git a/subscriber_utils.py b/subscriber_utils.py
new file mode 100644
index 0000000..3e12fb7
--- /dev/null
+++ b/subscriber_utils.py
@@ -0,0 +1,92 @@
+from lib.parser import dom_node, simple_parser
+
+
class subscribe_manager():
    """Loads config/subscriber.xml and answers queries about who
    subscribed to which arxiv topics and keywords.

    self.subscribers maps name -> {'email', 'topics', 'keywords'}.
    """

    def __init__(self, subscriber_config='./config/subscriber.xml'):
        self.subscriber_config = None
        self.subscribers = {}
        if subscriber_config is not None:
            self.subscriber_config = subscriber_config
            self.load()

    def show(self):
        """Print a one-line summary per subscriber.

        BUG FIX: the original tested `self.subscribers is None`, but the
        attribute is always a dict, so an empty config printed nothing at
        all; test for emptiness instead.
        """
        if not self.subscribers:
            print('No subscriber found!')
        else:
            for name in self.subscribers:
                print('Name:', name, 'Email:', self.subscribers[name]['email'])

    def load(self, path=None):
        """Parse the subscriber XML at *path* (default: the configured
        file) and rebuild self.subscribers."""
        if path is None:
            path = self.subscriber_config
        if path is None:
            return None
        with open(path, 'r') as f:
            xml = f.read()
        parser = simple_parser()
        parser.feed(xml)
        tree = parser.root
        subscribers = {}
        for person in tree.childs:
            person_name = None
            person_email = None
            person_topics = []
            person_keywords = []
            for item in person.childs:
                if item.name == 'name':
                    person_name = item.data
                elif item.name == 'email':
                    person_email = item.data
                elif item.name == 'topics':
                    for topic in item.childs:
                        if topic.name == 'topic':
                            person_topics.append(topic.data)
                elif item.name == 'keywords':
                    for keyword in item.childs:
                        if keyword.name == 'keyword':
                            person_keywords.append(keyword.data)
            # name and email are mandatory.  (The original also tested
            # `person_topics is not None`, which is always true for a
            # list, so it is dropped here.)
            if person_name is not None and person_email is not None:
                subscribers[person_name] = {
                    'keywords': person_keywords,
                    'email': person_email,
                    'topics': person_topics,
                }
        self.subscribers = subscribers
        print('Subscriber load success! All subscribers are shown below:')
        self.show()

    def get_subscribed_topics(self):
        """Union of every subscriber's topics, as a set."""
        topics = []
        for name in self.subscribers:
            topics += self.subscribers[name]['topics']
        return set(topics)

    def get_subscribed_keywords(self):
        """Union of every subscriber's keywords, as a set."""
        keywords = []
        for name in self.subscribers:
            keywords += self.subscribers[name]['keywords']
        return set(keywords)

    def get_keywords_of_topics(self):
        """Map topic -> concatenated keyword lists of everyone who
        subscribed to that topic (duplicates preserved)."""
        keywords_of_topics = {}
        for name in self.subscribers:
            subscriber = self.subscribers[name]
            for topic in subscriber['topics']:
                if topic not in keywords_of_topics:
                    keywords_of_topics[topic] = []
                keywords_of_topics[topic] += subscriber['keywords']
        return keywords_of_topics
+
+
if __name__ == '__main__':
    # Manual smoke test: load the default config and dump every query.
    manager = subscribe_manager()
    for result in (manager.subscribers,
                   manager.get_subscribed_topics(),
                   manager.get_subscribed_keywords(),
                   manager.get_keywords_of_topics()):
        print(result)
\ No newline at end of file
diff --git a/try.py b/try.py
new file mode 100644
index 0000000..809cc6e
--- /dev/null
+++ b/try.py
@@ -0,0 +1,21 @@
+import arxiv_service
+import time
+
# Scratch script exercising arxiv_service's cron machinery.
now = arxiv_service.cron_time(time.localtime(time.time()))
# now.show()
# while True:
#     now.next_day()
#     now.show()

# running time
# minute hour day month week year
# * means always.
# a-b means from a to b (a and b included)
# a means run at this time.
# must match all to execute a command.

# Feb 29 at midnight: presumably exercises leap-year handling of
# next_run() — confirm against arxiv_service.cron_expr.
schedule = arxiv_service.cron_expr('0 0 29 2 * *')
for i in range(10):
    now = schedule.next_run(now)
    now.show()
    print(now.to_struct_time())
\ No newline at end of file