From 1ac6d0bb9c58aef5b62f45337eb8e3b8487fa77f Mon Sep 17 00:00:00 2001 From: zmy Date: Mon, 23 Dec 2019 12:42:31 +0800 Subject: [PATCH] initial commit --- .gitignore | 7 + README.md | 22 +- analysis_paper.py | 65 ++++++ arxiv_bot.py | 85 ++++++++ arxiv_service.py | 31 +++ arxiv_spider.py | 369 ++++++++++++++++++++++++++++++++++ config-examples.py | 9 + config/style.css | 109 ++++++++++ config/subscriber_example.xml | 27 +++ download_html.py | 10 + email_sender.py | 138 +++++++++++++ feeds.py | 103 ++++++++++ lib/console.py | 304 ++++++++++++++++++++++++++++ lib/parallel.py | 127 ++++++++++++ lib/parser.py | 151 ++++++++++++++ lib/screen.py | 19 ++ lib/service.py | 244 ++++++++++++++++++++++ lib/try.py | 15 ++ lib/utils.py | 139 +++++++++++++ main.py | 40 ++++ subscriber_utils.py | 92 +++++++++ try.py | 21 ++ 22 files changed, 2126 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 analysis_paper.py create mode 100644 arxiv_bot.py create mode 100644 arxiv_service.py create mode 100644 arxiv_spider.py create mode 100644 config-examples.py create mode 100644 config/style.css create mode 100644 config/subscriber_example.xml create mode 100644 download_html.py create mode 100644 email_sender.py create mode 100644 feeds.py create mode 100644 lib/console.py create mode 100644 lib/parallel.py create mode 100644 lib/parser.py create mode 100644 lib/screen.py create mode 100644 lib/service.py create mode 100644 lib/try.py create mode 100644 lib/utils.py create mode 100644 main.py create mode 100644 subscriber_utils.py create mode 100644 try.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fbd985f --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.pyc +__pycache__/ +cache/ +feeds/ +config/email_session.xml +config/subscriber.xml +config.py \ No newline at end of file diff --git a/README.md b/README.md index eec0658..52836d4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ -# ArxivRobot +# What is This? 
"""Offline analysis of cached yearly arXiv feeds.

Counts how many papers each author published across the cached years and
prints authors sorted by paper count (descending).  Expects pickled lists
of ``arxiv_paper`` objects under ``./feeds/<year>`` (as produced by
``arxiv_spider.get_yearly_papers``).
"""
from collections import Counter

from arxiv_spider import arxiv_paper  # kept so pickle can resolve the class
# BUG FIX: there is no top-level utils module; utils lives in lib/ and every
# sibling file imports it as `from lib import utils`.
from lib import utils

YEARS = ['2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']
FEED_DIR = './feeds/'


def count_authors(papers):
    """Return a Counter mapping normalized author name -> number of papers."""
    counts = Counter()
    for paper in papers:
        for author in paper.info['authors']:
            counts[utils.delete_n(author)] += 1
    return counts


def main():
    authors = Counter()
    for year in YEARS:
        print('Analysising year:', year)
        papers = utils.load_python_object(FEED_DIR + year)
        authors += count_authors(papers)
    # most_common() yields (name, count) already sorted by descending count,
    # replacing the previous numpy argsort round-trip.
    for name, freq in authors.most_common():
        # encode() keeps the print robust on consoles without unicode support.
        print('Name: {0} | papers: {1}'.format(name, freq).encode('utf-8'))


if __name__ == '__main__':
    main()
import arxiv_spider
import os
import time
from lib import utils

# cache tree:
# cache_root
#   - topic-caches
#     - feed_$(time).arxiv_feed
#     - feed_year_$(year).arxiv_feed

class arxiv_bot():
    """Fetches today's arXiv papers for a set of topics, caching each day's
    feed on disk so repeated runs do not hit the network again."""

    def __init__(self, topics, cache_dir='./cache', arxiv_site='https://arxiv.org', log=False):
        self.log = log
        self.site = arxiv_site
        self.topics = []
        self.spiders = {}
        self.cache_dir = cache_dir
        self.topic_caches = {}
        # BUG FIX: these attributes were read by get_interested_paper() but
        # never initialized, raising AttributeError on its first call.
        self.today = None
        self.today_feed = None
        if not os.path.isdir(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.update_topics(topics)

    def update_topics(self, topics):
        """Register new topics: create their cache dirs and one spider each.

        Already-known topics are skipped, so this is safe to call repeatedly.
        """
        for topic in topics:
            if topic in self.topics:
                continue
            self.topics.append(topic)
            if self.log:
                print('Adding topic {0}.'.format(topic))
            topic_cache = os.path.join(self.cache_dir, topic)
            self.topic_caches[topic] = topic_cache
            if not os.path.isdir(topic_cache):
                if self.log:
                    print('creating topic dir:', topic_cache)
                os.makedirs(topic_cache)
            self.spiders[topic] = arxiv_spider.arxiv_spider(topic, self.site)

    def get_today_feed(self):
        """Return {topic: [arxiv_paper, ...]} for today.

        Loads the feed from the per-topic cache when present; otherwise
        downloads it (including each paper's abstract) and caches it.
        """
        today_feed = {}
        today = utils.str_day()
        for topic in self.topics:
            today_feed_name = 'feed_' + today + '.arxiv_daily_feed'
            today_feed_path = os.path.join(self.cache_dir, topic, today_feed_name)
            if os.path.exists(today_feed_path):
                topic_feed = utils.load_python_object(today_feed_path)
            else:
                topic_feed = self.spiders[topic].get_today_paper()
                print('Fetching topic {0} papers...'.format(topic))
                for paper in topic_feed:
                    if self.log:
                        print('download abstract for paper', paper.info['title'])
                    paper.download_abstract()
                utils.save_python_object(topic_feed, today_feed_path)
            today_feed[topic] = topic_feed
        return today_feed

    def get_interested_paper(self, topic, keywords):
        """Split today's papers for `topic` into (strong, weak) matches.

        A strong match has a keyword in the title, a weak match only in the
        abstract.  Keywords are expected to be lowercase.
        """
        # BUG FIX: day comparison used `is not` (identity) on strings.
        if self.today_feed is None or utils.str_day() != self.today:
            self.today_feed = self.get_today_feed()
            self.today = utils.str_day()
            print('Updating daily feed.')

        # BUG FIX: get_today_feed() stores a flat list per topic, but this
        # code previously iterated it as a {day: [papers]} dict and crashed.
        topic_papers = list(self.today_feed[topic])
        strong = []
        weak = []
        for paper in topic_papers:
            strong_match = False
            weak_match = False
            for keyword in keywords:
                if paper.info['title'].lower().find(keyword) != -1:
                    strong_match = True
                    break
                elif paper.info['abstract'].lower().find(keyword) != -1:
                    weak_match = True
            if strong_match:
                strong.append(paper)
            elif weak_match:
                weak.append(paper)
        return strong, weak
import requests
import pickle
import time
from lib import utils
from lib.parser import dom_node, simple_parser

import socket
import socks

# Optional SOCKS5 proxy support; disabled by default.
use_proxy = False
if use_proxy:
    SOCKS5_PROXY_HOST = '127.0.0.1'
    SOCKS5_PROXY_PORT = 1080
    default_socket = socket.socket
    socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
    socket.socket = socks.socksocket


class arxiv_paper():
    """One arXiv paper: an id plus an info dict with keys
    title / authors / comments / subjects / abstract."""

    def __init__(self, arxiv_id=None, paper_info=None):
        self.arxiv_id = arxiv_id
        self.info = paper_info

    def add_author(self, author):
        # BUG FIX: previously appended the undefined name `authors`,
        # raising NameError whenever this method was called.
        self.info['authors'].append(author)

    def title(self):
        return self.info['title']

    def describe(self):
        """Return a human-readable multi-line description of the paper."""
        information = ''
        information += 'ID: {0} (https://arxiv.org/abs/{0})\n'.format(self.arxiv_id)
        for key in self.info:
            if self.info[key] is not None:
                info = utils.formal_text(self.info[key])
                information += ('\t' + key + ':' + str(info) + '\n')
        return information

    def show(self):
        print(self.describe())

    def to_html(self):
        """Render the paper as an HTML fragment built from custom tags
        (paper-title, paper-authors, ...) styled by config/style.css."""
        dom_tree = dom_node(name='paper-section')
        paper_title = None
        paper_link = None
        paper_authors = None
        paper_comments = None
        paper_subjects = None
        paper_abstract = None
        for key in self.info:
            if self.info[key] is None:
                continue
            if key == 'title':
                paper_title = dom_node('paper-title')
                link_node = dom_node('a', {'href': 'https://arxiv.org/abs/{0}'.format(self.arxiv_id)})
                link_node.data = self.info[key]
                paper_title.add_child(link_node)
                paper_link = dom_node('paper-pdf-link')
                pdf_link = dom_node('a', {'href': 'https://arxiv.org/pdf/{0}'.format(self.arxiv_id)})
                pdf_link.data = '{0} | [pdf]'.format(self.arxiv_id)
                paper_link.add_child(pdf_link)
            elif key == 'authors':
                paper_authors = dom_node('paper-authors')
                paper_authors.data = ', '.join(self.info[key])
            elif key == 'comments':
                paper_comments = dom_node('paper-comments')
                paper_comments.data = self.info[key]
            elif key == 'subjects':
                paper_subjects = dom_node('paper-subjects')
                paper_subjects.data = self.info[key]
            elif key == 'abstract':
                paper_abstract = dom_node('paper-abstract')
                paper_abstract.data = self.info[key]
        dom_tree.add_child(paper_title)
        dom_tree.add_child(paper_link)
        dom_tree.add_child(paper_authors)
        dom_tree.add_child(paper_abstract)
        dom_tree.add_child(paper_comments)
        dom_tree.add_child(paper_subjects)
        return dom_tree.to_string()

    def download_abstract(self, forcemode=False):
        """Fetch the abstract from the paper's abs page (og:description meta
        tag).  Skips the download when already present unless `forcemode`."""
        if not forcemode and self.info['abstract'] is not None:
            return
        r = requests.get('https://arxiv.org/abs/' + self.arxiv_id)
        parser = simple_parser()
        parser.feed(r.text)
        for meta_node in parser.root.search('meta'):
            meta_attr = meta_node.attributes
            if meta_attr.get('property') == 'og:description':
                self.info['abstract'] = utils.formal_text(meta_attr['content'])
                return


class arxiv_list_parser():
    """Parses an arxiv.org /list/ HTML page (dt/dd pairs) into papers."""

    def __init__(self, html_page):
        self.html_page = html_page
        self.parser = simple_parser()
        self.parser.feed(html_page)
        self.tree = self.parser.root

    def get_arxiv_id(self, dt_node):
        """Extract the arXiv id from a <dt> node, or None when empty."""
        if len(dt_node.childs) == 0:
            return None
        arxiv_id = dt_node.childs[1].childs[0].attributes['href']
        return arxiv_id.split('/')[-1]

    def get_paper_info(self, dd_node):
        """Build the paper-info dict from a <dd> node, or None when empty."""
        if len(dd_node.childs) == 0:
            return None
        title = None
        authors = []
        comments = None
        subjects = None
        for element in dd_node.childs[0].childs:
            element_class = element.attributes.get('class')
            if element_class == 'list-title mathjax':
                title = utils.formal_text(element.data)
            elif element_class == 'list-authors':
                for child in element.childs:
                    if child.name == 'a':
                        authors.append(utils.formal_text(child.data))
            elif element_class == 'list-comments mathjax':
                comments = utils.formal_text(element.data)
            elif element_class == 'list-subjects':
                subjects = utils.formal_text(element.data)
        return {
            'title': title,
            'authors': authors,
            'comments': comments,
            'subjects': subjects,
            'abstract': None,  # filled in later by download_abstract()
        }

    def get_papers(self):
        """Pair up <dt>/<dd> nodes and return the resulting arxiv_paper list."""
        papers = []
        for dt, dd in zip(self.tree.search('dt'), self.tree.search('dd')):
            arxiv_id = self.get_arxiv_id(dt)
            if arxiv_id is None:
                continue
            paper_info = self.get_paper_info(dd)
            if paper_info is None:
                continue
            papers.append(arxiv_paper(arxiv_id, paper_info))
        return papers

    def get_paper_num(self):
        """Read the total entry count from the page's <small> note."""
        totally_paper_node = self.tree.search('small')[0].data
        for split in totally_paper_node.split(' '):
            if split.isdigit():
                return int(split)
        return 0

    def get_recent_info(self):
        """Return {day_name: {'start': skip_index, 'num': paper_count}} for
        each day listed on a /recent page."""
        day_name = []
        day_start = []
        li_nodes = self.tree.search('ul')[0].childs
        for li in li_nodes:
            link = li.childs[0].attributes['href']
            if link.find('#item') != -1:
                start = link.split('#')[-1][4:]
            else:
                start = link.split('=')[-2].split('&')[0]
            day_name.append(li.childs[0].data)
            day_start.append(int(start))
        num_total = self.get_paper_num()
        # Per-day counts: difference between consecutive start offsets;
        # the last day runs to the end of the listing.
        num_days = len(day_start)
        day_num = []
        for i in range(num_days):
            if i < num_days - 1:
                day_num.append(day_start[i + 1] - day_start[i])
            else:
                day_num.append(num_total - day_start[i])
        recent_papers_info = {}
        for day, start, num in zip(day_name, day_start, day_num):
            recent_papers_info[day] = {'start': start, 'num': num}
        return recent_papers_info


class arxiv_spider():
    """Downloads paper listings for one topic (e.g. cs.CV) from arXiv."""

    def __init__(self, topic, arxiv_url='https://arxiv.org'):
        self.link = arxiv_url
        self.topic = topic
        self.base_url = self.link + '/list/' + self.topic

    def get_yearly_papers(self, year, log=False):
        """Return all papers listed for `year` (a 'YYYY' string)."""
        yearly_url = self.base_url + '/' + year
        if log:
            print('visiting url [{0}] for basic information'.format(yearly_url))
        r = requests.get(yearly_url)
        list_parser = arxiv_list_parser(r.text)
        total_num = list_parser.get_paper_num()
        print('Total Number for this year:', total_num)
        yearly_url_all = yearly_url + '?skip={0}&show={1}'.format(0, total_num)
        if log:
            print('visiting url [{0}] for all papers'.format(yearly_url_all))
        r = requests.get(yearly_url_all)
        list_parser = arxiv_list_parser(r.text)
        return list_parser.get_papers()

    def get_papers_on_search_list(self, search_url, log=True):
        """Parse an export.arxiv.org Atom search result into papers."""
        if log:
            print('visiting url [{0}] for today papers.'.format(search_url))
        search_content = requests.get(search_url).text
        parser = simple_parser()
        parser.feed(search_content)
        paper_nodes = parser.root.search('entry')
        print('num_searched_nodes:', len(paper_nodes))
        papers = []
        for node in paper_nodes:
            arxiv_id = node.search('id')[0].data.split('/')[-1]
            title = node.search('title')[0].data
            authors = [item.data for item in node.search('name')]
            categories = [item.attributes['term'] for item in node.search('category')]
            subjects = ','.join(categories)
            comments_node = node.search('arxiv:comment')
            comments = comments_node[0].data if comments_node else ''
            abstract = node.search('summary')[0].data

            paper_info = {
                'title': utils.formal_text(title),
                'authors': authors,
                'comments': utils.formal_text(comments),
                'subjects': utils.formal_text(subjects),
                'abstract': utils.formal_text(abstract),
            }
            papers.append(arxiv_paper(arxiv_id, paper_info))
        return papers

    def get_papers_by_ids(self, ids, log=True):
        """Fetch papers via the export API in batches of 10 ids."""
        # BUG-PRONE FLOAT HACK replaced: ceil(len/10) via integer arithmetic.
        num_groups = (len(ids) + 9) // 10
        if log:
            print('spliting into {0} groups.'.format(num_groups))
        papers = []
        for i in range(num_groups):
            this_batch = ids[i * 10:(i + 1) * 10]
            id_list = ','.join(this_batch)
            search_url = 'http://export.arxiv.org/api/query?id_list=' + id_list
            papers += self.get_papers_on_search_list(search_url, log)
        return papers

    def get_today_ids(self, log=True):
        """Return today's paper ids from the topic's RSS feed."""
        rss_url = 'http://export.arxiv.org/rss/{0}'.format(self.topic)
        if log:
            print('visiting url [{0}] for today papers id.'.format(rss_url))
        rss_content = requests.get(rss_url).text
        parser = simple_parser()
        parser.feed(rss_content)
        paper_ids = []
        for node in parser.root.search('rdf:li'):
            paper_link = node.attributes['rdf:resource']
            paper_ids.append(paper_link.split('/')[-1])
        print('num_paper_ids:', len(paper_ids))
        return paper_ids

    def get_today_paper(self, return_day_name=False, log=True):
        """Return today's papers (RSS ids + export API details).

        `return_day_name` is kept for interface compatibility with
        get_today_paper_backup(); it is unused here.
        """
        today_ids = self.get_today_ids(log)
        papers = self.get_papers_by_ids(today_ids)
        print('num of papers:', len(papers))
        return papers

    def get_today_paper_backup(self, return_day_name=False):
        """Older HTML-scraping fallback for get_today_paper()."""
        papers = self.get_recent_papers(recent_days=[1])
        today = None
        paper = None
        for day in papers:
            today = day
            paper = papers[day]
        if return_day_name:
            return paper, today
        return paper

    def get_recent_papers(self, recent_days=(1, 2, 3, 4, 5), log=False):
        """Return {day_name: [papers]} for the selected recent days
        (1 = most recent).  Default changed from a mutable list to a tuple;
        callers may still pass any container of ints."""
        recent_url = self.base_url + '/recent'
        if log:
            print('visiting url [{0}] for basic information'.format(recent_url))
        r = requests.get(recent_url)
        list_parser = arxiv_list_parser(r.text)
        recent_papers_info = list_parser.get_recent_info()
        print('paper info:', recent_papers_info)

        day_id = 1
        papers = {}
        for day in recent_papers_info:
            if day_id in recent_days:
                today_start = recent_papers_info[day]['start']
                today_num = recent_papers_info[day]['num']
                day_url = self.base_url + '/pastweek?skip={0}&show={1}'.format(today_start, today_num)
                if log:
                    print('visiting url [{0}] for paper on day {1}'.format(day_url, day))
                r = requests.get(day_url)
                list_parser = arxiv_list_parser(r.text)
                papers[day] = list_parser.get_papers()
            day_id += 1
        return papers
from lib.parser import dom_node, simple_parser
from lib import parser
from lib import utils
import os
import config

import smtplib
import email
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email.header import Header
from email import generator


def send_mail(reciver, title, content):
    """Send an HTML email via the SMTP-over-SSL server from config.

    Returns True on success and False on any failure, so callers can decide
    whether to record the message as delivered.
    """
    username = config.username
    password = config.password
    replyto = config.replyto
    msg = MIMEMultipart('alternative')
    msg['Subject'] = Header(title)
    msg['From'] = '%s <%s>' % (Header(config.sender_name), username)
    msg['To'] = reciver
    msg['Reply-to'] = replyto
    msg['Message-id'] = email.utils.make_msgid()
    msg['Date'] = email.utils.formatdate()
    msg.attach(MIMEText(content, _subtype='html', _charset='UTF-8'))

    try:
        client = smtplib.SMTP_SSL(config.smtp_ssl_addr)
        client.set_debuglevel(0)
        client.login(username, password)
        client.sendmail(username, reciver, msg.as_string())
        client.quit()
        print('Email send to {0} success!'.format(reciver))
        return True
    except smtplib.SMTPConnectError as e:
        print('Connection Error:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPAuthenticationError as e:
        print('Authentication Error:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPSenderRefused as e:
        print('Sender Refused:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPRecipientsRefused as e:
        # BUG FIX: SMTPRecipientsRefused carries a .recipients dict,
        # not smtp_code/smtp_error.
        print('SMTPRecipients Refused:', e.recipients)
    except smtplib.SMTPDataError as e:
        print('Data Error:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPException as e:
        # BUG FIX: exceptions have no .message attribute on Python 3.
        print('SMTPException:', e)
    except Exception as e:
        print('Unknown error:', str(e))
    # BUG FIX: previously fell through to `return True` after the except
    # blocks, so failed sends were recorded as delivered and never retried.
    return False


class arxiv_emailer():
    """Sends each subscriber at most one digest email per day, tracking the
    last send date per user in an XML session file."""

    def __init__(self, arxiv_bot, feeds_generator, session_file='./config/email_session.xml', debug=False):
        self.debug = debug
        self.email_info = dom_node()
        self.session_file = session_file
        # BUG FIX: default to an empty dict so send_daily_email() works even
        # when no session file is configured (sessions used to stay None).
        self.sessions = {}
        if self.session_file is not None:
            self.load_session()

        self.bot = arxiv_bot
        self.feeds = feeds_generator

    def send_daily_email(self):
        """Generate today's digests and send to users not yet served today."""
        emails = self.feeds.generate_daily_emails()
        today = utils.str_day()
        for name in emails:
            # renamed from `email` to avoid shadowing the email module used above
            email_item = emails[name]
            send = False
            if name not in self.sessions:
                print('New user found!')
                self.sessions[name] = {'last-send': today}
                send = True

            if self.sessions[name]['last-send'] != today:
                send = True

            if not send:
                print('skipping user {0} since already sent!'.format(name))
                continue
            print('Sending email to user {0} [{1}]'.format(name, email_item['reciver']))
            print('reciver:', email_item['reciver'])
            print('title:', email_item['title'])
            print('content:', len(email_item['content']))
            success = False
            if not self.debug:
                success = send_mail(email_item['reciver'], email_item['title'], email_item['content'])
            if success:
                # Only mark as sent when delivery actually succeeded.
                self.sessions[name]['last-send'] = today
                self.save_session()

    def load_session(self, session_file=None):
        """Load {user: {'last-send': day}} from the session XML file."""
        if session_file is None:
            session_file = self.session_file
        with open(session_file, 'r') as f:
            xml = f.read()
        xmlparser = simple_parser()
        xmlparser.feed(xml)
        sessions = parser.dom2dict(xmlparser.root)
        # dom2dict wraps the payload under a 'root' key when present.
        sessions = sessions['root'] if 'root' in sessions else {}
        self.sessions = sessions
        print(self.sessions)
        return sessions

    def save_session(self, session_file=None):
        """Write the session dict back as XML; no-op when no file is set."""
        if session_file is None:
            session_file = self.session_file
        if session_file is None:
            return None
        xml = parser.dict2dom(self.sessions).to_string()
        with open(session_file, 'w') as f:
            f.write(xml)
        return xml


if __name__ == '__main__':
    # Manual smoke test for the session round-trip (uses the default file).
    # BUG FIX: the old debug code passed session_file=None and then called
    # load_session(), which crashed on open(None).
    emailer = arxiv_emailer(None, None, debug=True)
    print(emailer.load_session())
    print(emailer.save_session())
class feed_manager():
    """Turns today's downloaded papers into per-subscriber HTML digests."""

    def __init__(self, submgr, arxivbot, style='./config/style.css'):
        self.style_path = style
        self.style = ''
        self.bot = arxivbot
        self.submgr = submgr
        self.update_style()

    def update_style(self, path=None):
        """(Re)load the CSS block prepended to every generated email."""
        if path is None:
            path = self.style_path
        print('loading style from:', path)
        with open(path, 'r') as f:
            self.style = f.read()
        self.style += '\n'

    def fetch_today_feed(self):
        """Refresh the {topic: [papers]} cache from the bot."""
        self.today_feed = self.bot.get_today_feed()

    def filter_papers_for_user(self, subscriber):
        """Return (strong, weak) keyword matches among the subscriber's topics.

        Strong = keyword in the title, weak = keyword only in the abstract.
        Papers appearing under several subscribed topics are de-duplicated
        by arXiv id.  Keywords are expected to be lowercase.
        """
        keywords = subscriber['keywords']
        papers = []
        for topic in subscriber['topics']:
            if topic in self.today_feed:
                papers += self.today_feed[topic]
            else:
                print('Warning: topic {0} is subscribed but not downloaded!'.format(topic))
        # De-duplicate by id, keeping first occurrence.  A set replaces the
        # previous O(n^2) membership scan over a list.
        seen_ids = set()
        unique_papers = []
        for paper in papers:
            if paper.arxiv_id not in seen_ids:
                unique_papers.append(paper)
                seen_ids.add(paper.arxiv_id)
        print('removing {0} repeated papers.'.format(len(papers) - len(unique_papers)))
        strong_papers = []
        weak_papers = []
        for paper in unique_papers:
            strong = False
            weak = False
            for keyword in keywords:
                if paper.info['title'].lower().find(keyword) != -1:
                    strong = True
                    break
                elif paper.info['abstract'].lower().find(keyword) != -1:
                    weak = True
            if strong:
                strong_papers.append(paper)
            elif weak:
                weak_papers.append(paper)
        return strong_papers, weak_papers

    def generate_group_feed(self, paper_groups):
        """Render {'group name': [papers]} as HTML, one header per group."""
        group_html = ''
        for key in paper_groups:
            header = dom_node('paper-group')
            header.data = key
            group_html += header.to_string() + '\n'
            for paper in paper_groups[key]:
                group_html += paper.to_html() + '\n'
        return group_html

    def generate_daily_feed_by_matched_paper(self, strong_interested, weak_interested):
        """Group matched papers into labelled sections and render them."""
        feeds = {}
        if len(strong_interested) > 0:
            feeds['Strong Interested Paper'] = strong_interested
        if len(weak_interested) > 0:
            feeds['Weak Interested Paper'] = weak_interested
        return self.generate_group_feed(feeds)

    def generate_daily_email_by_matched_paper(self, strong_interested, weak_interested):
        """Return a styled HTML email body, or '' when nothing matched."""
        xml_feed = self.generate_daily_feed_by_matched_paper(strong_interested, weak_interested)
        if xml_feed == '':
            return ''
        return self.style + xml_feed

    def generate_daily_emails(self):
        """Return {subscriber_name: {'reciver', 'title', 'content'}} covering
        every subscriber with at least one matching paper today."""
        self.fetch_today_feed()
        emails = {}
        today = utils.str_day()
        for name in self.submgr.subscribers:
            subscriber = self.submgr.subscribers[name]
            strong, weak = self.filter_papers_for_user(subscriber)
            content = self.generate_daily_email_by_matched_paper(strong, weak)
            reciver = subscriber['email']
            if content == '':
                print('Skipping user {0} [{1}] since no paper matched.'.format(name, reciver))
                continue
            emails[name] = {
                'reciver': reciver,
                'title': "Your Interested Paper On Arxiv Today ({0})".format(today),
                'content': content,
            }
        return emails
new alias.', + kind='sys' + ) + self.regist( + 'os', + action=self.command_os, + help_info='run a system command.', + kind='sys' + ) + + + def translate_command(self, command): + while command in self.alias and command not in self.commands: + command = self.alias[command] + return command + + def find_equal_command(self, command, ret_type = str, ignored = []): + finished = [] + new = [] + + cmds = [command] + while len(finished) != len(cmds): + # find child + if command in self.alias: + if self.alias[command] not in cmds: + cmds.append(self.alias[command]) + # find fathers + for al in self.alias: + if self.alias[al] == command: + if al not in cmds: + cmds.append(al) + # found finished. + finished.append(command) + for cmd in cmds: + if cmd not in finished: + command = cmd + + + if ret_type is str: + finished = utils.list2csv(finished) + return finished + + + + def get_alias(self, command, ret_type=str): + alias = [] + for al in self.alias: + if self.alias[al] == command: + alias.append(al) + + if ret_type is str: + alias = utils.list2csv(alias) + + return alias + + def command_exist(self, command): + if command in self.commands or command in self.alias: + return True + else: + return False + + def add_alias(self, command, alias): + if self.command_exist(alias): + if warn_level >= 3: + print('Alias {0} will not be added since already used'.format(al)) + else: + self.alias[alias] = command + + # kind: standard or shared + # standard: help info will be displayed + # shared: help info will not be displayed in sub command. 
+ def regist(self, command, action, alias=None, help_info='no help provided.', kind='standard'): + if type(action) == console: + action.is_child = True + action.father = self + exist = self.command_exist(command) + if exist: + if self.warn_level >=3: + print('Command {0} will not be added sinece already exist.'.format(command)) + return + + if type(alias) is list: + for al in alias: + self.add_alias(command, al) + elif type(alias) is str: + self.add_alias(command, alias) + elif alias is None: + pass + else: + if self.warn_level > 3: + print('Unknown alias type, no alias will be added.') + self.commands[command] = {} + self.commands[command]['action'] = action + self.commands[command]['help'] = help_info + self.commands[command]['kind'] = kind + + def handle_command(self, command, args): + if command in self.commands: + act = self.commands[command]['action'] + try: + act(args) + except KeyboardInterrupt: + pass + except: + print('Exception occured while processing command \"{0} {1}\".'.format(command, args)) + print('More information are shown below.\n', traceback.format_exc()) + else: + print('Unknown command \"{0}\"'.format(command)) + + # seperate command and its args. 
+ def parse_command(self, string): + string += ' ' + length = len(string) + command_end = 0 + parse_start = False + for i in range(length): + blank = utils.is_blank(string[i]) + if not blank: + parse_start=True + if parse_start and blank: + command_end = i + break + + command = string[:command_end] + command = utils.remove_blank_in_endpoint(command) + args = utils.remove_blank_in_endpoint(string[command_end:]) + return command, args + + def parse(self, string): + command, args = self.parse_command(string) + exitsted_commands = [] + while command in self.alias: + if command not in exitsted_commands: + exitsted_commands.append(command) + command = self.alias[command] + string = command + ' ' + args + command, args = self.parse_command(string) + else: + break + + return command, args + + + def show_help_info(self, command, prefix, indent, depth=0): + command = self.translate_command(command) + action = self.commands[command]['action'] + kind = self.commands[command]['kind'] + if kind == 'sys' and depth > 0: + return + alias = self.get_alias(command, ret_type=str) + if alias != '': + print('{0}{1}({2}):'.format(prefix, command, alias)) + else: + print('{0}{1}:'.format(prefix, command)) + print('{0}{1}{2}'.format(prefix, indent, self.commands[command]['help'])) + if type(action) == console: + action.command_help('', prefix=prefix+indent, indent=indent, depth=depth+1) + + def debug_log(self, command, args): + if self.debug: + print('command:[{0}] args:[{1}]'.format(command, args)) + + def command_exit_console(self, args): + if not self.is_child: + print(self.exit_info) + self.exit_flag = True + + def command_clear_screen(self, args): + if self.platform == 'Windows': + os.system('cls') + elif self.platform == 'Linux': + os.system('clear') + return False + + def command_help(self, args, prefix = '', indent=' ', depth=0): + command, args = self.parse_command(args) + if command is not "": + if self.command_exist(command): + self.show_help_info(command, prefix, indent, depth) 
+ else: + print('Unknown command \"{0}\"'.format(command)) + else: + for command in self.commands: + self.show_help_info(command, prefix, indent, depth) + + def command_alias(self, args): + alias_parse = args.split('=') + if len(alias_parse) == 2: + alias = utils.remove_blank_in_endpoint(alias_parse[0]) + command = utils.remove_blank_in_endpoint(alias_parse[1]) + if command is not '': + self.alias[alias]=command + else: + del self.alias[alias] + elif args == '': + for alias in self.alias: + print('{0}={1}'.format(alias, self.alias[alias])) + elif len(alias_parse) == 1: + if args in self.alias: + print('{0}={1}'.format(args, self.alias[args])) + equal_alias = self.find_equal_command(args) + if equal_alias != '': + print('Hint: {0} are all equivalent.'.format(equal_alias)) + elif args in self.commands: + als = self.get_alias(args, ret_type=str) + if als == '': + print('command {0} has no alias.'.format(args)) + else: + print('command {0} is aliased as {1}'.format(args, als)) + equal_alias = self.find_equal_command(args) + if equal_alias != '' and equal_alias != args: + print('Hint: {0} are all equivalent.'.format(equal_alias)) + else: + print('No alias \"{0}\" found.'.format(args)) + else: + print('Syntax error, command not understood.') + + def command_os(self, args): + if args == '': + print('please specify os command') + else: + os.system(args) + + def execute(self, string): + command, args = self.parse(string) + if command is not "": + self.handle_command(command, args) + + def __call__(self, args): + if args != '': + self.execute(args) + else: + self.exit_flag=False + self.interactive() + + def interactive(self): + while not self.exit_flag: + try: + input_str = input(self.get_hint()) + self.execute(input_str) + except(KeyboardInterrupt): + print('') + + +if __name__ == '__main__': + con = console() + con_sub = console() + con_sub_sub = console() + con_sub.regist('test_subsubcommand', con_sub_sub, alias='tss', help_info='A sub command.') + 
con.regist('test_subcommand', con_sub, alias='ts', help_info='A sub command.') + con.interactive() \ No newline at end of file diff --git a/lib/parallel.py b/lib/parallel.py new file mode 100644 index 0000000..1b1fe74 --- /dev/null +++ b/lib/parallel.py @@ -0,0 +1,127 @@ +import threading +import queue +import time + +class Job(): + def __init__(self, func, args=[], kwargs={}, name=None): + if name == None: + name = 'job' + self.id = None + self.name = name + self.func = func + self.args = args + self.kwargs = kwargs + self.results = None + + def run(self): + self.results = self.func(*self.args, **self.kwargs) + + def set_name(self, name): + self.name = name + + def set_id(self, jid): + self.id = jid + + def __call__(self): + self.run() + +class Worker(threading.Thread): + def __init__(self, work_queue, finished_queue): + super(Worker, self).__init__() + self.queue = work_queue + self.finished = finished_queue + self.terminate = False + self.daemon=True + + def stop(self): + self.terminate = True + + def run(self): + while not self.terminate: + try: + task = self.queue.get(timeout=1) + task.run() + self.queue.task_done() + self.finished.put(task) + except queue.Empty: + pass + except KeyboardInterrupt: + print("you stop the threading") + +class ParallelHost(): + def __init__(self, num_threads=8): + self.num_threads = num_threads + self.workers = [] + self.tasks = queue.Queue() + self.results = queue.Queue() + self.rets = {} + self.id = 0 + for i in range(self.num_threads): + worker = Worker(self.tasks, self.results) + self.workers.append(worker) + for worker in self.workers: + worker.start() + + def __del__(self): + self.stop('kill') + + # soft stop: wait until all job done + # hard stop: stop even with unfinished job + # kill stop: whatever the thread is doing, exit. 
+ def stop(self, mode='soft'): + print('Trying to stop.') + if mode == 'soft': + self.tasks.join() + print('All job finished.') + for worker in self.workers: + worker.stop() + if mode == 'kill': + worker.join(0.01) + + def commit(self, job): + self.id += 1 + job.set_id(self.id) + self.tasks.put(job) + return self.id + + def add_job(self, func, args=[], kwargs={}, name=None): + job = Job(func, args, kwargs, name) + return self.commit(job) + + def collect_all(self): + while not self.results.empty(): + task = self.results.get() + jid = task.id + self.rets[jid] = task.results + + def get_result(self, jid, block=False): + if jid in self.rets: + ret = self.rets[jid] + del self.rets[jid] + return ret + while True: + if self.results.empty() and not block: + break + task = self.results.get() + if task.jid == jid: + return task.results + else: + self.rets[task.jid] = task.results + + def clear_results(self): + while not self.results.empty(): + self.results.get() + self.rets = {} + +if __name__ == '__main__': + host = ParallelHost() + + def loop_print(info, num): + for i in range(num): + print(info + ':' + str(i)) + time.sleep(1) + + for i in range(10): + host.add_job(loop_print, ["loop_print_{0}".format(i), 5]) + + host.terminate('kill') diff --git a/lib/parser.py b/lib/parser.py new file mode 100644 index 0000000..d6b1504 --- /dev/null +++ b/lib/parser.py @@ -0,0 +1,151 @@ +from html.parser import HTMLParser +from . 
import utils + +def dict_to_arrtibute_string(attributes): + string = '' + for key in attributes: + string += key + '=\"{0}\";'.format(str(attributes[key])) + return string + +def attribute_string_to_dict(attrs): + attr_dict = {} + for attr in attrs: + attr_dict[attr[0]] = attr[1] + return attr_dict + + +class dom_node(): + def __init__(self, name = None, attributes = {}): + if name is not None: + self.name = name + else: + self.name = 'Node' + + self.attributes = attributes + self.childs = [] + self.data = None + self.father = None + + def add_child(self, child): + if child is not None: + child.father = self + self.childs.append(child) + + def to_string(self, prefix='', indent=' '): + + string = prefix + '<' + self.name + if self.attributes: + string += ' ' + dict_to_arrtibute_string(self.attributes) + string += '>\n' + + for child in self.childs: + string += child.to_string(prefix=prefix+indent, indent=indent) + + if self.data is not None: + string += prefix + indent + self.data + '\n' + + string += prefix + '\n'.format(self.name) + + return string + + + def has_child(self, name): + has = False + for child in self.childs: + if child.name == name: + has = True + break; + return has + + def search(self, name): + founded_node = [] + if type(name) is list: + if self.name in name: + founded_node.append(self) + else: + if self.name == name: + founded_node.append(self) + for child in self.childs: + search_result = child.search(name) + founded_node += search_result + return founded_node + +def dict2dom(d, root_name='root'): + node = dom_node(root_name) + for key in d: + elem = d[key] + child_node = dom_node(name=str(key)) + if type(elem) is dict: + child_node = dict2dom(elem, root_name=str(key)) + elif type(elem) is list: + for subelem in elem: + if type(subelem) is dict: + sub_node = dict2dom(subelem, root_name='li') + child_node.add_child(sub_node) + else: + sub_node = dom_node('li') + sub_node.data = str(subelem) + child_node.add_child(sub_node) + else: + 
child_node.data = str(elem) + node.add_child(child_node) + return node + +# if a dom node has data only, then it's {'name':'data'} +# if a dom node has childs, then it's {'name':{}} +# if a dom node has data as well as childs, data will be ignored. +# if a dom has multi child with same name, it will be stored as list. +def dom2dict(dom, replace_li = True): + dictionary = {} + for child in dom.childs: + name = child.name + content = None + if len(child.childs) != 0: + content = dom2dict(child, replace_li) + else: + content = child.data + if content is None: + content = '' + content = utils.clean_text(content) + if name in dictionary: + if type(dictionary[name]) is not list: + previous = dictionary[name] + dictionary[name] = [previous, content] + else: + dictionary[name].append(content) + else: + dictionary[name] = content + + if replace_li: + for key in dictionary: + item = dictionary[key] + if type(item) is dict: + li = None + if len(item.keys()) == 1: + for subkey in item: + if subkey == 'li': + li = item[subkey] + if li is not None: + dictionary[key] = li + return dictionary + +class simple_parser(HTMLParser): + def __init__(self): + super(simple_parser, self).__init__() + self.root = dom_node('root') + self.current_node = self.root + + def handle_starttag(self, tag, attrs): + attrs_dict = attribute_string_to_dict(attrs) + this_node = dom_node(tag, attrs_dict) + self.current_node.add_child(this_node) + self.current_node = this_node + + def handle_endtag(self, tag): + self.current_node = self.current_node.father + + def handle_data(self, data): + if self.current_node.data is None: + self.current_node.data = data + else: + self.current_node.data += data \ No newline at end of file diff --git a/lib/screen.py b/lib/screen.py new file mode 100644 index 0000000..63e6191 --- /dev/null +++ b/lib/screen.py @@ -0,0 +1,19 @@ +import sys + +class VirtualScreen(): + def __init__(self, max_history=1000): + self.max_history = max_history + self.contents = [] + + def write(self, 
message): + self.contents.append(message) + + def last(self, line=10, output=sys.stdout): + num_lines = len(self.contents) + start_line = num_lines - line + if start_line < 0: + start_line = 0 + display = self.contents[start_line:] + for line in display: + output.write(line) + output.write('\n') \ No newline at end of file diff --git a/lib/service.py b/lib/service.py new file mode 100644 index 0000000..3b5f4f8 --- /dev/null +++ b/lib/service.py @@ -0,0 +1,244 @@ +import time +import sys +import shlex +import argparse + +from croniter import croniter +from . import utils +from . import parallel +from . import console +from . import screen +from . import utils + +class service(): + def __init__(self, action, args=[], kwargs={}, cron='* * * * *', managed_output=False, name='service'): + self.name = name + self.action = action + self.managed_output = managed_output + self.args = args + self.kwargs = kwargs + self.output = sys.stdout + self.last_result = None + self.cronexpr = cron + self.croniter = croniter(self.cronexpr, time.time()) + self.next_time = self.croniter.get_next() + + def run(self, daemon=None, dry=False): + if not dry: + self.next_time = self.croniter.get_next() + + new_args = [] + if self.managed_output: + new_args = [self.output, *self.args] + else: + new_args = self.args + if daemon is None: + self.last_result = self.action(*new_args, **self.kwargs) + else: + daemon.add_job(self.action, new_args, self.kwargs, self.name) + +class ServiceManager(): + def __init__(self, debug=False, output=sys.stdout): + self.debug = debug + self.services = {} + self.deleted_services = {} + self.protected_service = [] + self.daemon = parallel.ParallelHost() + self.sid = 0 + self.terminate = False + self.output = output + + self.set_refresh_time() + + def stop(self): + self.daemon.stop() + self.terminate = True + + def __del__(self): + self.stop() + + def log(self, *args, end='\n'): + self.output.write('[{0}]'.format(utils.str_time())) + for arg in args: + arg = str(arg) 
+ self.output.write(arg) + self.output.write(end) + + def add(self, service, protected=False): + self.sid += 1 + service.output = self.output + self.services[self.sid] = service + if protected: + self.protected_service.append(self.sid) + return self.sid + + def delete(self, sid): + if sid in self.protected_service: + self.log('Can not delete protected service.') + return + if sid in self.services: + self.deleted_services[sid] = self.services[sid] + del self.services[sid] + else: + self.log('The sid [{0}] do not exist!'.format(sid)) + + def recover(self, sid): + if sid in self.deleted_services: + self.services[sid] = self.deleted_services[sid] + del self.deleted_services[sid] + else: + self.log('The sid [{0}] is not found recycle bin.'.format(sid)) + + def set_refresh_time(self, refresh_cron='* * * * *'): + def refresh(): + pass + refresh_service = service(refresh, cron=refresh_cron, name='refresh') + self.add(refresh_service, protected = True) + + def get_next(self): + next_sid = -1 + next_time = -1 + for sid in self.services: + service = self.services[sid] + if service.next_time < next_time or next_sid < 0: + next_sid = sid + next_time = service.next_time + return next_sid, next_time + + def loop(self): + while not self.terminate: + next_sid, next_time = self.get_next() + service = self.services[next_sid] + sleep_time = next_time - time.time() + if sleep_time > 0: + time.sleep(sleep_time) + self.log('Running service {0} (SID={1})'.format(service.name, next_sid)) + if next_sid in self.services: + service.run(self.daemon) + else: + self.log('the sheduled service wiil not run since it is canceled.') + + + # mode: background: return immidietly + # foreground: stuck here. 
+ def start(self, mode='background'): + if mode == 'background': + self.daemon.add_job(self.loop, name='service main loop') + else: + self.loop() + +def get_service_console(manager, name='service'): + + con = console.console(name) + + def command_show(args): + print('Active services:') + for sid in manager.services: + print('SID: {0} | Name: {1}'.format(sid, manager.services[sid].name)) + print('Deleted services:') + for sid in manager.deleted_services: + print('SID: {0} | Name: {1}'.format(sid, manager.deleted_services[sid].name)) + + def command_add(args): + parser = argparse.ArgumentParser() + parser.add_argument('cron', type=str, help='A cron expr') + parser.add_argument('task', type=str, help='task to run, should be a valid command') + parser.add_argument('--name', '-n', type=str, default='command service', help='name of the task') + args = shlex.split(args) + args = parser.parse_args(args) + cron = args.cron + if not croniter.is_valid(cron): + print('Invalid cron expression.') + task = args.task + name = args.name + service_to_add = service(con.execute, args=[task], cron=cron, name=name) + manager.add(service_to_add) + + def command_delete(args): + sid = None + if args.isdigit(): + if int(args) in manager.services: + sid = int(args) + if sid is not None: + manager.delete(sid) + else: + print('command arugment \"{0}\" is not understood.'.format(args)) + + def command_recover(args): + sid = None + if args.isdigit(): + if int(args) in manager.deleted_services: + sid = int(args) + if sid is not None: + manager.recover(sid) + else: + print('command arugment \"{0}\" is not understood.'.format(args)) + + def command_run(args): + sid = None + if args.isdigit(): + if int(args) in manager.services: + sid = int(args) + if sid is not None: + manager.services[sid].run(dry=True) + else: + print('command arugment \"{0}\" is not understood.'.format(args)) + + def command_info(args): + line = None + if args != '': + if args.isdigit(): + line = int(args) + if line is None: + 
line = 10 + manager.output.last(line) + + def command_next(args): + next_sid, next_time = manager.get_next() + info = '' + indent = ' ' + info += 'Next Job: {0}'.format(manager.services[next_sid].name) + info += '\n{0}SID: {1}'.format(indent, next_sid) + info += '\n{0}Scheduled Running Time: {1}'.format(indent, utils.time2str(next_time)) + info += '\n{0}Remeaning Time: {1}s'.format(indent, utils.float2str(next_time-time.time())) + print(info) + + con.regist('show', command_show, help_info='Show all services.', alias=['ls']) + con.regist('run', command_run, help_info='Run a service.') + con.regist('info', command_info, help_info='Display service output log.') + con.regist('next', command_next, help_info='Next job to run.') + con.regist('add', command_add, help_info='Register a command as service.') + con.regist('delete', command_delete, help_info='Delete a service', alias=['del']) + con.regist('recover', command_recover, help_info='Recover a service.') + return con + + +if __name__ == '__main__': + def func1(output): + output.write('func1') + + def func2(output): + output.write('func2') + + def add(a, b): + print('{0} + {1} = {2}'.format(a, b, a+b)) + + def command_add(args): + numbers = args.split(' ') + a = float(numbers[0]) + b = float(numbers[1]) + add(a, b) + + log_screen = screen.VirtualScreen() + manager = ServiceManager(output=log_screen) + test1 = service(func1, cron='* * * * *', name='test1', managed_output=True) + test2 = service(func2, cron='* * * * *', name='test2', managed_output=True) + manager.add(test1) + manager.add(test2) + manager.start('background') + + con = get_service_console(manager) + master = console.console() + master.regist('service', con, help_info='service console') + master.regist('add', command_add, help_info='Add two numbers.') + master.interactive() diff --git a/lib/try.py b/lib/try.py new file mode 100644 index 0000000..64c44b3 --- /dev/null +++ b/lib/try.py @@ -0,0 +1,15 @@ +def func(a, b, c, time=0, work=1): + print('a:{0} b:{1} 
c:{2}'.format(a, b, c)) + print('time:{0} work:{1}'.format(time, work)) + +def funcwrap(func, kargs, kkargs): + func(*kargs, **kkargs) + + +kargs = [1, 2, 3] +kkargs = { + "time":1234, + "work":1232 +} + +funcwrap(func, kargs, kkargs) \ No newline at end of file diff --git a/lib/utils.py b/lib/utils.py new file mode 100644 index 0000000..733075b --- /dev/null +++ b/lib/utils.py @@ -0,0 +1,139 @@ +import pickle +import time +import os +import re +import platform + +def detect_platform(): + p = 'Unknown' + if platform.platform().find('Windows') != -1: + p = 'Windows' + elif platform.platform().find('Linux') != -1: + p = 'Linux' + return p + +def ensure_dir_exist(directory, show_info = True): + exist = os.path.isdir(directory) + if not exist: + print('directory', directory, ' not found, creating...') + os.mkdir(directory) + +def validateTitle(title): + rstr = r"[\/\\\:\*\?\"\<\>\|]" # '/ \ : * ? " < > |' + new_title = re.sub(rstr, " ", title) # 替换为空格 + return new_title + +def list2csv(l): + csv = '' + for item in l: + csv += str(item) + ',' + csv = csv[:-1] + return csv + +def clean_text(string): + if string is None: + return '' + while '\n' in string: + string = string.replace('\n', ' ') + splits = clean_split(string) + string = '' + for split in splits: + string += split + ' ' + string = string[:-1] + return string + +def clean_split(string, delimiter=' '): + sub_strs = string.split(delimiter) + splits = [] + for sub_str in sub_strs: + if sub_str is not '': + splits.append(sub_str) + return splits + +def remove_blank_in_endpoint(string): + length = len(string) + + first_index = 0 + for i in range(length): + if is_blank(string[first_index]): + first_index += 1 + else: + break + + last_index = length - 1 + for i in range(length): + if is_blank(string[last_index]): + last_index -= 1 + else: + break + last_index += 1 + return string[first_index:last_index] + +def is_blank(ch): + blank_ch = [' ', '\t', '\n'] + if ch in blank_ch: + return True + else: + return False + 
+def dict_to_arrtibute_string(attributes): + string = '' + for key in attributes: + string += key + '=\"{0}\";'.format(str(attributes[key])) + return string + +def attribute_string_to_dict(attrs): + attr_dict = {} + for attr in attrs: + attr_dict[attr[0]] = attr[1] + return attr_dict + +def save_python_object(obj, save_path): + with open(save_path, 'wb') as file: + pickle.dump(obj, file) + +def load_python_object(path): + with open(path, 'rb') as file: + return pickle.load(file) + +def delete_n(string): + while '\n' in string: + string = string.replace('\n', ' ') + return string + +def remove_additional_blank(string): + words = string.split(' ') + string = '' + for word in words: + if word is not '': + string += word + ' ' + return string[:-1] + +def formal_text(text): + text = delete_n(text) + text = remove_additional_blank(text) + return text + +def float2str(f, precision=2): + f = str(f) + f_base = f[:f.find('.') + precision] + return f_base + +# ========== time realted operation ========== # + +def str_day(): + day = time.strftime("%Y-%m-%d", time.localtime()) + return day + +def time2str(t): + localtime = time.localtime(int(t)) + return str_time(localtime) + +def str_time(local_time = None): + if local_time is None: + local_time = time.localtime() + day = time.strftime("%Y-%m-%d-%Hh-%Mm-%Ss)", local_time) + return day + +if __name__ == '__main__': + print(str_day()) diff --git a/main.py b/main.py new file mode 100644 index 0000000..f057d0c --- /dev/null +++ b/main.py @@ -0,0 +1,40 @@ +import arxiv_bot +import email_sender +import subscriber_utils +import feeds +from lib import utils +import os +from lib import service +from lib.console import console +from lib import screen + +subscribe_manager = subscriber_utils.subscribe_manager() +# subscribe_manager.load() + +bot = arxiv_bot.arxiv_bot(subscribe_manager.get_subscribed_topics()) +feeds_generator = feeds.feed_manager(subscribe_manager, bot) +emailer = email_sender.arxiv_emailer(bot, feeds_generator, 
debug=False) + +log_screen = screen.VirtualScreen() +manager = service.ServiceManager(output=log_screen) + +daily_mail_service = service.service( + emailer.send_daily_email, + cron='0 4 * * 1-5', + name = 'send daily email' +) +manager.add(daily_mail_service) + +shell = console('ArxivBot') +def command_load(args): + if args == 'subscriber': + subscribe_manager.load() + +shell.regist('load', command_load, help_info='load config. (only subscriber supported till now)') +service_shell = service.get_service_console(manager, 'ServiceManager') +shell.regist('service', service_shell, help_info='service mamager') + +# cron time: +# min hour day month week +manager.start() +shell.interactive() diff --git a/subscriber_utils.py b/subscriber_utils.py new file mode 100644 index 0000000..3e12fb7 --- /dev/null +++ b/subscriber_utils.py @@ -0,0 +1,92 @@ +from lib.parser import dom_node, simple_parser + + +class subscribe_manager(): + def __init__(self, subscriber_config = './config/subscriber.xml'): + self.subscriber_config = None + self.subscribers = {} + if subscriber_config is not None: + self.subscriber_config = subscriber_config + self.load() + + def show(self): + if self.subscribers is None: + print('No subscriber found!') + else: + for name in self.subscribers: + print('Name:', name, 'Email:', self.subscribers[name]['email']) + + def load(self, path=None): + if path is None: + path = self.subscriber_config + if path is None: + return None + tree = None + with open(path, 'r') as f: + xml = f.read() + parser = simple_parser() + parser.feed(xml) + tree = parser.root + subscribers = {} + if tree is not None: + for person in tree.childs: + person_name = None + person_email = None + person_topics = [] + person_keywords = [] + for item in person.childs: + if item.name == 'name': + person_name = item.data + elif item.name == 'email': + person_email = item.data + elif item.name == 'topics': + for topic in item.childs: + if topic.name == 'topic': + person_topics.append(topic.data) + 
elif item.name == 'keywords': + for keyword in item.childs: + if keyword.name == 'keyword': + person_keywords.append(keyword.data) + if person_name is not None and person_email is not None and person_topics is not None: + subscriber = {} + subscriber['keywords'] = person_keywords + subscriber['email'] = person_email + subscriber['topics'] = person_topics + subscribers[person_name] = subscriber + self.subscribers = subscribers + print('Subscriber load success! All subscribers are shown below:') + self.show(); + + def get_subscribed_topics(self): + topics = [] + for name in self.subscribers: + subscriber = self.subscribers[name] + topics += subscriber['topics'] + topics = set(topics) + return topics + + def get_subscribed_keywords(self): + keywords = [] + for name in self.subscribers: + keywords += self.subscribers[name]['keywords'] + keywords = set(keywords) + return keywords + + def get_keywords_of_topics(self): + keywords_of_topics = {} + for name in self.subscribers: + subscriber = self.subscribers[name] + topic_group = subscriber['topics'] + for topic in topic_group: + if topic not in keywords_of_topics: + keywords_of_topics[topic] = [] + keywords_of_topics[topic] += subscriber['keywords'] + return keywords_of_topics + + +if __name__ == '__main__': + manager = subscribe_manager() + print(manager.subscribers) + print(manager.get_subscribed_topics()) + print(manager.get_subscribed_keywords()) + print(manager.get_keywords_of_topics()) \ No newline at end of file diff --git a/try.py b/try.py new file mode 100644 index 0000000..809cc6e --- /dev/null +++ b/try.py @@ -0,0 +1,21 @@ +import arxiv_service +import time + +now = arxiv_service.cron_time(time.localtime(time.time())) +# now.show() +# while True: +# now.next_day() +# now.show() + +# running time +# minute hour day month week year +# * means always. +# a-b means from a to b (a and b included) +# a means run at this time. +# must match all to execute a command. 
+ +schedule = arxiv_service.cron_expr('0 0 29 2 * *') +for i in range(10): + now = schedule.next_run(now) + now.show() + print(now.to_struct_time()) \ No newline at end of file