initial commit

This commit is contained in:
mingyang 2019-12-23 12:42:31 +08:00
parent 30458acfd8
commit 1ac6d0bb9c
22 changed files with 2126 additions and 1 deletions

7
.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
*.pyc
__pycache__/
cache/
feeds/
config/email_session.xml
config/subscriber.xml
config.py

View File

@ -1,2 +1,22 @@
# ArxivRobot
# What is This?
This is a naive and simple arxiv robot, it will fetch today's updated papers from arxiv in specified topic, filter the papers by given keywords, and send the result to a given email address.
# What do I need to do to run this code?
## package requirements:
It seems you only need to install croniter: ```pip install croniter``` will do it.
## configuration:
1. create config.py, a sample is given in config-examples.py
2. create config/subscriber.xml, a sample is also given in /config/subscriber_example.xml
## run this code.
```python main.py```
If everything goes okay, you will see a shell interface, type help for more information.
# PS
I am really not a great coder, and not good at writing documentation and comments. If you run into any trouble, feel free to open an issue and I will try my best to fix the problem.
The code was pushed in a hurry; I will add documentation explaining it when I have free time.

65
analysis_paper.py Normal file
View File

@ -0,0 +1,65 @@
# Offline analysis script: counts how many papers each author published in the
# yearly feed dumps under ./feeds/ and prints authors ordered by paper count
# (most prolific first).
from arxiv_spider import arxiv_paper
import utils
import numpy as np

paper_counts = {}
for year in ['2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']:
    print('Analysising year:', year)
    for paper in utils.load_python_object('./feeds/' + year):
        for raw_author in paper.info['authors']:
            cleaned = utils.delete_n(raw_author)
            paper_counts[cleaned] = paper_counts.get(cleaned, 0) + 1

# Sort by frequency via argsort, then walk the index array backwards so the
# highest counts print first. encode('utf-8') avoids console codec errors on
# non-ASCII author names.
names = list(paper_counts.keys())
freq = np.asarray([paper_counts[name] for name in names], dtype=np.int32)
order = np.argsort(freq)
total = len(names)
for rank in range(total):
    aid = order[total - rank - 1]
    print('Name: {0} | papers: {1}'.format(names[aid], freq[aid]).encode('utf-8'))

# NOTE: a near-identical keyword-frequency analysis was prototyped here as
# well: it mirrored the author loop above, but counted tokens from
# paper.info['title'].split(' ') normalized with utils.delete_n(...).lower(),
# and printed 'Keyword: {0} | papers: {1}' lines instead.

85
arxiv_bot.py Normal file
View File

@ -0,0 +1,85 @@
import arxiv_spider
import os
import time
from lib import utils
# cache tree:
# cache_root
# - topic-caches
# - feed_$(time).arxiv_feed
# - feed_year_$(year).arxiv_feed
class arxiv_bot():
    """Downloads and caches daily arxiv feeds for a set of topics.

    Cache tree on disk:
        cache_root/
            <topic>/
                feed_$(time).arxiv_feed
                feed_year_$(year).arxiv_feed
    """
    def __init__(self, topics, cache_dir='./cache', arxiv_site='https://arxiv.org', log=False):
        self.log = log
        self.site = arxiv_site
        self.topics = []
        self.spiders = {}        # topic -> arxiv_spider instance
        self.cache_dir = cache_dir
        self.topic_caches = {}   # topic -> cache directory path
        # Daily-feed memo used by get_interested_paper(). These attributes
        # were previously never initialized, so the very first call to
        # get_interested_paper() raised AttributeError.
        self.today = None
        self.today_feed = None
        if not os.path.isdir(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.update_topics(topics)

    def update_topics(self, topics):
        """Add any new topics: create their cache dirs and spiders."""
        for topic in topics:
            if topic not in self.topics:
                self.topics.append(topic)
                if self.log:
                    print('Adding topic {0}.'.format(topic))
                topic_cache = os.path.join(self.cache_dir, topic)
                self.topic_caches[topic] = topic_cache
                if not os.path.isdir(topic_cache):
                    if self.log:
                        print('creating topic dir:', topic_cache)
                    os.makedirs(topic_cache)
                self.spiders[topic] = arxiv_spider.arxiv_spider(topic, self.site)

    # load feed if it is already downloaded. If not, use spiders to get today's feed.
    def get_today_feed(self):
        """Return {topic: feed}, reading from the per-day cache when present.

        On a cache miss the spider is asked for today's papers, each paper's
        abstract is downloaded, and the result is pickled into the cache so
        later calls the same day avoid the network.
        """
        today_feed = {}
        today = utils.str_day()
        for topic in self.topics:
            today_feed_name = 'feed_' + today + '.arxiv_daily_feed'
            today_feed_path = os.path.join(self.cache_dir, topic, today_feed_name)
            cache_dir = self.topic_caches[topic]
            topic_feed = None
            if os.path.exists(today_feed_path):
                topic_feed = utils.load_python_object(today_feed_path)
            else:
                topic_feed = self.spiders[topic].get_today_paper()
                print('Fetching topic {0} papers...'.format(topic))
                for paper in topic_feed:
                    if self.log:
                        print('download abstract for paper', paper.info['title'])
                    paper.download_abstract()
                utils.save_python_object(topic_feed, today_feed_path)
            today_feed[topic] = topic_feed
        return today_feed

    def get_interested_paper(self, topic, keywords):
        """Split today's papers for `topic` into (strong, weak) keyword matches.

        Strong match: a keyword occurs in the title (case-insensitive);
        weak match: a keyword occurs only in the abstract. Keywords are
        expected to already be lowercase.
        """
        # Refresh the memoized feed when unset or the day has changed.
        # Previously this compared day strings with `is not`, which tests
        # object identity and only worked by interning accident; `!=` is the
        # correct value comparison.
        if self.today_feed is None or utils.str_day() != self.today:
            self.today_feed = self.get_today_feed()
            self.today = utils.str_day()
            print('Updating daily feed.')
        topic_feed = self.today_feed[topic]
        topic_papers = []
        # NOTE(review): get_today_feed() stores the flat list returned by
        # arxiv_spider.get_today_paper(), but this loop indexes topic_feed as
        # a day-keyed dict (the shape get_today_paper_backup() used) —
        # looks like stale code; confirm before relying on this path.
        for day in topic_feed:
            topic_papers += topic_feed[day]
        strong = []
        weak = []
        for paper in topic_papers:
            strong_match = False
            weak_match = False
            for keyword in keywords:
                if paper.info['title'].lower().find(keyword) != -1:
                    strong_match = True
                    break
                elif paper.info['abstract'].lower().find(keyword) != -1:
                    weak_match = True
            if strong_match:
                strong.append(paper)
            elif weak_match:
                weak.append(paper)
        return strong, weak

31
arxiv_service.py Normal file
View File

@ -0,0 +1,31 @@
import arxiv_bot
import feeds
import email_sender
import time
import subscriber_utils
import utils
from croniter import croniter
import threading
class test_service():
    """Trivial scheduler job used for testing: do() just logs that it ran."""

    def __init__(self, name):
        # Label included in the log line so concurrent jobs can be told apart.
        self.name = name

    def do(self):
        print('Job {0} run!'.format(self.name))
class mail_service():
    """Scheduler job that delivers the daily digest through the given emailer."""

    def __init__(self, emailer):
        # Any object exposing send_daily_email() works here (arxiv_emailer in
        # practice).
        self.emailer = emailer

    def do(self):
        self.emailer.send_daily_email()
class reload_subscriber():
    """Scheduler job that re-reads the subscriber configuration from disk."""

    def __init__(self, subscriber_mgr):
        # Manager exposing load(); invoked on every run so config edits are
        # picked up without restarting the service.
        self.mgr = subscriber_mgr

    def do(self):
        self.mgr.load()

369
arxiv_spider.py Normal file
View File

@ -0,0 +1,369 @@
import requests
import pickle
import time
from lib import utils
from lib.parser import dom_node, simple_parser
import socket
import socks
# Optional SOCKS5 proxy: when enabled, monkey-patch the global socket factory
# so every outgoing request in this module is tunneled through the local
# proxy. Flip use_proxy to True and adjust host/port as needed.
use_proxy = False
if use_proxy:
    SOCKS5_PROXY_HOST = '127.0.0.1'
    SOCKS5_PROXY_PORT = 1080
    default_socket = socket.socket  # keep a reference to the original factory
    socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
    socket.socket = socks.socksocket
class arxiv_paper():
    """One arxiv paper: its id plus a metadata dict.

    info keys: 'title', 'authors' (list of str), 'comments', 'subjects',
    'abstract' (None until supplied by the feed or download_abstract()).
    """
    def __init__(self, arxiv_id = None, paper_info = None):
        self.arxiv_id = arxiv_id
        self.info = paper_info

    def add_author(self, author):
        """Append one author name to the metadata.

        Fixed: previously appended the undefined name `authors`, which raised
        NameError on every call.
        """
        self.info['authors'].append(author)

    def title(self):
        """Return the paper title from the metadata dict."""
        return self.info['title']

    def describe(self):
        """Build a human-readable, multi-line summary of all known metadata."""
        information = ''
        information += 'ID: {0} (https://arxiv.org/abs/{0})\n'.format(self.arxiv_id)
        for key in self.info:
            if self.info[key] is not None:
                info = utils.formal_text(self.info[key])
                information += ('\t' + key + ':' + str(info) + '\n')
        return information

    def show(self):
        """Print the describe() summary to stdout."""
        print(self.describe())

    def to_html(self):
        """Render this paper as a <paper-section> HTML fragment.

        The custom element names (paper-title, paper-authors, ...) are styled
        by config/style.css in the outgoing email.
        """
        dom_tree = dom_node(name = 'paper-section')
        paper_title = None
        paper_link = None
        paper_authors = None
        paper_comments = None
        paper_subjects = None
        paper_abstract = None
        for key in self.info:
            if self.info[key] is not None:
                if key == 'title':
                    # Title links to the abstract page; a separate node links
                    # to the pdf.
                    paper_title = dom_node('paper-title')
                    link_attr = {'href':'https://arxiv.org/abs/{0}'.format(self.arxiv_id)}
                    link_node = dom_node('a', link_attr)
                    link_node.data = self.info[key]
                    paper_title.add_child(link_node)
                    paper_link = dom_node('paper-pdf-link')
                    pdf_link_attr = {'href':'https://arxiv.org/pdf/{0}'.format(self.arxiv_id)}
                    pdf_link = dom_node('a', pdf_link_attr)
                    pdf_link.data = '{0} | [pdf]'.format(self.arxiv_id)
                    paper_link.add_child(pdf_link)
                elif key == 'authors':
                    paper_authors = dom_node('paper-authors')
                    # Comma-separate the author list; trim the trailing ", ".
                    authors_string = ''
                    for author in self.info[key]:
                        authors_string += author + ', '
                    authors_string = authors_string[:-2]
                    paper_authors.data = authors_string
                elif key == 'comments':
                    paper_comments = dom_node('paper-comments')
                    paper_comments.data = self.info[key]
                elif key == 'subjects':
                    paper_subjects = dom_node('paper-subjects')
                    paper_subjects.data = self.info[key]
                elif key == 'abstract':
                    paper_abstract = dom_node('paper-abstract')
                    paper_abstract.data = self.info[key]
        dom_tree.add_child(paper_title)
        dom_tree.add_child(paper_link)
        dom_tree.add_child(paper_authors)
        dom_tree.add_child(paper_abstract)
        dom_tree.add_child(paper_comments)
        dom_tree.add_child(paper_subjects)
        html = dom_tree.to_string()
        return html

    def download_abstract(self, forcemode=False):
        """Fetch the abstract from the paper's /abs page.

        Skips the network round-trip when the abstract is already present,
        unless forcemode is True. The abstract is read from the page's
        og:description meta tag.
        """
        if not forcemode:
            if self.info['abstract'] is not None:
                return
        r = requests.get('https://arxiv.org/abs/' + self.arxiv_id)
        parser = simple_parser()
        parser.feed(r.text)
        tree = parser.root
        meta_nodes = tree.search('meta')
        for meta_node in meta_nodes:
            meta_attr = meta_node.attributes
            if 'property' in meta_attr:
                if meta_attr['property'] == 'og:description':
                    self.info['abstract'] = utils.formal_text(meta_attr['content'])
                    return
class arxiv_list_parser():
    """Parses an arxiv /list/ HTML listing page.

    The listing markup pairs one <dt> node (arxiv id + links) with one
    <dd> node (title, authors, comments, subjects) per paper.
    """
    def __init__(self, html_page):
        self.html_page = html_page
        self.parser = simple_parser()
        self.parser.feed(html_page)
        self.tree = self.parser.root

    def get_arxiv_id(self, dt_node):
        """Extract the arxiv id from a <dt> node, or None when it is empty."""
        if len(dt_node.childs) == 0:
            return None
        else:
            # The link's href ends with the id; assumes childs[1].childs[0]
            # is the abstract <a> node -- TODO confirm against lib.parser.
            arxiv_id = dt_node.childs[1].childs[0].attributes['href']
            arxiv_id = arxiv_id.split('/')[-1]
            return arxiv_id

    def get_paper_info(self, dd_node):
        """Build the paper-info dict from a <dd> node, or None when empty.

        'abstract' is always None here: listing pages do not carry it, so it
        is filled in later by arxiv_paper.download_abstract().
        """
        title = None
        authors = []
        comments = None
        subjects = None
        if len(dd_node.childs) == 0:
            return None
        else:
            elements = dd_node.childs[0].childs
            for element in elements:
                # Each metadata line is identified by its CSS class.
                if 'class' in element.attributes:
                    element_class = element.attributes['class']
                    if element_class == 'list-title mathjax':
                        title = utils.formal_text(element.data)
                    elif element_class == 'list-authors':
                        for child in element.childs:
                            if child.name == 'a':
                                authors.append(utils.formal_text(child.data))
                    elif element_class == 'list-comments mathjax':
                        comments = utils.formal_text(element.data)
                    elif element_class == 'list-subjects':
                        subjects = utils.formal_text(element.data)
            paper_info = {
                'title':title,
                'authors':authors,
                'comments':comments,
                'subjects':subjects,
                'abstract':None
            }
            return paper_info

    def get_papers(self):
        """Pair up <dt>/<dd> nodes and return a list of arxiv_paper objects,
        skipping any pair whose id or info could not be parsed."""
        dts = self.tree.search('dt')
        dds = self.tree.search('dd')
        papers = []
        for dt, dd in zip(dts, dds):
            arxiv_id = self.get_arxiv_id(dt)
            if arxiv_id == None:
                continue;
            paper_info = self.get_paper_info(dd)
            if paper_info == None:
                continue;
            paper = arxiv_paper(arxiv_id, paper_info)
            papers.append(paper)
        return papers

    def get_paper_num(self):
        """Return the total entry count: the first numeric token in the page's
        <small> header text."""
        totally_paper_node = self.tree.search('small')[0].data
        total_num_split = totally_paper_node.split(' ')
        num_total = 0
        for split in total_num_split:
            if split.isdigit():
                num_total = int(split)
                break;
        return num_total

    def get_recent_info(self):
        """Map day label -> {'start': first paper index, 'num': paper count}
        for a /recent page, derived from the per-day navigation links."""
        # get each day start id and day_name
        day_name = []
        day_start = []
        li_nodes = self.tree.search('ul')[0].childs
        for li in li_nodes:
            link = li.childs[0].attributes['href']
            start = None
            if link.find('#item') != -1:
                # Same-page anchor like "#item25": the index follows "item".
                start = link.split('#')[-1][4:]
            else:
                # Paginated link like "...?skip=25&show=25": take the skip value.
                start = link.split('=')[-2].split('&')[0]
            day_name.append(li.childs[0].data)
            day_start.append(int(start))
        # get total paper num
        num_total = self.get_paper_num()
        # get each day num: difference between consecutive start indices; the
        # last day runs to the total count.
        num_days = len(day_start)
        day_num = []
        for i in range(num_days):
            if i < num_days - 1:
                day_num.append(day_start[i+1] - day_start[i])
            else:
                day_num.append(num_total - day_start[i])
        # generate final info.
        recent_papers_info = {}
        for day, start, num in zip(day_name, day_start, day_num):
            current_day_info = {}
            current_day_info['start'] = start
            current_day_info['num'] = num
            recent_papers_info[day] = current_day_info
        return recent_papers_info
class arxiv_spider():
    """Fetches papers for one arxiv topic (e.g. cs.CV) via listing pages,
    the export API, and the RSS feed."""
    def __init__(self, topic, arxiv_url = 'https://arxiv.org'):
        self.link = arxiv_url
        self.topic = topic
        self.base_url = self.link + '/list/' + self.topic

    def get_yearly_papers(self, year, log=False):
        """Download every paper listed for `year`: one request to read the
        total count, then one request showing all entries at once."""
        yearly_url = self.base_url + '/' + year
        if log:
            print('visiting url [{0}] for basic information'.format(yearly_url))
        r = requests.get(yearly_url)
        list_parser = arxiv_list_parser(r.text)
        total_num = list_parser.get_paper_num()
        print('Total Number for this year:', total_num)
        yearly_url_all = yearly_url + '?skip={0}&show={1}'.format(0, total_num)
        if log:
            print('visiting url [{0}] for all papers'.format(yearly_url_all))
        r = requests.get(yearly_url_all)
        list_parser = arxiv_list_parser(r.text)
        yearly_papers = list_parser.get_papers()
        return yearly_papers

    # papers:
    # papers = {
    #     'key is day string': [content is a list of arxiv_paper class]
    # }
    def get_papers_on_search_list(self, search_url, log=True):
        """Parse an export.arxiv.org Atom query result into arxiv_paper
        objects (one <entry> node per paper)."""
        if log:
            print('visiting url [{0}] for today papers.'.format(search_url))
        search_content = requests.get(search_url)
        search_content = search_content.text
        parser = simple_parser()
        parser.feed(search_content)
        tree = parser.root
        paper_nodes = tree.search('entry')
        print('num_searched_nodes:', len(paper_nodes))
        papers = []
        for node in paper_nodes:
            arxiv_id = node.search('id')[0].data.split('/')[-1]
            title = node.search('title')[0].data
            author_nodes = node.search('name')
            authors = [item.data for item in author_nodes]
            category_nodes = node.search('category')
            categories = [item.attributes['term'] for item in category_nodes]
            # Join category terms into one comma-separated string.
            subjects = ''
            for cat in categories:
                subjects += cat + ','
            subjects = subjects[:-1]
            # arxiv:comment is optional; default to ''.
            comments_node = node.search('arxiv:comment')
            if len(comments_node) == 0:
                comments = ''
            else:
                comments = node.search('arxiv:comment')[0].data
            abstract = node.search('summary')[0].data
            title = utils.formal_text(title)
            subjects = utils.formal_text(subjects)
            comments = utils.formal_text(comments)
            abstract = utils.formal_text(abstract)
            paper_info = {
                'title':title,
                'authors':authors,
                'comments':comments,
                'subjects':subjects,
                'abstract':abstract
            }
            paper = arxiv_paper(arxiv_id, paper_info)
            papers.append(paper)
        return papers

    def get_papers_by_ids(self, ids, log=True):
        """Query the export API for the given ids, batched 10 at a time."""
        # Ceiling division by 10 via a float offset (0 ids -> 0 groups,
        # 11 ids -> 2 groups).
        num_groups = int((len(ids) + 9.1)/10)
        if log:
            print('spliting into {0} groups.'.format(num_groups))
        papers = []
        for i in range(num_groups):
            this_batch = ids[i * 10:(i+1)*10]
            # Build the comma-separated id_list parameter; trim trailing ','.
            id_list = ''
            for paper_id in this_batch:
                id_list += paper_id + ','
            id_list = id_list[:-1]
            search_url = 'http://export.arxiv.org/api/query?id_list=' + id_list
            batch_papers = self.get_papers_on_search_list(search_url, log)
            papers += batch_papers
        return papers

    def get_today_ids(self, log=True):
        """Read today's paper ids for this topic from the arxiv RSS feed."""
        rss_url = 'http://export.arxiv.org/rss/{0}'.format(self.topic)
        if log:
            print('visiting url [{0}] for today papers id.'.format(rss_url))
        rss_content = requests.get(rss_url)
        rss_content = rss_content.text
        parser = simple_parser()
        parser.feed(rss_content)
        rss = parser.root
        id_nodes = rss.search('rdf:li')
        paper_ids = []
        for node in id_nodes:
            # The rdf:resource URL ends with the arxiv id.
            paper_link = node.attributes['rdf:resource']
            paper_id = paper_link.split('/')[-1]
            paper_ids.append(paper_id)
        print('num_paper_ids:', len(paper_ids))
        return paper_ids

    def get_today_paper(self, return_day_name=False, log=True):
        """Return today's papers as a flat list (RSS ids -> API query).

        NOTE(review): return_day_name is accepted but unused here; only
        get_today_paper_backup() honors it.
        """
        today_ids = self.get_today_ids(log)
        papers = self.get_papers_by_ids(today_ids)
        print('num of papers:', len(papers))
        return papers

    def get_today_paper_backup(self, return_day_name=False):
        """Older listing-based path: take the most recent day from
        get_recent_papers() and return its papers (optionally with the
        day label)."""
        papers = self.get_recent_papers(recent_days=[1])
        today = None
        paper = None
        for day in papers:
            today = day
            paper = papers[day]
        if return_day_name:
            return paper, today
        else:
            return paper

    def get_recent_papers(self, recent_days=[1, 2, 3, 4, 5], log=False):
        """Scrape the /recent listing and return {day label: [papers]} for
        the requested day indices (1 = most recent).

        NOTE(review): recent_days has a mutable default; harmless here since
        it is never modified, but worth confirming.
        """
        recent_url = self.base_url + '/recent'
        if log:
            print('visiting url [{0}] for basic information'.format(recent_url))
        r = requests.get(recent_url)
        list_parser = arxiv_list_parser(r.text)
        recent_papers_info = list_parser.get_recent_info()
        print('paper info:', recent_papers_info)
        day_id = 1
        papers = {}
        for day in recent_papers_info:
            if day_id in recent_days:
                today_start = recent_papers_info[day]['start']
                today_num = recent_papers_info[day]['num']
                page_url = '/pastweek?skip={0}&show={1}'.format(today_start, today_num)
                day_url = self.base_url + page_url
                if log:
                    print('visiting url [{0}] for paper on day {1}'.format(day_url, day))
                r = requests.get(day_url)
                list_parser = arxiv_list_parser(r.text)
                today_papers = list_parser.get_papers()
                papers[day] = today_papers
            day_id += 1
        return papers

9
config-examples.py Normal file
View File

@ -0,0 +1,9 @@
#### email related config ####
username = 'email@email.com' # send email using this account
password = 'yourpassword' # your email login password
sender_name = 'ArxivRobot' # the name of your robot
replyto = 'yourmail@mail.com' # all reply email will be forwarded to this email address
smtp_ssl_addr = 'smtp.smtp.com'
# smtp server, only ssl supported. you can support more by editing function send_mail in email_sender.py

109
config/style.css Normal file
View File

@ -0,0 +1,109 @@
<head>
<style type="text/css">
/*
 * Inline stylesheet prepended to every digest email (loaded by
 * feed_manager.update_style). The custom element names (paper-group,
 * paper-section, paper-title, ...) match the tags emitted by
 * arxiv_paper.to_html().
 */
paper-group
{
    float: left;
    width: 100%;
    padding-top: 10px;
    padding-bottom: 10px;
    text-align: center;
    color: white;
    background-color: #3f3f3f;
    font-family:Arial,Helvetica,sans-serif;
    font-size: 2.0em;
    border-radius:10px;
}
/*abstract
{
    color:black;
    text-align:center;
    background-color:#d0e4fe;
}
*/
paper-section {
    float: left;
    margin-top: 20px;
    margin-bottom: 20px;
    margin-left: 10%;
    margin-right: 10%;
    width: 80%;
    /*border: 1px solid green;
    background-color: lightgrey;*/
}
paper-title {
    float: left;
    width: 100%;
    /*border: 1px solid green;
    background-color: lightgrey;*/
    font-family:Arial,sans-serif;
    font-size: 1.5em;
    font-weight: bold;
    color: black;
}
paper-authors {
    float: left;
    width: 90%;
    /*border: 1px solid green;
    background-color: lightgrey;*/
    font-family:Arial,sans-serif;
    font-size: 1.0em;
    color: #900;
}
paper-pdf-link {
    float: right;
    width: 10%;
    text-align: right;
    /*border: 1px solid green;
    background-color: lightgrey;*/
    font-family:Arial,sans-serif;
    font-size: 1.0em;
    color: #900;
}
paper-abstract {
    float: left;
    width: 100%;
    padding-top: 10px;
    padding-bottom: 10px;
    padding-left: 10px;
    padding-right: 10px;
    background-color: #EEFFEE;
    font-family:Arial,sans-serif;
    font-size: 1.2em;
    color: black;
    text-align: justify;
    border-radius:10px;
}
paper-comments {
    float: left;
    width: 50%;
    font-family:Arial,sans-serif;
    font-size: 1.0em;
    color: black;
    word-break:break-all;
}
paper-subjects {
    float: right;
    text-align: right;
    width: 50%;
    font-family:Arial,sans-serif;
    font-size: 1.0em;
    color: black;
    word-break:break-all;
}
a:link { text-decoration: none}
a:active { text-decoration:blink}
a:hover { text-decoration:underline}
a:visited { text-decoration: none}
a {
    color: black;
}
</style>
</head>

View File

@ -0,0 +1,27 @@
<subscriber>
<name>name1</name>
<email>mail1@mail.com</email>
<topics>
<topic>cs.CV</topic>
<topic>cs.LG</topic>
<topic>stat.ML</topic>
</topics>
<keywords>
<keyword>keyword1</keyword>
<keyword>keyword2</keyword>
</keywords>
</subscriber>
<subscriber>
<name>name2</name>
<email>mail2@mail.com</email>
<topics>
<topic>cs.CV</topic>
<topic>cs.LG</topic>
<topic>stat.ML</topic>
</topics>
<keywords>
<keyword>keyword1</keyword>
<keyword>keyword2</keyword>
</keywords>
</subscriber>

10
download_html.py Normal file
View File

@ -0,0 +1,10 @@
# Throwaway debugging helper: dump the raw HTML of an arxiv listing page to
# stdout so the markup can be inspected while tuning lib/parser.
import requests
from html.parser import HTMLParser
# Alternative URLs kept for quick switching while debugging:
# r = requests.get('https://arxiv.org/list/cs.CV/recent')
r = requests.get('https://arxiv.org/list/cs.CV/recent')
# r = requests.get('http://xxx.itp.ac.cn/list/cs.CV/recent')
# r = requests.get('https://arxiv.org/list/cs.CV/pastweek?skip=25&show=25')
# print(r.status_code)
print(r.text)

138
email_sender.py Normal file
View File

@ -0,0 +1,138 @@
from lib.parser import dom_node, simple_parser
from lib import parser
from lib import utils
import os
import config
import smtplib
import email
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email.header import Header
from email import generator
def send_mail(reciver, title, content):
    """Send an HTML email through the configured SMTP-over-SSL account.

    Args:
        reciver: destination email address.
        title: subject line.
        content: HTML body.

    Returns:
        True when the message was handed to the SMTP server, False on any
        failure. (Previously the function fell through to `return True` after
        every except handler, so callers recorded failed sends as delivered.)
    """
    username = config.username
    password = config.password
    replyto = config.replyto
    msg = MIMEMultipart('alternative')
    msg['Subject'] = Header(title)
    msg['From'] = '%s <%s>' % (Header(config.sender_name), username)
    msg['To'] = reciver
    msg['Reply-to'] = replyto
    msg['Message-id'] = email.utils.make_msgid()
    msg['Date'] = email.utils.formatdate()
    texthtml = MIMEText(content, _subtype='html', _charset='UTF-8')
    msg.attach(texthtml)
    try:
        client = smtplib.SMTP_SSL(config.smtp_ssl_addr)
        client.set_debuglevel(0)
        client.login(username, password)
        client.sendmail(username, reciver, msg.as_string())
        client.quit()
        print('Email send to {0} success!'.format(reciver))
        return True
    except smtplib.SMTPConnectError as e:
        print('Connection Error:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPAuthenticationError as e:
        print('Authentication Error:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPSenderRefused as e:
        print('Sender Refused:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPRecipientsRefused as e:
        # SMTPRecipientsRefused has no smtp_code/smtp_error attributes;
        # it carries a per-recipient error dict instead.
        print('SMTPRecipients Refused:', e.recipients)
    except smtplib.SMTPDataError as e:
        print('Data Error:', e.smtp_code, e.smtp_error)
    except smtplib.SMTPException as e:
        # SMTPException has no `.message` attribute in Python 3; use str(e).
        print('SMTPException:', str(e))
    except Exception as e:
        print('Unknown error:', str(e))
    # Reached only when an exception was handled above.
    return False
class arxiv_emailer():
    """Sends each subscriber at most one digest email per day.

    Per-user send state ("sessions") is persisted as XML so restarting the
    service does not re-send the day's digest.
    """
    def __init__(self, arxiv_bot, feeds_generator, session_file = './config/email_session.xml', debug=False):
        # debug=True skips the actual SMTP send in send_daily_email().
        self.debug = debug
        self.email_info = dom_node()
        self.session_file = session_file
        # NOTE(review): stays None when session_file is None, which would make
        # the membership tests in send_daily_email() fail — confirm callers
        # always pass a session file.
        self.sessions = None
        if self.session_file is not None:
            self.load_session()
        self.bot = arxiv_bot
        self.feeds = feeds_generator

    def send_daily_email(self):
        """Generate today's per-user emails and send any not yet sent today."""
        emails = self.feeds.generate_daily_emails()
        today = utils.str_day()
        for name in emails:
            email = emails[name]
            send = False
            if name not in self.sessions:
                # First time we see this user: record today and send.
                print('New user found!')
                self.sessions[name] = {}
                self.sessions[name]['last-send'] = today
                send = True
            if self.sessions[name]['last-send'] != today:
                send = True
            if send:
                print('Sending email to user {0} [{1}]'.format(name, email['reciver']))
                print('reciver:', email['reciver'])
                print('title:', email['title'])
                print('content:', len(email['content']))
                # In debug mode success stays False, so the session is not
                # marked sent and the user will be retried.
                success = False
                if not self.debug:
                    success = send_mail(email['reciver'], email['title'], email['content'])
                if success:
                    self.sessions[name]['last-send'] = today
                    self.save_session()
            else:
                print('skipping user {0} since already sent!'.format(name))

    def load_session(self, session_file=None):
        """Read the session XML (default: self.session_file) into a dict and
        store it on self.sessions; returns the dict."""
        if session_file is None:
            session_file = self.session_file
        tree = None
        with open(session_file, 'r') as f:
            xml = f.read()
            xmlparser = simple_parser()
            xmlparser.feed(xml)
            tree = xmlparser.root
        sessions = parser.dom2dict(tree)
        # dom2dict wraps content under 'root'; an empty/any other shape resets
        # the sessions to {}.
        if 'root' in sessions:
            sessions = sessions['root']
        else:
            sessions = {}
        self.sessions = sessions
        print(self.sessions)
        return sessions

    def save_session(self, session_file=None):
        """Serialize self.sessions back to XML; returns the XML string, or
        None when no session file is configured."""
        if session_file is None:
            session_file = self.session_file
        if session_file is None:
            return None
        xml = parser.dict2dom(self.sessions).to_string()
        with open(session_file, 'w') as f:
            f.write(xml)
        return xml
if __name__ == '__main__':
    # Manual smoke test only. NOTE(review): session_file=None here means
    # load_session() is never called and self.sessions stays None, and
    # feeds/bot are None too — this path likely crashes in
    # send_daily_email(); confirm before relying on it.
    emailer = arxiv_emailer(None, None, None)
    emailer.send_daily_email()
    print(emailer.load_session())
    print(emailer.save_session())

103
feeds.py Normal file
View File

@ -0,0 +1,103 @@
from lib.parser import dom_node
from lib import utils
class feed_manager():
    """Builds per-subscriber daily HTML digest emails from the bot's feed."""
    def __init__(self, submgr, arxivbot, style='./config/style.css'):
        self.style_path = style
        # Inline <style> block prepended to every generated email body.
        self.style = ''
        self.bot = arxivbot
        self.submgr = submgr
        self.update_style()

    def update_style(self, path = None):
        """(Re)load the CSS/HTML style header from disk."""
        if path is None:
            path = self.style_path
        print('loading style from:', path)
        with open(path, 'r') as f:
            self.style = f.read()
            self.style += '\n'

    def fetch_today_feed(self):
        # Cache today's {topic: papers} mapping from the bot.
        self.today_feed = self.bot.get_today_feed()

    def filter_papers_for_user(self, subscriber):
        """Return (strong, weak) matches for one subscriber.

        Strong: a keyword occurs in the title (case-insensitive); weak: only
        in the abstract. Keywords are assumed to already be lowercase.
        Papers appearing under several subscribed topics are deduplicated by
        arxiv id first.
        """
        strong_papers = []
        weak_papers = []
        keywords = subscriber['keywords']
        papers = []
        for topic in subscriber['topics']:
            if topic in self.today_feed:
                papers += self.today_feed[topic]
            else:
                print('Warning: topic {0} is subscribed but not downloaded!'.format(topic))
        # Deduplicate cross-listed papers, keeping first occurrence order.
        known_ids = []
        unique_papers = []
        for paper in papers:
            paper_id = paper.arxiv_id
            if paper_id not in known_ids:
                unique_papers.append(paper)
                known_ids.append(paper_id)
        print('removing {0} repeated papers.'.format(len(papers) - len(unique_papers)))
        papers = unique_papers
        for paper in papers:
            strong = False
            weak = False
            for keyword in keywords:
                if paper.info['title'].lower().find(keyword) != -1:
                    strong = True
                    break;
                elif paper.info['abstract'].lower().find(keyword) != -1:
                    weak = True
            if strong:
                strong_papers.append(paper)
            elif weak:
                weak_papers.append(paper)
        return strong_papers, weak_papers

    def generate_group_feed(self, paper_groups):
        """Render {group title: [papers]} into HTML: a <paper-group> header
        followed by each paper's to_html() fragment."""
        group_html = ''
        for key in paper_groups:
            header = dom_node('paper-group')
            header.data = key
            group_html += header.to_string() + '\n'
            for paper in paper_groups[key]:
                group_html += paper.to_html() + '\n'
        return group_html

    def generate_daily_feed_by_matched_paper(self, strong_interested, weak_interested):
        """Build the HTML body; empty groups are omitted entirely."""
        feeds = {}
        if len(strong_interested) > 0:
            feeds['Strong Interested Paper'] = strong_interested
        if len(weak_interested) > 0:
            feeds['Weak Interested Paper'] = weak_interested
        xml_feed = self.generate_group_feed(feeds)
        return xml_feed

    def generate_daily_email_by_matched_paper(self, strong_interested, weak_interested):
        """Prepend the style header; returns '' when there are no matches so
        callers can skip the user."""
        xml_feed = self.generate_daily_feed_by_matched_paper(strong_interested, weak_interested)
        email_content = ''
        if xml_feed != '':
            email_content = self.style + xml_feed
        return email_content

    def generate_daily_emails(self):
        """Return {subscriber name: {'reciver', 'title', 'content'}} for every
        subscriber with at least one matching paper today."""
        self.fetch_today_feed()
        emails = {}
        # email is a dict, containing title, reciver and content.
        today = utils.str_day()
        for name in self.submgr.subscribers:
            subscriber = self.submgr.subscribers[name]
            strong, weak = self.filter_papers_for_user(subscriber)
            content = self.generate_daily_email_by_matched_paper(strong, weak)
            reciver = subscriber['email']
            if content == '':
                print('Skipping user {0} [{1}] since no paper matched.'.format(name, reciver))
                continue;
            title = "Your Interested Paper On Arxiv Today ({0})".format(today)
            email = {}
            email['reciver'] = reciver
            email['title'] = title
            email['content'] = content
            emails[name] = email
        return emails

304
lib/console.py Normal file
View File

@ -0,0 +1,304 @@
from . import utils
import os
import traceback
class console():
    """Interactive command console with alias support and nestable
    sub-consoles.

    Commands are registered with regist(); an action is either a callable
    taking the argument string, or another console instance (sub-console).
    """
    def __init__(self, name='base'):
        self.name = name
        self.hint = '$ '
        self.exit_cmd = ['exit', 'quit', 'bye']
        self.exit_info = 'Bye~'
        self.commands = {}   # command -> {'action': ..., 'help': ..., 'kind': ...}
        self.alias = {}      # alias -> command (may chain to another alias)
        self.warn_level = 4  # >= 3 prints registration warnings
        self.exit_flag = False
        self.debug = True
        self.platform = utils.detect_platform()
        self.is_child = False  # set True when registered inside another console
        self.father = None
        self.regist_internal_command()

    def get_hint(self):
        """Build the prompt string; colorized on Linux terminals."""
        if self.platform == 'Linux':
            hint = '\033[0;33m({0})\033[0;31m{1}\033[0m'.format(self.name, self.hint)
        else:
            hint = '({0}){1}'.format(self.name, self.hint)
        return hint

    def regist_internal_command(self):
        """Register the built-in 'sys' commands available in every console."""
        self.regist(
            'help',
            action=self.command_help,
            alias=['h'],
            help_info='display this help info.',
            kind='sys'
        )
        self.regist(
            'exit',
            action=self.command_exit_console,
            alias=['quit','bye'],
            help_info='exit current console.',
            kind='sys'
        )
        self.regist(
            'cls',
            action=self.command_clear_screen,
            alias=['clear', 'clc'],
            help_info='clear screen.',
            kind='sys'
        )
        self.regist(
            'alias',
            action=self.command_alias,
            help_info='display alias info or create new alias.',
            kind='sys'
        )
        self.regist(
            'os',
            action=self.command_os,
            help_info='run a system command.',
            kind='sys'
        )

    def translate_command(self, command):
        """Follow alias chains until an actual command name is reached."""
        while command in self.alias and command not in self.commands:
            command = self.alias[command]
        return command

    def find_equal_command(self, command, ret_type = str, ignored = []):
        """Collect every name (command + aliases, transitively) equivalent to
        `command`. Returns a CSV string when ret_type is str, else a list.

        NOTE(review): `ignored` is accepted but unused; kept for interface
        compatibility.
        """
        finished = []
        cmds = [command]
        while len(finished) != len(cmds):
            # find child (what this name aliases to)
            if command in self.alias:
                if self.alias[command] not in cmds:
                    cmds.append(self.alias[command])
            # find fathers (names aliased to this one)
            for al in self.alias:
                if self.alias[al] == command:
                    if al not in cmds:
                        cmds.append(al)
            # mark current name finished and pick the next unprocessed one.
            finished.append(command)
            for cmd in cmds:
                if cmd not in finished:
                    command = cmd
        if ret_type is str:
            finished = utils.list2csv(finished)
        return finished

    def get_alias(self, command, ret_type=str):
        """Return the direct aliases of `command` (CSV string or list)."""
        alias = []
        for al in self.alias:
            if self.alias[al] == command:
                alias.append(al)
        if ret_type is str:
            alias = utils.list2csv(alias)
        return alias

    def command_exist(self, command):
        """True when the name is a registered command or an alias."""
        if command in self.commands or command in self.alias:
            return True
        else:
            return False

    def add_alias(self, command, alias):
        """Map `alias` -> `command` unless the name is already taken."""
        if self.command_exist(alias):
            # Fixed: previously referenced undefined names `warn_level` and
            # `al`, raising NameError whenever an alias collided.
            if self.warn_level >= 3:
                print('Alias {0} will not be added since already used'.format(alias))
        else:
            self.alias[alias] = command

    # kind: standard or shared
    # standard: help info will be displayed
    # shared: help info will not be displayed in sub command.
    def regist(self, command, action, alias=None, help_info='no help provided.', kind='standard'):
        """Register `command` with its action, optional alias(es) and help."""
        if type(action) == console:
            # Sub-console: mark the parent/child relationship.
            action.is_child = True
            action.father = self
        exist = self.command_exist(command)
        if exist:
            if self.warn_level >= 3:
                print('Command {0} will not be added sinece already exist.'.format(command))
            return
        if type(alias) is list:
            for al in alias:
                self.add_alias(command, al)
        elif type(alias) is str:
            self.add_alias(command, alias)
        elif alias is None:
            pass
        else:
            if self.warn_level > 3:
                print('Unknown alias type, no alias will be added.')
        self.commands[command] = {}
        self.commands[command]['action'] = action
        self.commands[command]['help'] = help_info
        self.commands[command]['kind'] = kind

    def handle_command(self, command, args):
        """Run the action for a resolved command, reporting any exception."""
        if command in self.commands:
            act = self.commands[command]['action']
            try:
                act(args)
            except KeyboardInterrupt:
                pass
            except Exception:
                # Was a bare `except:`; narrowed so SystemExit and friends
                # still propagate.
                print('Exception occured while processing command \"{0} {1}\".'.format(command, args))
                print('More information are shown below.\n', traceback.format_exc())
        else:
            print('Unknown command \"{0}\"'.format(command))

    # seperate command and its args.
    def parse_command(self, string):
        """Split the input into (command, args) at the first run of blanks
        after the command token."""
        string += ' '
        length = len(string)
        command_end = 0
        parse_start = False
        for i in range(length):
            blank = utils.is_blank(string[i])
            if not blank:
                parse_start = True
            if parse_start and blank:
                command_end = i
                break
        command = string[:command_end]
        command = utils.remove_blank_in_endpoint(command)
        args = utils.remove_blank_in_endpoint(string[command_end:])
        return command, args

    def parse(self, string):
        """Split input and resolve alias chains, breaking on cycles."""
        command, args = self.parse_command(string)
        exitsted_commands = []
        while command in self.alias:
            if command not in exitsted_commands:
                exitsted_commands.append(command)
                command = self.alias[command]
                string = command + ' ' + args
                command, args = self.parse_command(string)
            else:
                # Alias cycle detected; stop resolving.
                break
        return command, args

    def show_help_info(self, command, prefix, indent, depth=0):
        """Print one command's help line(s), recursing into sub-consoles."""
        command = self.translate_command(command)
        action = self.commands[command]['action']
        kind = self.commands[command]['kind']
        if kind == 'sys' and depth > 0:
            # Hide the built-in commands inside sub-console listings.
            return
        alias = self.get_alias(command, ret_type=str)
        if alias != '':
            print('{0}{1}({2}):'.format(prefix, command, alias))
        else:
            print('{0}{1}:'.format(prefix, command))
        print('{0}{1}{2}'.format(prefix, indent, self.commands[command]['help']))
        if type(action) == console:
            action.command_help('', prefix=prefix+indent, indent=indent, depth=depth+1)

    def debug_log(self, command, args):
        if self.debug:
            print('command:[{0}] args:[{1}]'.format(command, args))

    def command_exit_console(self, args):
        """Leave the interactive loop; only the root console says goodbye."""
        if not self.is_child:
            print(self.exit_info)
        self.exit_flag = True

    def command_clear_screen(self, args):
        if self.platform == 'Windows':
            os.system('cls')
        elif self.platform == 'Linux':
            os.system('clear')
        return False

    def command_help(self, args, prefix = '', indent=' ', depth=0):
        """Show help for one command, or for all when no argument given."""
        command, args = self.parse_command(args)
        # Fixed: was `command is not ""` (identity test on a str literal).
        if command != "":
            if self.command_exist(command):
                self.show_help_info(command, prefix, indent, depth)
            else:
                print('Unknown command \"{0}\"'.format(command))
        else:
            for command in self.commands:
                self.show_help_info(command, prefix, indent, depth)

    def command_alias(self, args):
        """List aliases, inspect one name, or define/remove one.

        'a=b' defines alias a for b; 'a=' removes alias a; no args lists all.
        """
        alias_parse = args.split('=')
        if len(alias_parse) == 2:
            alias = utils.remove_blank_in_endpoint(alias_parse[0])
            command = utils.remove_blank_in_endpoint(alias_parse[1])
            # Fixed: was `command is not ''` (identity test on a str literal).
            if command != '':
                self.alias[alias] = command
            else:
                del self.alias[alias]
        elif args == '':
            for alias in self.alias:
                print('{0}={1}'.format(alias, self.alias[alias]))
        elif len(alias_parse) == 1:
            if args in self.alias:
                print('{0}={1}'.format(args, self.alias[args]))
                equal_alias = self.find_equal_command(args)
                if equal_alias != '':
                    print('Hint: {0} are all equivalent.'.format(equal_alias))
            elif args in self.commands:
                als = self.get_alias(args, ret_type=str)
                if als == '':
                    print('command {0} has no alias.'.format(args))
                else:
                    print('command {0} is aliased as {1}'.format(args, als))
                    equal_alias = self.find_equal_command(args)
                    if equal_alias != '' and equal_alias != args:
                        print('Hint: {0} are all equivalent.'.format(equal_alias))
            else:
                print('No alias \"{0}\" found.'.format(args))
        else:
            print('Syntax error, command not understood.')

    def command_os(self, args):
        if args == '':
            print('please specify os command')
        else:
            os.system(args)

    def execute(self, string):
        """Parse one input line and dispatch it (empty input is a no-op)."""
        command, args = self.parse(string)
        # Fixed: was `command is not ""` (identity test on a str literal).
        if command != "":
            self.handle_command(command, args)

    def __call__(self, args):
        """Run a single command line, or start interactive mode when empty.
        This is what makes a console usable as another console's action."""
        if args != '':
            self.execute(args)
        else:
            self.exit_flag = False
            self.interactive()

    def interactive(self):
        """Read-eval loop until an exit command sets exit_flag."""
        while not self.exit_flag:
            try:
                input_str = input(self.get_hint())
                self.execute(input_str)
            except KeyboardInterrupt:
                print('')
if __name__ == '__main__':
    # Smoke test: a root console with a sub-console nested two levels deep.
    root = console()
    child = console()
    grandchild = console()
    child.regist('test_subsubcommand', grandchild, alias='tss', help_info='A sub command.')
    root.regist('test_subcommand', child, alias='ts', help_info='A sub command.')
    root.interactive()

127
lib/parallel.py Normal file
View File

@ -0,0 +1,127 @@
import threading
import queue
import time
class Job():
    """A unit of work: a callable plus its arguments and, after run(),
    its result."""
    def __init__(self, func, args=None, kwargs=None, name=None):
        # BUG FIX: the original default args=[] / kwargs={} are mutable
        # defaults shared across every Job created without explicit
        # arguments; use None sentinels instead.
        if name is None:
            name = 'job'
        self.id = None          # assigned later by ParallelHost.commit()
        self.name = name
        self.func = func
        self.args = [] if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.results = None
    def run(self):
        """Execute the callable and store its return value in results."""
        self.results = self.func(*self.args, **self.kwargs)
    def set_name(self, name):
        self.name = name
    def set_id(self, jid):
        self.id = jid
    def __call__(self):
        self.run()
class Worker(threading.Thread):
    """Daemon thread that drains job objects from a work queue, runs
    each one, and pushes the finished job onto a results queue."""
    def __init__(self, work_queue, finished_queue):
        super(Worker, self).__init__()
        self.queue = work_queue
        self.finished = finished_queue
        self.terminate = False
        # Daemon thread: never keeps the interpreter alive at exit.
        self.daemon = True
    def stop(self):
        """Request exit; honoured after the current 1s get() times out."""
        self.terminate = True
    def run(self):
        while not self.terminate:
            try:
                job = self.queue.get(timeout=1)
                job.run()
                self.queue.task_done()
                self.finished.put(job)
            except queue.Empty:
                # Timed out waiting for work; re-check the terminate flag.
                pass
            except KeyboardInterrupt:
                print("you stop the threading")
class ParallelHost():
    """Thread-pool front-end: commit Job objects and collect their
    results by job id."""
    def __init__(self, num_threads=8):
        self.num_threads = num_threads
        self.workers = []
        self.tasks = queue.Queue()
        self.results = queue.Queue()
        self.rets = {}      # job id -> result, filled by collect_all()
        self.id = 0         # last job id handed out
        for _ in range(self.num_threads):
            worker = Worker(self.tasks, self.results)
            self.workers.append(worker)
        for worker in self.workers:
            worker.start()
    def __del__(self):
        self.stop('kill')
    # soft stop: wait until all job done
    # hard stop: stop even with unfinished job
    # kill stop: whatever the thread is doing, exit.
    def stop(self, mode='soft'):
        print('Trying to stop.')
        if mode == 'soft':
            self.tasks.join()
            print('All job finished.')
        for worker in self.workers:
            worker.stop()
            if mode == 'kill':
                worker.join(0.01)
    def commit(self, job):
        """Assign the next id to *job*, enqueue it, and return the id."""
        self.id += 1
        job.set_id(self.id)
        self.tasks.put(job)
        return self.id
    def add_job(self, func, args=None, kwargs=None, name=None):
        """Wrap *func* in a Job and commit it; returns the job id.

        BUG FIX: args/kwargs previously defaulted to a shared mutable
        [] / {} — replaced with None sentinels.
        """
        job = Job(func,
                  [] if args is None else args,
                  {} if kwargs is None else kwargs,
                  name)
        return self.commit(job)
    def collect_all(self):
        """Drain the finished queue into the rets cache."""
        while not self.results.empty():
            task = self.results.get()
            self.rets[task.id] = task.results
    def get_result(self, jid, block=False):
        """Return (and forget) the result of job *jid*.

        With block=True, waits until that job finishes; otherwise returns
        None when the result is not yet available.
        """
        if jid in self.rets:
            ret = self.rets[jid]
            del self.rets[jid]
            return ret
        while True:
            if self.results.empty() and not block:
                break
            task = self.results.get()
            # BUG FIX: jobs store their id in .id (see Job.set_id); the
            # original read task.jid and raised AttributeError here.
            if task.id == jid:
                return task.results
            else:
                self.rets[task.id] = task.results
    def clear_results(self):
        """Discard every pending and cached result."""
        while not self.results.empty():
            self.results.get()
        self.rets = {}
if __name__ == '__main__':
    host = ParallelHost()
    def loop_print(info, num):
        # Worker payload: print a counter once a second.
        for i in range(num):
            print(info + ':' + str(i))
            time.sleep(1)
    for i in range(10):
        host.add_job(loop_print, ["loop_print_{0}".format(i), 5])
    # BUG FIX: ParallelHost has no terminate() method — the intended hard
    # shutdown is stop('kill').
    host.stop('kill')

151
lib/parser.py Normal file
View File

@ -0,0 +1,151 @@
from html.parser import HTMLParser
from . import utils
def dict_to_arrtibute_string(attributes):
    """Render a dict as concatenated 'key="value";' pairs.

    (Function name kept as-is, misspelling included, for compatibility
    with existing callers.)
    """
    return ''.join('{0}=\"{1}\";'.format(key, attributes[key])
                   for key in attributes)
def attribute_string_to_dict(attrs):
    """Turn HTMLParser-style (name, value) pairs into a dict."""
    return {attr[0]: attr[1] for attr in attrs}
class dom_node():
    """Minimal DOM tree node: a tag name, attribute dict, child list and
    optional text data."""
    def __init__(self, name=None, attributes=None):
        # BUG FIX: attributes previously defaulted to a shared mutable {}
        # — every node created without explicit attributes aliased the
        # same dict.  Use a None sentinel instead.
        if name is not None:
            self.name = name
        else:
            self.name = 'Node'
        self.attributes = {} if attributes is None else attributes
        self.childs = []
        self.data = None
        self.father = None
    def add_child(self, child):
        """Append *child* and record this node as its parent."""
        if child is not None:
            child.father = self
            self.childs.append(child)
    def to_string(self, prefix='', indent=' '):
        """Serialize this subtree as indented XML-ish text."""
        string = prefix + '<' + self.name
        if self.attributes:
            string += ' ' + dict_to_arrtibute_string(self.attributes)
        string += '>\n'
        for child in self.childs:
            string += child.to_string(prefix=prefix+indent, indent=indent)
        if self.data is not None:
            string += prefix + indent + self.data + '\n'
        string += prefix + '</{0}>\n'.format(self.name)
        return string
    def has_child(self, name):
        """True if any direct child is named *name*."""
        for child in self.childs:
            if child.name == name:
                return True
        return False
    def search(self, name):
        """Depth-first search for nodes named *name* (a single tag or a
        list of tags); may include self."""
        found = []
        if type(name) is list:
            if self.name in name:
                found.append(self)
        else:
            if self.name == name:
                found.append(self)
        for child in self.childs:
            found += child.search(name)
        return found
def dict2dom(d, root_name='root'):
    """Build a dom_node tree from a (possibly nested) dict.

    Nested dicts become subtrees, lists become repeated <li> children,
    and every other value is stored as stringified text data.
    """
    node = dom_node(root_name)
    for key in d:
        value = d[key]
        key_name = str(key)
        if type(value) is dict:
            child = dict2dom(value, root_name=key_name)
        elif type(value) is list:
            child = dom_node(name=key_name)
            for item in value:
                if type(item) is dict:
                    child.add_child(dict2dom(item, root_name='li'))
                else:
                    li = dom_node('li')
                    li.data = str(item)
                    child.add_child(li)
        else:
            child = dom_node(name=key_name)
            child.data = str(value)
        node.add_child(child)
    return node
# if a dom node has data only, then it's {'name':'data'}
# if a dom node has childs, then it's {'name':{}}
# if a dom node has data as well as childs, data will be ignored.
# if a dom has multi child with same name, it will be stored as list.
def dom2dict(dom, replace_li = True):
    # Convert a dom_node tree back into nested dicts (rough inverse of
    # dict2dom).  Repeated sibling tags collapse into a list; with
    # replace_li=True a child whose only key is 'li' is flattened to
    # that list.
    dictionary = {}
    for child in dom.childs:
        name = child.name
        content = None
        if len(child.childs) != 0:
            content = dom2dict(child, replace_li)
        else:
            # Leaf node: take its text, normalising None to '' and
            # collapsing whitespace via utils.clean_text.
            content = child.data
            if content is None:
                content = ''
            content = utils.clean_text(content)
        if name in dictionary:
            # Second (or later) sibling with this tag: promote to a list.
            if type(dictionary[name]) is not list:
                previous = dictionary[name]
                dictionary[name] = [previous, content]
            else:
                dictionary[name].append(content)
        else:
            dictionary[name] = content
    if replace_li:
        # Flatten {'tag': {'li': [...]}} down to {'tag': [...]}.
        for key in dictionary:
            item = dictionary[key]
            if type(item) is dict:
                li = None
                if len(item.keys()) == 1:
                    for subkey in item:
                        if subkey == 'li':
                            li = item[subkey]
                if li is not None:
                    dictionary[key] = li
    return dictionary
class simple_parser(HTMLParser):
    """HTMLParser subclass that builds a dom_node tree while parsing.

    After feed(), the tree is available as self.root.
    """
    def __init__(self):
        super(simple_parser, self).__init__()
        self.root = dom_node('root')
        self.current_node = self.root   # node currently being filled
    def handle_starttag(self, tag, attrs):
        attrs_dict = attribute_string_to_dict(attrs)
        this_node = dom_node(tag, attrs_dict)
        self.current_node.add_child(this_node)
        self.current_node = this_node
    def handle_endtag(self, tag):
        # ROBUSTNESS FIX: malformed input with more close- than open-tags
        # used to walk above the root, setting current_node to None and
        # crashing on the next event.  Stay at the root instead.
        if self.current_node.father is not None:
            self.current_node = self.current_node.father
    def handle_data(self, data):
        if self.current_node.data is None:
            self.current_node.data = data
        else:
            self.current_node.data += data

19
lib/screen.py Normal file
View File

@ -0,0 +1,19 @@
import sys
class VirtualScreen():
    """In-memory line buffer that mimics a writable stream, keeping at
    most max_history entries."""
    def __init__(self, max_history=1000):
        self.max_history = max_history
        self.contents = []
    def write(self, message):
        """Store one entry, discarding the oldest beyond max_history.

        BUG FIX: max_history was stored but never enforced, so the buffer
        grew without bound for long-running services.
        """
        self.contents.append(message)
        overflow = len(self.contents) - self.max_history
        if overflow > 0:
            del self.contents[:overflow]
    def last(self, line=10, output=sys.stdout):
        """Write the most recent *line* entries to *output*, one per
        line."""
        start = max(len(self.contents) - line, 0)
        for entry in self.contents[start:]:
            output.write(entry)
            output.write('\n')

244
lib/service.py Normal file
View File

@ -0,0 +1,244 @@
import time
import sys
import shlex
import argparse
from croniter import croniter
from . import utils
from . import parallel
from . import console
from . import screen
from . import utils
class service():
    """A scheduled action: a callable plus a cron expression giving its
    next run time."""
    def __init__(self, action, args=None, kwargs=None, cron='* * * * *', managed_output=False, name='service'):
        # BUG FIX: args=[] / kwargs={} were shared mutable defaults —
        # every service built without explicit arguments aliased the same
        # list/dict.  Replaced with None sentinels.
        self.name = name
        self.action = action
        # With managed_output, the service's output stream is prepended
        # to the action's positional arguments on each run.
        self.managed_output = managed_output
        self.args = [] if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.output = sys.stdout    # replaced by ServiceManager.add()
        self.last_result = None
        self.cronexpr = cron
        self.croniter = croniter(self.cronexpr, time.time())
        self.next_time = self.croniter.get_next()
    def run(self, daemon=None, dry=False):
        """Execute the action, directly or via *daemon* (a ParallelHost).

        dry=True runs the action without advancing the schedule.
        """
        if not dry:
            self.next_time = self.croniter.get_next()
        if self.managed_output:
            new_args = [self.output, *self.args]
        else:
            new_args = self.args
        if daemon is None:
            self.last_result = self.action(*new_args, **self.kwargs)
        else:
            daemon.add_job(self.action, new_args, self.kwargs, self.name)
class ServiceManager():
    # Owns the set of scheduled services plus a ParallelHost worker pool
    # that actually runs them.  Deleted services go to a recycle bin so
    # they can be recovered; protected services cannot be deleted.
    def __init__(self, debug=False, output=sys.stdout):
        self.debug = debug
        self.services = {}            # sid -> service (active)
        self.deleted_services = {}    # sid -> service (recycle bin)
        self.protected_service = []   # sids that delete() must refuse
        self.daemon = parallel.ParallelHost()
        self.sid = 0                  # last service id handed out
        self.terminate = False
        self.output = output
        self.set_refresh_time()
    def stop(self):
        # Stop the worker pool; loop() exits on its next flag check.
        self.daemon.stop()
        self.terminate = True
    def __del__(self):
        self.stop()
    def log(self, *args, end='\n'):
        # Timestamped write to the managed output stream (may be a
        # VirtualScreen rather than a real terminal).
        self.output.write('[{0}]'.format(utils.str_time()))
        for arg in args:
            arg = str(arg)
            self.output.write(arg)
        self.output.write(end)
    def add(self, service, protected=False):
        # Register a service (redirecting its output to ours) and return
        # its newly assigned sid.
        self.sid += 1
        service.output = self.output
        self.services[self.sid] = service
        if protected:
            self.protected_service.append(self.sid)
        return self.sid
    def delete(self, sid):
        # Move a service into the recycle bin (unless protected).
        if sid in self.protected_service:
            self.log('Can not delete protected service.')
            return
        if sid in self.services:
            self.deleted_services[sid] = self.services[sid]
            del self.services[sid]
        else:
            self.log('The sid [{0}] do not exist!'.format(sid))
    def recover(self, sid):
        # Move a service back out of the recycle bin.
        if sid in self.deleted_services:
            self.services[sid] = self.deleted_services[sid]
            del self.deleted_services[sid]
        else:
            self.log('The sid [{0}] is not found recycle bin.'.format(sid))
    def set_refresh_time(self, refresh_cron='* * * * *'):
        # Install a protected no-op service so loop() wakes at least once
        # per refresh interval and picks up newly added services.
        def refresh():
            pass
        refresh_service = service(refresh, cron=refresh_cron, name='refresh')
        self.add(refresh_service, protected = True)
    def get_next(self):
        # Return (sid, next_time) of the service scheduled soonest.
        next_sid = -1
        next_time = -1
        for sid in self.services:
            service = self.services[sid]
            if service.next_time < next_time or next_sid < 0:
                next_sid = sid
                next_time = service.next_time
        return next_sid, next_time
    def loop(self):
        # Scheduler main loop: sleep until the next service is due, then
        # hand it to the worker pool.
        while not self.terminate:
            next_sid, next_time = self.get_next()
            service = self.services[next_sid]
            sleep_time = next_time - time.time()
            if sleep_time > 0:
                time.sleep(sleep_time)
            self.log('Running service {0} (SID={1})'.format(service.name, next_sid))
            # The service may have been deleted while we slept.
            if next_sid in self.services:
                service.run(self.daemon)
            else:
                self.log('the sheduled service wiil not run since it is canceled.')
    # mode: background: return immidietly
    # foreground: stuck here.
    def start(self, mode='background'):
        if mode == 'background':
            self.daemon.add_job(self.loop, name='service main loop')
        else:
            self.loop()
def get_service_console(manager, name='service'):
    # Build an interactive console exposing *manager* operations:
    # list/add/delete/recover/run services, view the log, show next job.
    con = console.console(name)
    def command_show(args):
        # List active services and the recycle bin.
        print('Active services:')
        for sid in manager.services:
            print('SID: {0} | Name: {1}'.format(sid, manager.services[sid].name))
        print('Deleted services:')
        for sid in manager.deleted_services:
            print('SID: {0} | Name: {1}'.format(sid, manager.deleted_services[sid].name))
    def command_add(args):
        # Register an arbitrary console command as a cron-scheduled
        # service; the service replays it through con.execute.
        parser = argparse.ArgumentParser()
        parser.add_argument('cron', type=str, help='A cron expr')
        parser.add_argument('task', type=str, help='task to run, should be a valid command')
        parser.add_argument('--name', '-n', type=str, default='command service', help='name of the task')
        args = shlex.split(args)
        args = parser.parse_args(args)
        cron = args.cron
        # NOTE(review): an invalid cron expression only prints a warning
        # and still falls through to register the service — confirm this
        # is intended.
        if not croniter.is_valid(cron):
            print('Invalid cron expression.')
        task = args.task
        name = args.name
        service_to_add = service(con.execute, args=[task], cron=cron, name=name)
        manager.add(service_to_add)
    def command_delete(args):
        # Delete by numeric SID.
        sid = None
        if args.isdigit():
            if int(args) in manager.services:
                sid = int(args)
        if sid is not None:
            manager.delete(sid)
        else:
            print('command arugment \"{0}\" is not understood.'.format(args))
    def command_recover(args):
        # Restore a previously deleted SID from the recycle bin.
        sid = None
        if args.isdigit():
            if int(args) in manager.deleted_services:
                sid = int(args)
        if sid is not None:
            manager.recover(sid)
        else:
            print('command arugment \"{0}\" is not understood.'.format(args))
    def command_run(args):
        # Run a service immediately without advancing its schedule.
        sid = None
        if args.isdigit():
            if int(args) in manager.services:
                sid = int(args)
        if sid is not None:
            manager.services[sid].run(dry=True)
        else:
            print('command arugment \"{0}\" is not understood.'.format(args))
    def command_info(args):
        # Show the last N lines of the manager's log screen (default 10).
        line = None
        if args != '':
            if args.isdigit():
                line = int(args)
        if line is None:
            line = 10
        manager.output.last(line)
    def command_next(args):
        # Describe the next scheduled job and its remaining wait time.
        next_sid, next_time = manager.get_next()
        info = ''
        indent = ' '
        info += 'Next Job: {0}'.format(manager.services[next_sid].name)
        info += '\n{0}SID: {1}'.format(indent, next_sid)
        info += '\n{0}Scheduled Running Time: {1}'.format(indent, utils.time2str(next_time))
        info += '\n{0}Remeaning Time: {1}s'.format(indent, utils.float2str(next_time-time.time()))
        print(info)
    con.regist('show', command_show, help_info='Show all services.', alias=['ls'])
    con.regist('run', command_run, help_info='Run a service.')
    con.regist('info', command_info, help_info='Display service output log.')
    con.regist('next', command_next, help_info='Next job to run.')
    con.regist('add', command_add, help_info='Register a command as service.')
    con.regist('delete', command_delete, help_info='Delete a service', alias=['del'])
    con.regist('recover', command_recover, help_info='Recover a service.')
    return con
if __name__ == '__main__':
    # Manual demo: two managed-output services writing to a virtual
    # screen, plus an interactive shell to inspect them.
    def func1(output):
        output.write('func1')
    def func2(output):
        output.write('func2')
    def add(a, b):
        print('{0} + {1} = {2}'.format(a, b, a+b))
    def command_add(args):
        # Shell command handler, e.g. 'add 1 2'.
        numbers = args.split(' ')
        a = float(numbers[0])
        b = float(numbers[1])
        add(a, b)
    log_screen = screen.VirtualScreen()
    manager = ServiceManager(output=log_screen)
    test1 = service(func1, cron='* * * * *', name='test1', managed_output=True)
    test2 = service(func2, cron='* * * * *', name='test2', managed_output=True)
    manager.add(test1)
    manager.add(test2)
    manager.start('background')
    con = get_service_console(manager)
    master = console.console()
    master.regist('service', con, help_info='service console')
    master.regist('add', command_add, help_info='Add two numbers.')
    master.interactive()

15
lib/try.py Normal file
View File

@ -0,0 +1,15 @@
# Scratch experiment: demonstrates * / ** argument unpacking.
def func(a, b, c, time=0, work=1):
    # Print the positional and keyword arguments received.
    print('a:{0} b:{1} c:{2}'.format(a, b, c))
    print('time:{0} work:{1}'.format(time, work))
def funcwrap(func, kargs, kkargs):
    # Apply *func* to an unpacked argument list and keyword dict.
    func(*kargs, **kkargs)
kargs = [1, 2, 3]
kkargs = {
    "time":1234,
    "work":1232
}
funcwrap(func, kargs, kkargs)

139
lib/utils.py Normal file
View File

@ -0,0 +1,139 @@
import pickle
import time
import os
import re
import platform
def detect_platform():
    """Return 'Windows', 'Linux', or 'Unknown' based on
    platform.platform()."""
    description = platform.platform()
    for known in ('Windows', 'Linux'):
        if known in description:
            return known
    return 'Unknown'
def ensure_dir_exist(directory, show_info=True):
    """Create *directory* (and missing parents) if it does not exist.

    BUG FIXES: show_info was accepted but ignored — the message printed
    unconditionally; and os.mkdir failed when intermediate directories
    were missing.  makedirs(exist_ok=True) is also race-safe.
    """
    if not os.path.isdir(directory):
        if show_info:
            print('directory', directory, ' not found, creating...')
        os.makedirs(directory, exist_ok=True)
def validateTitle(title):
    """Replace characters illegal in file names with spaces."""
    illegal = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    return re.sub(illegal, " ", title)
def list2csv(l):
    """Join items with commas (no quoting; each item is str()-ed)."""
    return ','.join(str(item) for item in l)
def clean_text(string):
    """Collapse newlines and runs of spaces into single spaces.

    None is normalised to the empty string.  (Only spaces and newlines
    are collapsed, matching the original behaviour — tabs pass through.)
    """
    if string is None:
        return ''
    string = string.replace('\n', ' ')
    words = [word for word in string.split(' ') if word != '']
    return ' '.join(words)
def clean_split(string, delimiter=' '):
    """Split on *delimiter*, dropping empty pieces.

    BUG FIX: the original filtered with `sub_str is not ''`, an identity
    test that only works because CPython interns the empty-string
    literal (and warns on 3.8+); use != instead.
    """
    return [piece for piece in string.split(delimiter) if piece != '']
def remove_blank_in_endpoint(string):
    """Strip leading and trailing blanks (space, tab, newline) — the
    same character set is_blank() recognises."""
    return string.strip(' \t\n')
def is_blank(ch):
    """True for the blank characters this module recognises: space, tab,
    newline."""
    return ch in (' ', '\t', '\n')
def dict_to_arrtibute_string(attributes):
    """Render a dict as concatenated 'key="value";' pairs.

    (Name kept as-is, misspelling included, for existing callers; a
    duplicate of the helper in lib/parser.py.)
    """
    return ''.join('{0}=\"{1}\";'.format(key, attributes[key])
                   for key in attributes)
def attribute_string_to_dict(attrs):
    """Turn (name, value) pairs into a dict (duplicate of the helper in
    lib/parser.py)."""
    return {attr[0]: attr[1] for attr in attrs}
def save_python_object(obj, save_path):
    """Pickle *obj* to the file at *save_path*."""
    with open(save_path, 'wb') as handle:
        pickle.dump(obj, handle)
def load_python_object(path):
    """Unpickle and return the object stored at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)
def delete_n(string):
    """Replace every newline in *string* with a single space."""
    # str.replace already substitutes all occurrences; the original
    # while-loop wrapper was redundant.
    return string.replace('\n', ' ')
def remove_additional_blank(string):
    """Collapse runs of spaces into single spaces and trim the ends.

    BUG FIX: the original filtered words with `word is not ''`, an
    identity test that relies on CPython interning the empty-string
    literal (SyntaxWarning on 3.8+); use != instead.
    """
    words = [word for word in string.split(' ') if word != '']
    return ' '.join(words)
def formal_text(text):
    """Normalise text: newlines to spaces, then collapse repeated
    blanks."""
    return remove_additional_blank(delete_n(text))
def float2str(f, precision=2):
    """Truncate the textual form of *f* shortly after the decimal point.

    NOTE(review): the slice keeps precision-1 digits after the dot
    (find('.') + precision), which matches the original behaviour —
    confirm whether *precision* digits were actually intended before
    changing it.
    """
    text = str(f)
    return text[:text.find('.') + precision]
# ========== time related operations ========== #
def str_day():
    """Current local date formatted as YYYY-MM-DD."""
    today = time.localtime()
    return time.strftime("%Y-%m-%d", today)
def time2str(t):
    """Format a unix timestamp via str_time()."""
    local = time.localtime(int(t))
    return str_time(local)
def str_time(local_time=None):
    """Format *local_time* (default: now) as YYYY-MM-DD-HHh-MMm-SSs."""
    if local_time is None:
        local_time = time.localtime()
    # BUG FIX: the format string ended with a stray ')' that leaked into
    # every timestamp (log prefixes, file names): "...-%Ss)".
    return time.strftime("%Y-%m-%d-%Hh-%Mm-%Ss", local_time)
if __name__ == '__main__':
    # Manual smoke test: print today's date string.
    print(str_day())

40
main.py Normal file
View File

@ -0,0 +1,40 @@
import arxiv_bot
import email_sender
import subscriber_utils
import feeds
from lib import utils
import os
from lib import service
from lib.console import console
from lib import screen
# Wire the bot's components together: subscriber config -> arxiv fetcher
# -> feed generator -> emailer, then schedule the daily mail as a service
# and drop into an interactive shell.
subscribe_manager = subscriber_utils.subscribe_manager()
# subscribe_manager.load()
bot = arxiv_bot.arxiv_bot(subscribe_manager.get_subscribed_topics())
feeds_generator = feeds.feed_manager(subscribe_manager, bot)
emailer = email_sender.arxiv_emailer(bot, feeds_generator, debug=False)
# Service output goes to an in-memory screen; view it from the shell via
# the service console's 'info' command.
log_screen = screen.VirtualScreen()
manager = service.ServiceManager(output=log_screen)
# Cron '0 4 * * 1-5': 04:00 local time, Monday through Friday.
daily_mail_service = service.service(
    emailer.send_daily_email,
    cron='0 4 * * 1-5',
    name = 'send daily email'
)
manager.add(daily_mail_service)
shell = console('ArxivBot')
def command_load(args):
    # Shell command: reload subscriber config from disk.
    if args == 'subscriber':
        subscribe_manager.load()
shell.regist('load', command_load, help_info='load config. (only subscriber supported till now)')
service_shell = service.get_service_console(manager, 'ServiceManager')
shell.regist('service', service_shell, help_info='service mamager')
# cron time:
# min hour day month week
manager.start()
shell.interactive()

92
subscriber_utils.py Normal file
View File

@ -0,0 +1,92 @@
from lib.parser import dom_node, simple_parser
class subscribe_manager():
    # Loads and exposes subscriber records (name, email, topics,
    # keywords) parsed from an XML config file.
    def __init__(self, subscriber_config = './config/subscriber.xml'):
        self.subscriber_config = None
        self.subscribers = {}   # name -> {'email', 'topics', 'keywords'}
        if subscriber_config is not None:
            self.subscriber_config = subscriber_config
            self.load()
    def show(self):
        # Print a one-line summary per subscriber.
        if self.subscribers is None:
            print('No subscriber found!')
        else:
            for name in self.subscribers:
                print('Name:', name, 'Email:', self.subscribers[name]['email'])
    def load(self, path=None):
        # Parse the XML config at *path* (default: the configured file)
        # and rebuild self.subscribers from it.
        if path is None:
            path = self.subscriber_config
        if path is None:
            return None
        tree = None
        with open(path, 'r') as f:
            xml = f.read()
            parser = simple_parser()
            parser.feed(xml)
            tree = parser.root
        subscribers = {}
        if tree is not None:
            # Each top-level child is one <person> record.
            for person in tree.childs:
                person_name = None
                person_email = None
                person_topics = []
                person_keywords = []
                for item in person.childs:
                    if item.name == 'name':
                        person_name = item.data
                    elif item.name == 'email':
                        person_email = item.data
                    elif item.name == 'topics':
                        for topic in item.childs:
                            if topic.name == 'topic':
                                person_topics.append(topic.data)
                    elif item.name == 'keywords':
                        for keyword in item.childs:
                            if keyword.name == 'keyword':
                                person_keywords.append(keyword.data)
                # NOTE(review): person_topics is a list, so 'is not None'
                # is always true — a subscriber with no <topics> is still
                # accepted; confirm whether that is intended.
                if person_name is not None and person_email is not None and person_topics is not None:
                    subscriber = {}
                    subscriber['keywords'] = person_keywords
                    subscriber['email'] = person_email
                    subscriber['topics'] = person_topics
                    subscribers[person_name] = subscriber
        self.subscribers = subscribers
        print('Subscriber load success! All subscribers are shown below:')
        self.show();
    def get_subscribed_topics(self):
        # Union of every subscriber's topics, returned as a set.
        topics = []
        for name in self.subscribers:
            subscriber = self.subscribers[name]
            topics += subscriber['topics']
        topics = set(topics)
        return topics
    def get_subscribed_keywords(self):
        # Union of every subscriber's keywords, returned as a set.
        keywords = []
        for name in self.subscribers:
            keywords += self.subscribers[name]['keywords']
        keywords = set(keywords)
        return keywords
    def get_keywords_of_topics(self):
        # topic -> combined keyword list of everyone subscribed to that
        # topic (list may contain duplicates).
        keywords_of_topics = {}
        for name in self.subscribers:
            subscriber = self.subscribers[name]
            topic_group = subscriber['topics']
            for topic in topic_group:
                if topic not in keywords_of_topics:
                    keywords_of_topics[topic] = []
                keywords_of_topics[topic] += subscriber['keywords']
        return keywords_of_topics
if __name__ == '__main__':
    # Manual smoke test: load the default config and dump the views.
    manager = subscribe_manager()
    print(manager.subscribers)
    print(manager.get_subscribed_topics())
    print(manager.get_subscribed_keywords())
    print(manager.get_keywords_of_topics())

21
try.py Normal file
View File

@ -0,0 +1,21 @@
import arxiv_service
import time
# Scratch script exercising the hand-rolled cron scheduler in
# arxiv_service: compute the next 10 runs of a Feb-29 schedule.
now = arxiv_service.cron_time(time.localtime(time.time()))
# now.show()
# while True:
#     now.next_day()
#     now.show()
# running time
# minute hour day month week year
# * means always.
# a-b means from a to b (a and b included)
# a means run at this time.
# must match all to execute a command.
schedule = arxiv_service.cron_expr('0 0 29 2 * *')
for i in range(10):
    now = schedule.next_run(now)
    now.show()
print(now.to_struct_time())