ArxivRobot/lib/parser.py
2019-12-23 12:42:31 +08:00

151 lines
4.1 KiB
Python

from html.parser import HTMLParser
from . import utils
def dict_to_arrtibute_string(attributes):
string = ''
for key in attributes:
string += key + '=\"{0}\";'.format(str(attributes[key]))
return string
def attribute_string_to_dict(attrs):
attr_dict = {}
for attr in attrs:
attr_dict[attr[0]] = attr[1]
return attr_dict
class dom_node():
def __init__(self, name = None, attributes = {}):
if name is not None:
self.name = name
else:
self.name = 'Node'
self.attributes = attributes
self.childs = []
self.data = None
self.father = None
def add_child(self, child):
if child is not None:
child.father = self
self.childs.append(child)
def to_string(self, prefix='', indent=' '):
string = prefix + '<' + self.name
if self.attributes:
string += ' ' + dict_to_arrtibute_string(self.attributes)
string += '>\n'
for child in self.childs:
string += child.to_string(prefix=prefix+indent, indent=indent)
if self.data is not None:
string += prefix + indent + self.data + '\n'
string += prefix + '</{0}>\n'.format(self.name)
return string
def has_child(self, name):
has = False
for child in self.childs:
if child.name == name:
has = True
break;
return has
def search(self, name):
founded_node = []
if type(name) is list:
if self.name in name:
founded_node.append(self)
else:
if self.name == name:
founded_node.append(self)
for child in self.childs:
search_result = child.search(name)
founded_node += search_result
return founded_node
def dict2dom(d, root_name='root'):
node = dom_node(root_name)
for key in d:
elem = d[key]
child_node = dom_node(name=str(key))
if type(elem) is dict:
child_node = dict2dom(elem, root_name=str(key))
elif type(elem) is list:
for subelem in elem:
if type(subelem) is dict:
sub_node = dict2dom(subelem, root_name='li')
child_node.add_child(sub_node)
else:
sub_node = dom_node('li')
sub_node.data = str(subelem)
child_node.add_child(sub_node)
else:
child_node.data = str(elem)
node.add_child(child_node)
return node
# if a dom node has data only, then it's {'name':'data'}
# if a dom node has childs, then it's {'name':{}}
# if a dom node has data as well as childs, data will be ignored.
# if a dom has multi child with same name, it will be stored as list.
def dom2dict(dom, replace_li = True):
dictionary = {}
for child in dom.childs:
name = child.name
content = None
if len(child.childs) != 0:
content = dom2dict(child, replace_li)
else:
content = child.data
if content is None:
content = ''
content = utils.clean_text(content)
if name in dictionary:
if type(dictionary[name]) is not list:
previous = dictionary[name]
dictionary[name] = [previous, content]
else:
dictionary[name].append(content)
else:
dictionary[name] = content
if replace_li:
for key in dictionary:
item = dictionary[key]
if type(item) is dict:
li = None
if len(item.keys()) == 1:
for subkey in item:
if subkey == 'li':
li = item[subkey]
if li is not None:
dictionary[key] = li
return dictionary
class simple_parser(HTMLParser):
def __init__(self):
super(simple_parser, self).__init__()
self.root = dom_node('root')
self.current_node = self.root
def handle_starttag(self, tag, attrs):
attrs_dict = attribute_string_to_dict(attrs)
this_node = dom_node(tag, attrs_dict)
self.current_node.add_child(this_node)
self.current_node = this_node
def handle_endtag(self, tag):
self.current_node = self.current_node.father
def handle_data(self, data):
if self.current_node.data is None:
self.current_node.data = data
else:
self.current_node.data += data