"""Minimal DOM tree built on ``html.parser``, with dict <-> DOM converters."""

from html.parser import HTMLParser

from . import utils


def dict_to_arrtibute_string(attributes):
    """Serialize an attribute mapping as consecutive ``key="value";`` entries.

    Values are converted with ``str()``. Entries follow the mapping's
    iteration order and are not separated by anything except the trailing
    ``;`` of each entry. An empty mapping yields an empty string.
    """
    return ''.join(
        key + '="{0}";'.format(str(value))
        for key, value in attributes.items()
    )


def attribute_string_to_dict(attrs):
    """Turn an iterable of ``(name, value)`` pairs into a dict.

    When a name occurs more than once, the last value wins.
    """
    return {attr[0]: attr[1] for attr in attrs}


class dom_node:
    """A node of a simple tree: name, attributes, children, text and parent."""

    def __init__(self, name=None, attributes=None):
        """Create a detached node.

        Args:
            name: Node name; ``'Node'`` is used when ``None``.
            attributes: Attribute mapping stored by reference. When omitted,
                each node gets its own fresh dict (a shared ``{}`` default
                would leak attributes between unrelated nodes).
        """
        self.name = name if name is not None else 'Node'
        self.attributes = {} if attributes is None else attributes
        self.childs = []
        self.data = None
        self.father = None

    def add_child(self, child):
        """Append ``child`` and set its ``father``; ``None`` is ignored."""
        if child is None:
            return
        child.father = self
        self.childs.append(child)

    def to_string(self, prefix='', indent=' '):
        """Render this node and its subtree as indented markup.

        Args:
            prefix: Text placed in front of this node's opening/closing tags.
            indent: Text appended to ``prefix`` for every nesting level.

        Returns:
            The opening tag (with attributes, if any), the rendered children,
            the node's data (if any) and the closing tag, one per line.
        """
        opening = prefix + '<' + self.name
        if self.attributes:
            opening += ' ' + dict_to_arrtibute_string(self.attributes)
        parts = [opening + '>\n']
        parts.extend(
            child.to_string(prefix=prefix + indent, indent=indent)
            for child in self.childs
        )
        if self.data is not None:
            parts.append(prefix + indent + self.data + '\n')
        # The closing tag needs a placeholder for the name; without it the
        # format() call emitted only a newline and the element never closed.
        parts.append(prefix + '</{0}>\n'.format(self.name))
        return ''.join(parts)

    def has_child(self, name):
        """Return True if a direct child is named ``name``."""
        return any(child.name == name for child in self.childs)

    def search(self, name):
        """Collect this node and all descendants whose name matches.

        Args:
            name: A single name, or a list/tuple/set of accepted names.

        Returns:
            Matching nodes in depth-first, pre-order sequence.
        """
        if isinstance(name, (list, tuple, set, frozenset)):
            matched = self.name in name
        else:
            matched = self.name == name
        found = [self] if matched else []
        for child in self.childs:
            found += child.search(name)
        return found


def dict2dom(d, root_name='root'):
    """Build a ``dom_node`` tree from a (possibly nested) dict.

    Each key becomes a child named ``str(key)``:

    * a dict value is converted recursively;
    * a list value becomes a child holding one ``'li'`` node per element
      (dict elements are converted recursively, others stored as text);
    * any other value is stored as the child's data via ``str()``.

    Args:
        d: The mapping to convert.
        root_name: Name given to the returned root node.

    Returns:
        The root ``dom_node``.
    """
    node = dom_node(root_name)
    for key, elem in d.items():
        if isinstance(elem, dict):
            child_node = dict2dom(elem, root_name=str(key))
        else:
            child_node = dom_node(name=str(key))
            if isinstance(elem, list):
                for subelem in elem:
                    if isinstance(subelem, dict):
                        sub_node = dict2dom(subelem, root_name='li')
                    else:
                        sub_node = dom_node('li')
                        sub_node.data = str(subelem)
                    child_node.add_child(sub_node)
            else:
                child_node.data = str(elem)
        node.add_child(child_node)
    return node


def dom2dict(dom, replace_li=True):
    """Convert the children of ``dom`` into a nested dict.

    Mapping rules:

    * a node with data only becomes ``{'name': 'data'}`` (missing data is
      treated as ``''``, and the text is passed through
      ``utils.clean_text``);
    * a node with children becomes ``{'name': {...}}``;
    * if a node has both data and children, the data is ignored;
    * several children sharing a name are stored as a list under that name.

    Node attributes are not part of the result.

    Args:
        dom: Node whose children are converted; ``dom`` itself is not a key.
        replace_li: When true, a dict value whose only key is ``'li'`` is
            replaced by the value stored under ``'li'``.

    Returns:
        The resulting dict.
    """
    dictionary = {}
    for child in dom.childs:
        name = child.name
        if child.childs:
            content = dom2dict(child, replace_li)
        else:
            content = '' if child.data is None else child.data
            content = utils.clean_text(content)

        if name not in dictionary:
            dictionary[name] = content
        elif isinstance(dictionary[name], list):
            dictionary[name].append(content)
        else:
            # Second occurrence of this name: promote the entry to a list.
            dictionary[name] = [dictionary[name], content]

    if replace_li:
        # Only values are reassigned, so iterating the dict here is safe.
        for key in dictionary:
            item = dictionary[key]
            if isinstance(item, dict) and len(item) == 1 and 'li' in item:
                li = item['li']
                if li is not None:
                    dictionary[key] = li
    return dictionary


class simple_parser(HTMLParser):
    """``HTMLParser`` subclass that records the document as ``dom_node``s.

    After feeding input, the tree hangs off ``self.root``.
    """

    def __init__(self):
        super(simple_parser, self).__init__()
        self.root = dom_node('root')
        self.current_node = self.root

    def handle_starttag(self, tag, attrs):
        """Open a new node under the current one and descend into it."""
        attrs_dict = attribute_string_to_dict(attrs)
        this_node = dom_node(tag, attrs_dict)
        self.current_node.add_child(this_node)
        self.current_node = this_node

    def handle_endtag(self, tag):
        """Step back to the parent of the current node.

        The tag name is not matched against the current node. A stray end
        tag seen while already at the root is ignored, so ``current_node``
        never becomes ``None`` and later start tags/data keep working.
        """
        father = self.current_node.father
        if father is not None:
            self.current_node = father

    def handle_data(self, data):
        """Append text to the current node's data."""
        if self.current_node.data is None:
            self.current_node.data = data
        else:
            self.current_node.data += data