#
# pangocheck.py: data and methods for checking pango markup strings
#
# Copyright (C) 2014  Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Author: David Shea
#

import re
from collections import Counter

__all__ = ["markup_nodes", "is_markup", "markup_match", "markup_necessary"]

# "a" isn't actually pango markup, but GtkLabel uses it
markup_nodes = ["markup", "a", "b", "big", "i", "s", "span", "sub", "sup", "small", "tt", "u"]

# Anything that looks like a tag: "<", any run of non-">" characters, ">".
_TAG_RE = re.compile(r'<[^>]*>')

# Split a tag into its element name (everything up to the first whitespace,
# "/" or ">") and the remainder. DOTALL is required because _TAG_RE happily
# matches tags that are wrapped across lines, and without it "." would refuse
# to cross the newline and the match would fail.
_TAG_PARTS_RE = re.compile(r'<([^\s/>]*)(.*)>', re.DOTALL)


def is_markup(test_string):
    """Return True if test_string looks like it contains Pango markup.

    This is a heuristic only, no validation is performed: the string is
    considered markup if it contains an opening tag for any of the element
    names in markup_nodes, followed by whitespace or ">".
    """
    return any(re.search(r'<\s*%s(\s|>)' % node_type, test_string)
               for node_type in markup_nodes)


def _count_tags(markup_string):
    """Return (name_count, attr_count) Counters for the tags in markup_string.

    "Don't parse XML with regular expressions" I can hear you saying, but
    we're not trying to match elements, just pull tag-like substrings out of
    the string. Figuring out if tags are closed or in the right order is
    someone else's job.
    """
    name_count = Counter()
    attr_count = Counter()

    for tag in _TAG_RE.findall(markup_string):
        # Treat everything up to the first space, / or > as the element name.
        # A closing tag such as "</b>" therefore yields an empty name and the
        # token "/b" in the remainder, which still gets counted below.
        name, rest = _TAG_PARTS_RE.match(tag).groups()
        name_count[name] += 1

        # Strip the / from the rest of the tag, if present (e.g. "<br/>")
        if rest.endswith('/'):
            rest = rest[:-1]

        # Every whitespace-separated token left over must also appear in the
        # other string
        attr_count.update(rest.split())

    return (name_count, attr_count)


def markup_match(orig_markup, xlated_markup):
    """Verify that a translated markup string looks more or less like the original.

    Both strings are scanned for tag-like substrings. The strings match when
    they contain the same element names the same number of times and the same
    whitespace-separated attribute tokens the same number of times. The order
    of tags and attributes is not considered.

    Returns True if the tag and attribute counts agree, False otherwise.
    """
    name_count1, attr_count1 = _count_tags(orig_markup)
    name_count2, attr_count2 = _count_tags(xlated_markup)

    # All stored counts are positive, so comparing the Counters directly is
    # the same as comparing the sorted lists of their elements.
    return (name_count1 == name_count2) and (attr_count1 == attr_count2)


def markup_necessary(markup_tree):
    """Check whether the markup in a string is needed at all.

    The input is a parsed ElementTree element of a string like
    '<markup>pango markup goes here</markup>'.

    The markup is unnecessary if the only markup in the string surrounds the
    entire rest of the string, meaning that the pango attributes apply to the
    entire string and thus could be expressed using attribute lists. For
    example, strings like:
        <b>Bold text</b>
    or
        <span foreground="grey"><i>colorful</i></span>
    but not strings like:
        <span size="small">This string contains <b>internal</b> markup</span>
    that contain markup that must be passed to the translators.

    Returns True if the markup is necessary and False if the markup can be
    discarded and expressed as attribute lists.
    """
    # QUICK NOTE FOR PEOPLE EXPECTING ElementTree TO ACT KINDA LIKE DOM
    # ElementTree is kind of weird with respect to handling multiple text
    # children of an Element node. element.text is the text leading up to the
    # first element child, and element[child_idx].tail is the text following
    # the child node that is actually a child of element but isn't a property
    # of element.
    #
    # A string like
    #   "<element>word1<child1>word2</child1>word3<child2/>word4</element>"
    # will result in
    #   tree == <Element 'element'>
    #   tree.text == 'word1'
    #   tree[0] == <Element 'child1'>
    #   tree[0].text == 'word2'
    #   tree[0].tail == 'word3'
    #   tree[1] == <Element 'child2'>
    #   tree[1].text == None
    #   tree[1].tail == 'word4'
    #
    # So elements that contain text before a child markup element will have
    # element.text set. Elements that have text after a child element will
    # have .tail set on that child.
    node = markup_tree
    while True:
        # Use len() rather than truthiness: it is the number of child elements.
        child_count = len(node)

        # If the element has no children at all, there is no markup inside
        # and the markup is unnecessary.
        if child_count == 0:
            return False

        # If there is more than one child, the markup is necessary
        if child_count > 1:
            return True

        # If .text is set, there is text before the child node, as in
        # <element>text <child>child</child></element>
        # and the markup is necessary
        if node.text:
            return True

        # If the child (we already know there's only one) has .tail set, then
        # there is text between the close of the child and the end of the
        # element and the markup is necessary
        only_child = node[0]
        if only_child.tail:
            return True

        # Descend into the single child and repeat the same checks
        node = only_child