#
# pangocheck.py: data and methods for checking pango markup strings
#
# Copyright (C) 2014 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Author: David Shea
import re
from collections import Counter
# Public names exported by a star-import of this module. Every public helper
# defined in the file is listed, including markup_necessary.
__all__ = ["markup_nodes", "is_markup", "markup_match", "markup_necessary"]

# Element names that is_markup() searches for when deciding whether a string
# looks like markup.
# "a" isn't actually pango markup, but GtkLabel uses it
markup_nodes = ["markup", "a", "b", "big", "i", "s", "span", "sub", "sup", "small", "tt", "u"]
# Heuristic test for Pango markup: report whether test_string contains something
# that looks like an opening tag for any element named in markup_nodes.
# A hit is "<", optional whitespace, the element name, then whitespace or ">".
# Nothing is validated; the string does not have to be well-formed markup.
# Returns True on the first element name that is found, False otherwise.
def is_markup(test_string):
    for node_type in markup_nodes:
        tag_pattern = r'<\s*%s(\s|>)' % node_type
        if re.search(tag_pattern, test_string) is not None:
            return True
    return False
# Verify that the translation of a markup string looks more or less like the original.
#
# Both strings are scanned for tag-like substrings (everything from a "<" up to
# the next ">"). The strings match when they contain the same element names the
# same number of times and the same whitespace-separated attribute tokens the
# same number of times. The order of tags and attributes is ignored.
#
# Returns True if the two strings carry equivalent tags, False otherwise.
def markup_match(orig_markup, xlated_markup):
    # Look for tags. Create a count of each kind of tag and a count of each attribute.
    # "Don't parse XML with regular expressions" I can hear you saying, but we're
    # not trying to match elements, just pull tag-like substrings out of the string.
    # Figuring out if tags are closed or in the right order is someone else's job.
    def _parse_markup(markup_string):
        name_count = Counter()
        attr_count = Counter()

        for tag in re.findall(r'<[^>]*>', markup_string):
            # Treat everything up to the first space, / or > as the element name.
            # A closing tag such as </b> therefore yields an empty name and the
            # token "/b" ends up in the attribute tally, which still compares
            # correctly between the two strings.
            #
            # re.DOTALL is required: the findall pattern above spans newlines, so
            # a tag may contain one, and without DOTALL ".*" would stop at the
            # newline and the match would fail.
            (name, rest) = re.match(r'<([^\s/>]*)(.*)>', tag, re.DOTALL).groups()
            name_count[name] += 1

            # Strip the / from the rest of the tag, if present (e.g. <br/>)
            if rest.endswith('/'):
                rest = rest[:-1]

            # Tally the attributes that need to be contained in the other string
            attr_count.update(rest.split())

        return (name_count, attr_count)

    (name_count1, attr_count1) = _parse_markup(orig_markup)
    (name_count2, attr_count2) = _parse_markup(xlated_markup)

    # Every stored count is at least 1, so comparing the Counters directly is
    # the same as comparing their sorted element lists.
    return (name_count1 == name_count2) and (attr_count1 == attr_count2)
# Check that the markup is needed at all.
#
# The input is a parsed ElementTree of the string '<markup>pango markup goes here</markup>'
#
# Markup is unnecessary when the only markup in the string wraps the entire rest
# of the string: the pango attributes then apply to the whole text and could be
# expressed with attribute lists instead. For example, strings like:
#     <b>Bold text</b>
# or
#     <span foreground="grey"><i>colorful</i></span>
# do not need their markup, whereas strings like:
#     <span font-size="small">This string <b>contains</b> internal markup</span>
# contain markup that must be passed to the translators.
#
# Returns True if the markup is necessary and False if the markup can be
# discarded and expressed as attribute lists.
def markup_necessary(markup_tree):
    # How ElementTree stores text, for readers expecting DOM-like text nodes:
    # element.text is the text between the element's start tag and its first
    # child, and child.tail is the text that follows that child's end tag
    # inside the parent.
    #
    # A string like "<markup>word1<span1>word2</span1>word3<span2/>word4</markup>"
    # will result in
    #     tree         is the 'markup' element
    #     tree.text    == 'word1'
    #     tree[0]      is the 'span1' element
    #     tree[0].text == 'word2'
    #     tree[0].tail == 'word3'
    #     tree[1]      is the 'span2' element
    #     tree[1].text == None
    #     tree[1].tail == 'word4'
    #
    # Walk down the chain of single children, starting at the root.
    node = markup_tree
    while True:
        child_count = len(node)

        # No children at all: nothing but text inside, so every enclosing
        # element wrapped the whole string and the markup is unnecessary.
        if child_count == 0:
            return False

        # Sibling elements cannot each cover the whole string.
        if child_count > 1:
            return True

        only_child = node[0]

        # Text before the child (node.text) or between the child's end tag and
        # the end of this element (only_child.tail) means the child covers only
        # part of the string, so the markup is necessary.
        if node.text or only_child.tail:
            return True

        # The single child spans all of this element; examine it next.
        node = only_child