127 lines
5.2 KiB
Python
127 lines
5.2 KiB
Python
|
#
|
||
|
# pangocheck.py: data and methods for checking pango markup strings
|
||
|
#
|
||
|
# Copyright (C) 2014 Red Hat, Inc.
|
||
|
#
|
||
|
# This program is free software; you can redistribute it and/or modify
|
||
|
# it under the terms of the GNU Lesser General Public License as published
|
||
|
# by the Free Software Foundation; either version 2.1 of the License, or
|
||
|
# (at your option) any later version.
|
||
|
#
|
||
|
# This program is distributed in the hope that it will be useful,
|
||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
# GNU Lesser General Public License for more details.
|
||
|
#
|
||
|
# You should have received a copy of the GNU Lesser General Public License
|
||
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
#
|
||
|
# Author: David Shea <dshea@redhat.com>
|
||
|
|
||
|
import re
|
||
|
from collections import Counter
|
||
|
|
||
|
# Names exported by "from pangocheck import *".  markup_necessary is a public
# function defined in this module, so it is exported alongside the others.
__all__ = ["markup_nodes", "is_markup", "markup_match", "markup_necessary"]
|
||
|
|
||
|
# "a" isn't actually pango markup, but GtkLabel uses it
|
||
|
markup_nodes = [
    "markup", "a", "b", "big", "i", "s", "span", "sub", "sup", "small", "tt", "u",
]


def is_markup(test_string):
    """Return True if test_string looks like it contains Pango markup.

    This is a cheap heuristic, not a validator: the string is considered
    markup when it contains a "<", optional whitespace, one of the element
    names in markup_nodes, and then either whitespace or ">".  Closing tags
    alone (such as "</b>") do not count, because the "/" is not whitespace.
    """
    for tag_name in markup_nodes:
        opening_tag = r'<\s*%s(\s|>)' % tag_name
        if re.search(opening_tag, test_string) is not None:
            return True

    # None of the known element names showed up as an opening tag
    return False
|
||
|
|
||
|
# Verify that the translation of a markup string looks more or less like the original
|
||
|
def markup_match(orig_markup, xlated_markup):
    """Check that a translated markup string uses the same tags as the original.

    Both strings are scanned for tag-like substrings ("<...>").  The strings
    match when they contain the same element names the same number of times
    and the same whitespace-separated attribute tokens the same number of
    times.  Tag order, nesting and whether tags are closed are not checked.

    orig_markup   -- the untranslated markup string
    xlated_markup -- the translated markup string

    Returns True if the tag names and attribute tokens agree, False otherwise.
    """
    # "Don't parse XML with regular expressions" I can hear you saying, but we're
    # not trying to match elements, just pull tag-like substrings out of the string.
    # Figuring out if tags are closed or in the right order is someone else's job.

    # Treat everything up to the first space, / or > as the element name and
    # everything else before the closing > as the rest of the tag.  DOTALL is
    # required: the tag finder below accepts newlines inside a tag, and without
    # DOTALL "." would stop at the newline, the match would fail and
    # .groups() would raise AttributeError.
    tag_parts = re.compile(r'<([^\s/>]*)(.*)>', re.DOTALL)

    def _parse_markup(markup_string):
        # Count each element name and each attribute token found in the string
        name_count = Counter()
        attr_count = Counter()

        for tag in re.findall(r'<[^>]*>', markup_string):
            # A closing tag such as </b> yields an empty name and a "/b"
            # token in rest, which is counted with the attributes below.
            (name, rest) = tag_parts.match(tag).groups()
            name_count[name] += 1

            # Strip the / from the rest of the tag, if present
            if rest.endswith('/'):
                rest = rest[:-1]

            # Record the attributes that need to be contained in the other string
            attr_count.update(rest.split())

        return (name_count, attr_count)

    # Counters compare equal when every key has the same count, which is the
    # same test as comparing the sorted lists of their elements.
    return _parse_markup(orig_markup) == _parse_markup(xlated_markup)
|
||
|
|
||
|
# Check that the markup is needed at all.
|
||
|
# The input is a parsed ElementTree of the string '<markup>pango markup goes here</markup>'
|
||
|
# The markup is unnecessary if the only markup in the string surrounds the entire rest of
|
||
|
# the string, meaning that the pango attributes apply to the entire string, and thus
|
||
|
# could be expressed using attribute lists. For example, strings like:
|
||
|
# <b>Bold text</b>
|
||
|
# or
|
||
|
# <span foreground="grey"><i>colorful</i></span>
|
||
|
# but not strings like:
|
||
|
# <span size="small">This string contains <b>internal</b> markup</span>
|
||
|
# that contain markup that must be passed to the translators.
|
||
|
#
|
||
|
# This function returns True if the markup is necessary and False if the markup
|
||
|
# can be discarded and expressed as attribute lists.
|
||
|
def markup_necessary(markup_tree):
    """Return True if the markup in markup_tree cannot be dropped.

    markup_tree is a parsed ElementTree element.  The markup is unnecessary
    (False) when each level holds at most one child element with no text
    before or after it, all the way down; otherwise it is necessary (True).
    """
    # A note on how ElementTree stores text, since it differs from DOM:
    # element.text is the text leading up to the first child element, and
    # the text that follows a child is stored on that child as child.tail
    # rather than on the parent.
    #
    # A string like "<markup>word1<i>word2</i>word3<empty/>word4</markup>" will result in
    #   tree == <Element 'markup' ...>
    #   tree.text == 'word1'
    #   tree[0] == <Element 'i' ...>
    #   tree[0].text == 'word2'
    #   tree[0].tail == 'word3'
    #   tree[1] == <Element 'empty' ...>
    #   tree[1].text == None
    #   tree[1].tail == 'word4'
    current = markup_tree

    # Walk down the chain of single children instead of recursing
    while True:
        # Use len() for the child count; an element's own truth value is
        # not a reliable "has children" test in ElementTree.
        child_total = len(current)

        # No child elements: nothing but text at this level, so the
        # markup wrapping it is unnecessary.
        if child_total == 0:
            return False

        # Several children means markup inside the string: necessary
        if child_total > 1:
            return True

        sole_child = current[0]

        # Text before the child, as in <span>text <b>child</b></span>,
        # or text between the end of the child and the end of this element,
        # means the child only covers part of the string: necessary
        if current.text or sole_child.tail:
            return True

        # The child spans this whole element; apply the same checks to it
        current = sole_child
|