1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Helper functions for working with XML."""
22
23 import re
24
25 from lxml import etree
26
27
# Precompiled XPath expressions. Each one is built once at import time and is
# then called with a context node, e.g. ``string_xpath(node)``.

# Selects the context node and every ancestor element that carries
# xml:space="preserve" (the ancestor-or-self axis includes the node itself).
xml_preserve_ancestors = etree.XPath("ancestor-or-self::*[attribute::xml:space='preserve']")
"""All ancestors with xml:space='preserve'"""

# Selects the xml:space attributes themselves (whatever their value) found on
# the context node and on its ancestors.
xml_space_ancestors = etree.XPath("ancestor-or-self::*/attribute::xml:space")
"""All xml:space attributes in the ancestors"""

# XPath string(): the string value of the context node, whitespace untouched.
string_xpath = etree.XPath("string()")
"""Return a non-normalized string in the node subtree"""

# XPath normalize-space(): the same string value with leading and trailing
# whitespace stripped and internal whitespace runs collapsed.
string_xpath_normalized = etree.XPath("normalize-space()")
"""Return a (space) normalized string in the node subtree"""
39
40
def getText(node, xml_space="preserve"):
    """Return the plain text held in ``node`` and its subtree.

    The whitespace mode to apply is resolved by ``getXMLspace(node,
    xml_space)``, so ``xml_space`` acts as the fallback when the node does
    not specify a mode itself. When the resolved mode is ``"default"`` the
    text is whitespace-normalized; for any other mode it is returned
    unmodified.
    """
    effective_space = getXMLspace(node, xml_space)
    # Anything other than "default" keeps the whitespace exactly as found.
    if effective_space != "default":
        return unicode(string_xpath(node))
    # "default" mode: let XPath normalize-space() collapse the whitespace.
    return unicode(string_xpath_normalized(node))
51
52
53
54
55
56
57
58
59
60
# Namespace URI of the reserved "xml" prefix; used in this module to build the
# Clark-notation ("{uri}name") keys for the xml:lang and xml:space attributes.
XML_NS = 'http://www.w3.org/XML/1998/namespace'
62
63
65 """Gets the xml:lang attribute on node"""
66 return node.get("{%s}lang" % XML_NS)
67
68
70 """Sets the xml:lang attribute on node"""
71 node.set("{%s}lang" % XML_NS, lang)
72
73
80
81
83 """Sets the xml:space attribute on node"""
84 node.set("{%s}space" % XML_NS, value)
85
86
88 """Returns name in Clark notation within the given namespace.
89
90 For example namespaced("source") in an XLIFF document might return::
91 {urn:oasis:names:tc:xliff:document:1.1}source
92 This is needed throughout lxml.
93 """
94 if namespace:
95 return "{%s}%s" % (namespace, name)
96 else:
97 return name
98
# One or more consecutive newline, carriage-return, tab or space characters.
MULTIWHITESPACE_PATTERN = r"[\n\r\t ]+"
# Compiled once at import time. re.MULTILINE is intentionally not passed: that
# flag only changes how the ^ and $ anchors match, and this pattern contains
# neither, so it never had any effect on what the expression matches.
MULTIWHITESPACE_RE = re.compile(MULTIWHITESPACE_PATTERN)
101
102
107
108
110 """normalize spaces following the nodes xml:space, or alternatively the
111 given xml_space parameter."""
112 xml_space = getXMLspace(node) or xml_space
113 if xml_space == 'preserve':
114 return
115 if node.text:
116 node.text = normalize_space(node.text)
117 if remove_start and node.text[0] == u" ":
118 node.text = node.text.lstrip()
119 remove_start = False
120 if len(node.text) > 0 and node.text.endswith(u" "):
121 remove_start = True
122 if len(node) == 0:
123 node.text = node.text.rstrip()
124 if node.tail:
125 node.tail = normalize_space(node.tail)
126
127 for child in node:
128 normalize_xml_space(child, remove_start)
129