1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Manage the Universal Terminology eXchange (UTX) format
22
23 UTX is a format for terminology exchange, designed it seems with Machine
24 Translation (MT) as its primary consumer. The format is created by
25 the Asia-Pacific Association for Machine Translation (AAMT).
26
27 It is a bilingual base class derived format with L{UtxFile}
28 and L{UtxUnit} providing file and unit level access.
29
30 The format can manage monolingual dictionaries but these classes don't
31 implement that.
32
33 Specification
34 =============
35 The format is implemented according to the v1.0 UTX
36 L{specification<http://www.aamt.info/english/utx/utx-simple-1.00-specification-e.pdf>}
37
38 Format Implementation
39 =====================
40 The UTX format is a Tab Separated Value (TSV) file in UTF-8. The
41 first two lines are headers with subsequent lines containing a
42 single source target definition.
43
44 Encoding
45 --------
46 The files are UTF-8 encoded with no BOM and CR+LF line terminators.
47 """
48
49 import csv
50 import sys
51 import time
52
53 from translate.storage import base
54
55
73 csv.register_dialect("utx", UtxDialect)
74
75
77 """A UTX header entry
78
79 A UTX header is a single line that looks like this::
80 #UTX-S <version>; < source language >/< target language>;
81 <date created>; <optional fields (creator, license, etc.)>
82
83 Where::
84 - UTX-S version is currently 1.00.
85 - Source language/target language: ISO 639, 3166 formats.
86 In the case of monolingual dictionary, target language should be
87 omitted.
88 - Date created: ISO 8601 format
89 - Optional fields (creator, license, etc.)
90 """
91
92
93 -class UtxUnit(base.TranslationUnit):
94 """A UTX dictionary unit"""
95
101
103 """Get the dictionary of values for a UTX line"""
104 return self._dict
105
107 """Set the dictionary of values for a UTX line
108
109 @param newdict: a new dictionary with UTX line elements
110 @type newdict: Dict
111 """
112
113 self._dict = newdict
114 dict = property(getdict, setdict)
115
117 if key not in self._dict:
118 return None
119 elif self._dict[key]:
120 return self._dict[key].decode('utf-8')
121 else:
122 return ""
123
125
126 if newvalue is None:
127 self._dict[key] = None
128 if isinstance(newvalue, unicode):
129 newvalue = newvalue.encode('utf-8')
130 if not key in self._dict or newvalue != self._dict[key]:
131 self._dict[key] = newvalue
132
135
def addnote(self, text, origin=None, position="append"):
    """Store a note in the unit's C{comment} field

    With C{position} set to "append" and a non-empty comment already
    present, C{text} is added after the existing comment on a new line.
    In every other case the comment is replaced by C{text}.

    @param text: the note text to store
    @param origin: accepted but not used by this method
    @param position: "append" extends an existing comment, any other
    value overwrites it
    """
    existing = self._get_field('comment')
    should_extend = (position == "append"
                     and existing is not None
                     and existing != u'')
    if not should_extend:
        self._set_field('comment', text)
        return
    self._set_field('comment', existing + '\n' + text)
142
145
148
152 source = property(getsource, setsource)
153
156
160 target = property(gettarget, settarget)
161
163 self._dict['target-lang'] = newlang
164 targetlang = property(None, settargetlang)
165
167 return str(self._dict)
168
170 return bool(self._dict.get('tgt', None))
171
172
173 -class UtxFile(base.TranslationStore):
174 """A UTX dictionary file"""
175 Name = _("UTX Simple Dictionary")
176 Mimetypes = ["text/x-utx"]
177 Extensions = ["utx"]
178
180 """Construct an UTX dictionary, optionally reading in from
181 inputfile."""
182 self.UnitClass = unitclass
183 base.TranslationStore.__init__(self, unitclass=unitclass)
184 self.filename = ''
185 self.extension = ''
186 self._fieldnames = ['src', 'tgt', 'src:pos']
187 self._header = {"version": "1.00",
188 "source_language": "en",
189 "date_created": time.strftime("%FT%TZ%z", time.localtime(time.time()))}
190 if inputfile is not None:
191 self.parse(inputfile)
192
# NOTE(review): the ``def`` line is missing from this listing; the signature
# is restored from the ``self._read_header(input)`` call and the
# ``header is None`` check below -- confirm against the real file.
def _read_header(self, header=None):
    """Read a UTX header

    The leading run of lines starting with C{#} is the header. All but
    the last of those lines are split on C{;} into the version, the
    language pair, the creation date and optional C{key: value} fields.
    The last C{#} line holds the tab separated column names.

    @param header: the file content, or None to reset to the defaults
    @return: the number of header lines, or None when header is None
    @raise IndexError: if the header has fewer than two C{#} lines or
    fewer than three C{;} separated components
    @raise ValueError: if an optional field has no C{:} separator
    """
    if header is None:
        self._fieldnames = ['src', 'tgt', 'src:pos']
        self._header = {"version": "1.00"}
        return

    header_lines = []
    for line in header.split(UtxDialect.lineterminator):
        if not line.startswith("#"):
            break
        header_lines.append(line)

    components = []
    for line in header_lines[:-1]:
        components += line[1:].split(";")

    # Collect into a local dict so a malformed header does not leave
    # self._header half filled.
    parsed = {"version": components[0].replace("UTX-S ", "").strip()}
    # partition() copes with a monolingual header that has no "/".
    source_lang, _sep, target_lang = components[1].strip().partition("/")
    parsed["source_language"] = source_lang
    parsed["target_language"] = target_lang or None
    parsed["date_created"] = components[2].strip()
    for data in components[3:]:
        data = data.strip()
        if not data:
            # A trailing ";" leaves an empty component behind.
            continue
        # Split on the first ":" only, values such as URLs contain colons.
        key, value = data.split(":", 1)
        parsed[key.strip()] = value.strip()

    self._header = parsed
    # Drop only the leading "#", a "#" inside a column name is data.
    self._fieldnames = header_lines[-1][1:].split('\t')
    return len(header_lines)
220
# NOTE(review): the ``def`` line is missing from this listing; the signature
# is restored from the ``self._write_header()`` call in this file -- confirm
# against the real file.
def _write_header(self):
    """Create a UTX header

    The first line holds the version, the language pair and the creation
    date, followed by any other entries of C{self._header} written as
    C{key: value}. The second line holds the tab separated column names.
    Both lines end with the dialect's line terminator.

    @return: the two header lines as a single string
    @raise KeyError: if C{self._header} lacks the version, source
    language or creation date
    """
    reserved = ("version", "source_language", "target_language",
                "date_created")
    header = "#UTX-S %(version)s; %(src)s/%(tgt)s; %(date)s" % {
        "version": self._header["version"],
        "src": self._header["source_language"],
        # _read_header stores a missing target language as None, which
        # must be written as an empty string, not as the text "None".
        "tgt": self._header.get("target_language") or "",
        "date": self._header["date_created"],
    }
    # items() behaves the same on Python 2 and 3, unlike iteritems().
    extras = ["%s: %s" % (key, value)
              for key, value in self._header.items()
              if key not in reserved]
    if extras:
        header += "; " + "; ".join(extras)
    header += UtxDialect.lineterminator
    header += "#" + "\t".join(self._fieldnames) + UtxDialect.lineterminator
    return header
240
242 return self._header.get("source_language", None)
243
246
248 return self._header.get("target_language", None)
249
252
# NOTE(review): the ``def`` line is missing from this listing; the signature
# is restored from the ``self.parse(inputfile)`` call in this file -- confirm
# against the real file.
def parse(self, input):
    """Parse the given file or file source string

    The header is read first, then every remaining line is read as one
    tab separated record and added to the store as a unit.

    @param input: a file-like object, which is read and then closed, or
    a string holding the file content
    @raise base.ParseError: if the header cannot be read
    """
    if hasattr(input, 'name'):
        self.filename = input.name
    elif not getattr(self, 'filename', ''):
        self.filename = ''
    if hasattr(input, "read"):
        # Close the file even when reading it fails.
        try:
            tmsrc = input.read()
        finally:
            input.close()
        input = tmsrc
    try:
        header_length = self._read_header(input)
    except Exception:
        # Not a bare except, so KeyboardInterrupt and SystemExit are
        # not reported as parse errors.
        raise base.ParseError("Cannot parse header")
    lines = csv.DictReader(input.split(UtxDialect.lineterminator)[header_length:],
                           fieldnames=self._fieldnames,
                           dialect="utx")
    for line in lines:
        newunit = UtxUnit()
        newunit.dict = line
        self.addunit(newunit)
274
276 output = csv.StringIO()
277 writer = csv.DictWriter(output, fieldnames=self._fieldnames,
278 dialect="utx")
279 unit_count = 0
280 for unit in self.units:
281 if unit.istranslated():
282 unit_count += 1
283 writer.writerow(unit.dict)
284 if unit_count == 0:
285 return ""
286 output.reset()
287 return self._write_header() + "".join(output.readlines())
288