Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_SAX_PARSER_HPP
9 #define ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 namespace orcus {
14 
/**
 * Default compile-time configuration for sax_parser.  A custom
 * configuration type can be supplied as the second template argument of
 * sax_parser; it must expose the same static members.
 */
struct sax_parser_default_config
{
    /**
     * When true, the parser requires the stream to start with an XML
     * declaration ('<?xml ... ?>') and throws sax::malformed_xml_error
     * if the first '<' is not followed by '?'.  When false, the parser
     * only verifies that the first non-blank character is '<'.
     */
    static const bool strict_xml_declaration = true;
};
25 
30 template<typename _Handler, typename _Config = sax_parser_default_config>
32 {
33 public:
34  typedef _Handler handler_type;
35  typedef _Config config_type;
36 
37  sax_parser(const char* content, const size_t size, handler_type& handler);
38  ~sax_parser();
39 
40  void parse();
41 
42 private:
43 
48  void header();
49  void body();
50  void element();
51  void element_open(const char* begin_pos);
52  void element_close(const char* begin_pos);
53  void special_tag();
54  void declaration(const char* name_check);
55  void cdata();
56  void doctype();
57  void characters();
58  void attribute();
59 
60 private:
61  handler_type& m_handler;
62 };
63 
64 template<typename _Handler, typename _Config>
66  const char* content, const size_t size, handler_type& handler) :
67  sax::parser_base(content, size),
68  m_handler(handler)
69 {
70 }
71 
72 template<typename _Handler, typename _Config>
74 {
75 }
76 
77 template<typename _Handler, typename _Config>
79 {
80  m_nest_level = 0;
81  mp_char = mp_begin;
82  header();
83  blank();
84  body();
85 
86  assert(m_buffer_pos == 0);
87 }
88 
89 template<typename _Handler, typename _Config>
91 {
92  // we don't handle multi byte encodings so we can just skip bom entry if exists.
93  skip_bom();
94  blank();
95  if (!has_char() || cur_char() != '<')
96  throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
97 
98  if (config_type::strict_xml_declaration)
99  {
100  if (next_char_checked() != '?')
101  throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
102 
103  declaration("xml");
104  }
105 }
106 
107 template<typename _Handler, typename _Config>
109 {
110  while (has_char())
111  {
112  if (cur_char() == '<')
113  {
114  element();
115  if (!m_root_elem_open)
116  // Root element closed. Stop parsing.
117  return;
118  }
119  else if (m_nest_level)
120  // Call characters only when in xml hierarchy.
121  characters();
122  else
123  next();
124  }
125 }
126 
127 template<typename _Handler, typename _Config>
129 {
130  assert(cur_char() == '<');
131  const char* pos = mp_char;
132  char c = next_char_checked();
133  switch (c)
134  {
135  case '/':
136  element_close(pos);
137  break;
138  case '!':
139  special_tag();
140  break;
141  case '?':
142  declaration(NULL);
143  break;
144  default:
145  if (!is_alpha(c))
146  throw sax::malformed_xml_error("expected an alphabet.", offset());
147  element_open(pos);
148  }
149 }
150 
151 template<typename _Handler, typename _Config>
152 void sax_parser<_Handler,_Config>::element_open(const char* begin_pos)
153 {
154  assert(is_alpha(cur_char()));
155 
156  sax::parser_element elem;
157  element_name(elem, begin_pos);
158 
159  while (true)
160  {
161  blank();
162  char c = cur_char();
163  if (c == '/')
164  {
165  // Self-closing element: <element/>
166  if (next_and_char() != '>')
167  throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
168  next();
169  elem.end_pos = mp_char;
170  m_handler.start_element(elem);
171  reset_buffer_pos();
172  m_handler.end_element(elem);
173 #if ORCUS_DEBUG_SAX_PARSER
174  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
175 #endif
176  return;
177  }
178  else if (c == '>')
179  {
180  // End of opening element: <element>
181  next();
182  elem.end_pos = mp_char;
183  nest_up();
184  m_handler.start_element(elem);
185  reset_buffer_pos();
186 #if ORCUS_DEBUG_SAX_PARSER
187  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
188 #endif
189  return;
190  }
191  else
192  attribute();
193  }
194 }
195 
196 template<typename _Handler, typename _Config>
197 void sax_parser<_Handler,_Config>::element_close(const char* begin_pos)
198 {
199  assert(cur_char() == '/');
200  nest_down();
201  next_check();
202  sax::parser_element elem;
203  element_name(elem, begin_pos);
204 
205  if (cur_char() != '>')
206  throw sax::malformed_xml_error("expected '>' to close the element.", offset());
207  next();
208  elem.end_pos = mp_char;
209 
210  m_handler.end_element(elem);
211 #if ORCUS_DEBUG_SAX_PARSER
212  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
213 #endif
214  if (!m_nest_level)
215  m_root_elem_open = false;
216 }
217 
218 template<typename _Handler, typename _Config>
220 {
221  assert(cur_char() == '!');
222  // This can be either <![CDATA, <!--, or <!DOCTYPE.
223  size_t len = remains();
224  if (len < 2)
225  throw sax::malformed_xml_error("special tag too short.", offset());
226 
227  switch (next_and_char())
228  {
229  case '-':
230  {
231  // Possibly comment.
232  if (next_and_char() != '-')
233  throw sax::malformed_xml_error("comment expected.", offset());
234 
235  len -= 2;
236  if (len < 3)
237  throw sax::malformed_xml_error("malformed comment.", offset());
238 
239  next();
240  comment();
241  }
242  break;
243  case '[':
244  {
245  // Possibly a CDATA.
246  expects_next("CDATA[", 6);
247  if (has_char())
248  cdata();
249  }
250  break;
251  case 'D':
252  {
253  // check if this is a DOCTYPE.
254  expects_next("OCTYPE", 6);
255  blank();
256  if (has_char())
257  doctype();
258  }
259  break;
260  default:
261  throw sax::malformed_xml_error("failed to parse special tag.", offset());
262  }
263 }
264 
265 template<typename _Handler, typename _Config>
266 void sax_parser<_Handler,_Config>::declaration(const char* name_check)
267 {
268  assert(cur_char() == '?');
269  next_check();
270 
271  // Get the declaration name first.
272  pstring decl_name;
273  name(decl_name);
274 #if ORCUS_DEBUG_SAX_PARSER
275  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
276 #endif
277 
278  if (name_check && decl_name != name_check)
279  {
280  std::ostringstream os;
281  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
282  throw sax::malformed_xml_error(os.str(), offset());
283  }
284 
285  m_handler.start_declaration(decl_name);
286  blank();
287 
288  // Parse the attributes.
289  while (cur_char_checked() != '?')
290  {
291  attribute();
292  blank();
293  }
294  if (next_char_checked() != '>')
295  throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
296 
297  m_handler.end_declaration(decl_name);
298  reset_buffer_pos();
299  next();
300 #if ORCUS_DEBUG_SAX_PARSER
301  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
302 #endif
303 }
304 
305 template<typename _Handler, typename _Config>
307 {
308  size_t len = remains();
309  assert(len > 3);
310 
311  // Parse until we reach ']]>'.
312  const char* p0 = mp_char;
313  size_t i = 0, match = 0;
314  for (char c = cur_char(); i < len; ++i, c = next_and_char())
315  {
316  if (c == ']')
317  {
318  // Be aware that we may encounter a series of more than two ']'
319  // characters, in which case we'll only count the last two.
320 
321  if (match == 0)
322  // First ']'
323  ++match;
324  else if (match == 1)
325  // Second ']'
326  ++match;
327  }
328  else if (c == '>' && match == 2)
329  {
330  // Found ']]>'.
331  size_t cdata_len = i - 2;
332  m_handler.characters(pstring(p0, cdata_len), false);
333  next();
334  return;
335  }
336  else
337  match = 0;
338  }
339  throw sax::malformed_xml_error("malformed CDATA section.", offset());
340 }
341 
342 template<typename _Handler, typename _Config>
344 {
345  // Parse the root element first.
347  name(param.root_element);
348  blank();
349 
350  // Either PUBLIC or SYSTEM.
351  size_t len = remains();
352  if (len < 6)
353  throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
354 
355  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
356  char c = cur_char();
357  if (c == 'P')
358  {
359  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
360  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
361 
362  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
363  }
364  else if (c == 'S')
365  {
366  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
367  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
368  }
369 
370  next_check();
371  blank();
372  has_char_throw("DOCTYPE section too short.");
373 
374  // Parse FPI.
375  value(param.fpi, false);
376 
377  has_char_throw("DOCTYPE section too short.");
378  blank();
379  has_char_throw("DOCTYPE section too short.");
380 
381  if (cur_char() == '>')
382  {
383  // Optional URI not given. Exit.
384 #if ORCUS_DEBUG_SAX_PARSER
385  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
386 #endif
387  m_handler.doctype(param);
388  next();
389  return;
390  }
391 
392  // Parse optional URI.
393  value(param.uri, false);
394 
395  has_char_throw("DOCTYPE section too short.");
396  blank();
397  has_char_throw("DOCTYPE section too short.");
398 
399  if (cur_char() != '>')
400  throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
401 
402 #if ORCUS_DEBUG_SAX_PARSER
403  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
404 #endif
405  m_handler.doctype(param);
406  next();
407 }
408 
409 template<typename _Handler, typename _Config>
411 {
412  const char* p0 = mp_char;
413  for (; has_char(); next())
414  {
415  if (cur_char() == '<')
416  break;
417 
418  if (cur_char() == '&')
419  {
420  // Text span with one or more encoded characters. Parse using cell buffer.
421  cell_buffer& buf = get_cell_buffer();
422  buf.reset();
423  buf.append(p0, mp_char-p0);
424  characters_with_encoded_char(buf);
425  if (buf.empty())
426  m_handler.characters(pstring(), false);
427  else
428  m_handler.characters(pstring(buf.get(), buf.size()), true);
429  return;
430  }
431  }
432 
433  if (mp_char > p0)
434  {
435  pstring val(p0, mp_char-p0);
436  m_handler.characters(val, false);
437  }
438 }
439 
440 template<typename _Handler, typename _Config>
442 {
444  pstring attr_ns_name, attr_name, attr_value;
445  attribute_name(attr.ns, attr.name);
446 
447 #if ORCUS_DEBUG_SAX_PARSER
448  std::ostringstream os;
449  os << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'";
450 #endif
451 
452  char c = cur_char();
453  if (c != '=')
454  {
455  std::ostringstream os;
456  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
457  throw sax::malformed_xml_error(os.str(), offset());
458  }
459 
460  next_check();
461  attr.transient = value(attr.value, true);
462  if (attr.transient)
463  // Value is stored in a temporary buffer. Push a new buffer.
464  inc_buffer_pos();
465 
466 #if ORCUS_DEBUG_SAX_PARSER
467  os << " value='" << attr.value << "'" << endl;
468  cout << os.str();
469 #endif
470 
471  m_handler.attribute(attr);
472 }
473 
474 }
475 
476 #endif
477 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: pstring.hpp:24
Definition: cell_buffer.hpp:21
Definition: sax_parser_base.hpp:33
Definition: sax_parser.hpp:15
static const bool strict_xml_declaration
Definition: sax_parser.hpp:23
Definition: sax_parser_base.hpp:87
Definition: sax_parser_base.hpp:72
Definition: sax_parser_base.hpp:45
Definition: base64.hpp:15
Definition: sax_parser.hpp:31
Definition: sax_parser_base.hpp:95