Orcus
csv_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_CSV_PARSER_HPP
9 #define ORCUS_CSV_PARSER_HPP
10 
11 #include "csv_parser_base.hpp"
12 
13 namespace orcus {
14 
15 template<typename _Handler>
17 {
18 public:
19  typedef _Handler handler_type;
20 
21  csv_parser(const char* p, size_t n, handler_type& hdl, const csv::parser_config& config);
22  void parse();
23 
24 private:
25 
26  // handlers
27  void row();
28  void cell();
29  void quoted_cell();
30 
31  void parse_cell_with_quote(const char* p0, size_t len0);
32 
36  void push_cell_value(const char* p, size_t n);
37 
38 private:
39  handler_type& m_handler;
40 };
41 
42 template<typename _Handler>
44  const char* p, size_t n, handler_type& hdl, const csv::parser_config& config) :
45  csv::parser_base(p, n, config), m_handler(hdl) {}
46 
47 template<typename _Handler>
49 {
50 #if ORCUS_DEBUG_CSV
51  const char* p = mp_char;
52  for (size_t i = m_pos; i < m_length; ++i, ++p)
53  std::cout << *p;
54  std::cout << std::endl;
55 #endif
56 
57  m_handler.begin_parse();
58  while (has_char())
59  row();
60  m_handler.end_parse();
61 }
62 
63 template<typename _Handler>
65 {
66  m_handler.begin_row();
67  while (true)
68  {
69  if (is_text_qualifier(cur_char()))
70  quoted_cell();
71  else
72  cell();
73 
74  if (!has_char())
75  {
76  m_handler.end_row();
77  return;
78  }
79 
80  char c = cur_char();
81  if (c == '\n')
82  {
83  next();
84 #if ORCUS_DEBUG_CSV
85  cout << "(LF)" << endl;
86 #endif
87  m_handler.end_row();
88  return;
89  }
90 
91  assert(is_delim(c));
92  next();
93 
94  if (m_config.trim_cell_value)
95  skip_blanks();
96  }
97 }
98 
99 template<typename _Handler>
101 {
102  const char* p = mp_char;
103  size_t len = 0;
104  char c = cur_char();
105  while (c != '\n' && !is_delim(c))
106  {
107  ++len;
108  next();
109  if (!has_char())
110  break;
111  c = cur_char();
112  }
113 
114  if (!len)
115  p = NULL;
116 
117  push_cell_value(p, len);
118 }
119 
120 template<typename _Handler>
122 {
123 #if ORCUS_DEBUG_CSV
124  cout << "--- quoted cell" << endl;
125 #endif
126  char c = cur_char();
127  assert(is_text_qualifier(c));
128  next(); // Skip the opening quote.
129  if (!has_char())
130  return;
131 
132  const char* p0 = mp_char;
133  size_t len = 1;
134  for (; has_char(); next(), ++len)
135  {
136  c = cur_char();
137 #if ORCUS_DEBUG_CSV
138  cout << "'" << c << "'" << endl;
139 #endif
140  if (!is_text_qualifier(c))
141  continue;
142 
143  // current char is a quote. Check if the next char is also a text
144  // qualifier.
145 
146  if (has_next() && is_text_qualifier(next_char()))
147  {
148  next();
149  parse_cell_with_quote(p0, len);
150  return;
151  }
152 
153  // Closing quote.
154  m_handler.cell(p0, len-1);
155  next();
156  skip_blanks();
157  return;
158  }
159 
160  // Stream ended prematurely. Handle it gracefully.
161  m_handler.cell(p0, len);
162  next();
163  skip_blanks();
164 }
165 
166 template<typename _Handler>
167 void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
168 {
169 #if ORCUS_DEBUG_CSV
170  using namespace std;
171  cout << "--- parse cell with quote" << endl;
172 #endif
173  assert(is_text_qualifier(cur_char()));
174 
175  // Push the preceding chars to the temp buffer.
176  m_cell_buf.reset();
177  m_cell_buf.append(p0, len0);
178 
179  // Parse the rest, until the closing quote.
180  next();
181  const char* p_cur = mp_char;
182  size_t cur_len = 0;
183  for (; has_char(); next(), ++cur_len)
184  {
185  char c = cur_char();
186 #if ORCUS_DEBUG_CSV
187  cout << "'" << c << "'" << endl;
188 #endif
189  if (!is_text_qualifier(c))
190  continue;
191 
192  if (has_next() && is_text_qualifier(next_char()))
193  {
194  // double quotation. Copy the current segment to the cell buffer.
195  m_cell_buf.append(p_cur, cur_len);
196 
197  next(); // to the 2nd quote.
198  p_cur = mp_char;
199  cur_len = 0;
200  continue;
201  }
202 
203  // closing quote. Flush the current segment to the cell
204  // buffer, push the value to the handler, and exit normally.
205  m_cell_buf.append(p_cur, cur_len);
206 
207  m_handler.cell(m_cell_buf.get(), m_cell_buf.size());
208  next();
209  skip_blanks();
210  return;
211  }
212 
213  // Stream ended prematurely.
214  throw csv::parse_error("stream ended prematurely while parsing quoted cell.");
215 }
216 
217 template<typename _Handler>
218 void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
219 {
220  size_t len = n;
221 
222  if (m_config.trim_cell_value)
223  {
224  // Trim any leading blanks.
225  for (size_t i = 0; i < n; ++i, --len, ++p)
226  {
227  if (!is_blank(*p))
228  break;
229  }
230 
231  // Trim any trailing blanks.
232  if (len)
233  {
234  const char* p_end = p + (len-1);
235  for (; p != p_end; --p_end, --len)
236  {
237  if (!is_blank(*p_end))
238  break;
239  }
240  }
241  }
242 
243  m_handler.cell(p, len);
244 #if ORCUS_DEBUG_CSV
245  if (len)
246  cout << "(cell:'" << std::string(p, len) << "')" << endl;
247  else
248  cout << "(cell:'')" << endl;
249 #endif
250 }
251 
252 }
253 
254 #endif
255 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: csv_parser_base.hpp:51
Definition: csv_parser_base.hpp:42
Definition: csv_parser.hpp:16
Definition: config.hpp:17
bool is_blank(char c) const
Definition: base64.hpp:15
Definition: csv_parser_base.hpp:33