001/**************************************************************** 002 * Licensed to the Apache Software Foundation (ASF) under one * 003 * or more contributor license agreements. See the NOTICE file * 004 * distributed with this work for additional information * 005 * regarding copyright ownership. The ASF licenses this file * 006 * to you under the Apache License, Version 2.0 (the * 007 * "License"); you may not use this file except in compliance * 008 * with the License. You may obtain a copy of the License at * 009 * * 010 * http://www.apache.org/licenses/LICENSE-2.0 * 011 * * 012 * Unless required by applicable law or agreed to in writing, * 013 * software distributed under the License is distributed on an * 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 015 * KIND, either express or implied. See the License for the * 016 * specific language governing permissions and limitations * 017 * under the License. * 018 ****************************************************************/ 019 020package org.apache.james.mime4j.stream; 021 022import java.util.ArrayList; 023import java.util.BitSet; 024import java.util.List; 025 026import org.apache.james.mime4j.MimeException; 027import org.apache.james.mime4j.util.ByteSequence; 028import org.apache.james.mime4j.util.CharsetUtil; 029import org.apache.james.mime4j.util.ContentUtil; 030 031/** 032 * Low level parser for header field elements. The parsing routines of this class are designed 033 * to produce near zero intermediate garbage and make no intermediate copies of input data. 034 * <p/> 035 * This class is immutable and thread safe. 036 */ 037public class RawFieldParser { 038 039 public static BitSet INIT_BITSET(int ... b) { 040 BitSet bitset = new BitSet(b.length); 041 for (int i = 0; i < b.length; i++) { 042 bitset.set(b[i]); 043 } 044 return bitset; 045 } 046 047 static final BitSet COLON = INIT_BITSET(':'); 048 static final BitSet EQUAL_OR_SEMICOLON = INIT_BITSET('=', ';'); 049 static final BitSet SEMICOLON = INIT_BITSET(';'); 050 051 public static final RawFieldParser DEFAULT = new RawFieldParser(); 052 053 /** 054 * Parses the sequence of bytes into {@link RawField}. 055 * 056 * @throws MimeException if the input data does not contain a valid MIME field. 057 */ 058 public RawField parseField(final ByteSequence raw) throws MimeException { 059 if (raw == null) { 060 return null; 061 } 062 ParserCursor cursor = new ParserCursor(0, raw.length()); 063 String name = parseToken(raw, cursor, COLON); 064 if (cursor.atEnd()) { 065 throw new MimeException("Invalid MIME field: no name/value separator found: " + 066 raw.toString()); 067 } 068 return new RawField(raw, cursor.getPos(), name, null); 069 } 070 071 /** 072 * Parses the field body containing a value with parameters into {@link RawBody}. 073 * 074 * @param field unstructured (raw) field 075 */ 076 public RawBody parseRawBody(final RawField field) { 077 ByteSequence buf = field.getRaw(); 078 int pos = field.getDelimiterIdx() + 1; 079 if (buf == null) { 080 String body = field.getBody(); 081 if (body == null) { 082 return new RawBody("", null); 083 } 084 buf = ContentUtil.encode(body); 085 pos = 0; 086 } 087 ParserCursor cursor = new ParserCursor(pos, buf.length()); 088 return parseRawBody(buf, cursor); 089 } 090 091 /** 092 * Parses the sequence of bytes containing a value with parameters into {@link RawBody}. 093 * 094 * @param buf buffer with the sequence of bytes to be parsed 095 * @param cursor defines the bounds and current position of the buffer 096 */ 097 public RawBody parseRawBody(final ByteSequence buf, final ParserCursor cursor) { 098 String value = parseToken(buf, cursor, SEMICOLON); 099 if (cursor.atEnd()) { 100 return new RawBody(value, new ArrayList<NameValuePair>()); 101 } 102 cursor.updatePos(cursor.getPos() + 1); 103 List<NameValuePair> params = parseParameters(buf, cursor); 104 return new RawBody(value, params); 105 } 106 107 /** 108 * Parses the sequence of bytes containing field parameters delimited with semicolon into 109 * a list of {@link NameValuePair}s. 110 * 111 * @param buf buffer with the sequence of bytes to be parsed 112 * @param cursor defines the bounds and current position of the buffer 113 */ 114 public List<NameValuePair> parseParameters(final ByteSequence buf, final ParserCursor cursor) { 115 List<NameValuePair> params = new ArrayList<NameValuePair>(); 116 skipWhiteSpace(buf, cursor); 117 while (!cursor.atEnd()) { 118 NameValuePair param = parseParameter(buf, cursor); 119 params.add(param); 120 } 121 return params; 122 } 123 124 /** 125 * Parses the sequence of bytes containing a field parameter delimited with semicolon into 126 * {@link NameValuePair}. 127 * 128 * @param buf buffer with the sequence of bytes to be parsed 129 * @param cursor defines the bounds and current position of the buffer 130 */ 131 public NameValuePair parseParameter(final ByteSequence buf, final ParserCursor cursor) { 132 String name = parseToken(buf, cursor, EQUAL_OR_SEMICOLON); 133 if (cursor.atEnd()) { 134 return new NameValuePair(name, null); 135 } 136 int delim = buf.byteAt(cursor.getPos()); 137 cursor.updatePos(cursor.getPos() + 1); 138 if (delim == ';') { 139 return new NameValuePair(name, null); 140 } 141 String value = parseValue(buf, cursor, SEMICOLON); 142 if (!cursor.atEnd()) { 143 cursor.updatePos(cursor.getPos() + 1); 144 } 145 return new NameValuePair(name, value); 146 } 147 148 /** 149 * Extracts from the sequence of bytes a token terminated with any of the given delimiters 150 * discarding semantically insignificant whitespace characters and comments. 151 * 152 * @param buf buffer with the sequence of bytes to be parsed 153 * @param cursor defines the bounds and current position of the buffer 154 * @param delimiters set of delimiting characters. Can be <code>null</code> if the token 155 * is not delimited by any character. 156 */ 157 public String parseToken(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) { 158 StringBuilder dst = new StringBuilder(); 159 boolean whitespace = false; 160 while (!cursor.atEnd()) { 161 char current = (char) (buf.byteAt(cursor.getPos()) & 0xff); 162 if (delimiters != null && delimiters.get(current)) { 163 break; 164 } else if (CharsetUtil.isWhitespace(current)) { 165 skipWhiteSpace(buf, cursor); 166 whitespace = true; 167 } else if (current == '(') { 168 skipComment(buf, cursor); 169 } else { 170 if (dst.length() > 0 && whitespace) { 171 dst.append(' '); 172 } 173 copyContent(buf, cursor, delimiters, dst); 174 whitespace = false; 175 } 176 } 177 return dst.toString(); 178 } 179 180 /** 181 * Extracts from the sequence of bytes a value which can be enclosed in quote marks and 182 * terminated with any of the given delimiters discarding semantically insignificant 183 * whitespace characters and comments. 184 * 185 * @param buf buffer with the sequence of bytes to be parsed 186 * @param cursor defines the bounds and current position of the buffer 187 * @param delimiters set of delimiting characters. Can be <code>null</code> if the value 188 * is not delimited by any character. 189 */ 190 public String parseValue(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) { 191 StringBuilder dst = new StringBuilder(); 192 boolean whitespace = false; 193 while (!cursor.atEnd()) { 194 char current = (char) (buf.byteAt(cursor.getPos()) & 0xff); 195 if (delimiters != null && delimiters.get(current)) { 196 break; 197 } else if (CharsetUtil.isWhitespace(current)) { 198 skipWhiteSpace(buf, cursor); 199 whitespace = true; 200 } else if (current == '(') { 201 skipComment(buf, cursor); 202 } else if (current == '\"') { 203 if (dst.length() > 0 && whitespace) { 204 dst.append(' '); 205 } 206 copyQuotedContent(buf, cursor, dst); 207 whitespace = false; 208 } else { 209 if (dst.length() > 0 && whitespace) { 210 dst.append(' '); 211 } 212 copyContent(buf, cursor, delimiters, dst); 213 whitespace = false; 214 } 215 } 216 return dst.toString(); 217 } 218 219 /** 220 * Skips semantically insignificant whitespace characters and moves the cursor to the closest 221 * non-whitespace character. 222 * 223 * @param buf buffer with the sequence of bytes to be parsed 224 * @param cursor defines the bounds and current position of the buffer 225 */ 226 public void skipWhiteSpace(final ByteSequence buf, final ParserCursor cursor) { 227 int pos = cursor.getPos(); 228 int indexFrom = cursor.getPos(); 229 int indexTo = cursor.getUpperBound(); 230 for (int i = indexFrom; i < indexTo; i++) { 231 char current = (char) (buf.byteAt(i) & 0xff); 232 if (!CharsetUtil.isWhitespace(current)) { 233 break; 234 } else { 235 pos++; 236 } 237 } 238 cursor.updatePos(pos); 239 } 240 241 /** 242 * Skips semantically insignificant content if the current position is positioned at the 243 * beginning of a comment and moves the cursor past the end of the comment. 244 * Nested comments and escaped characters are recognized and handled appropriately. 245 * 246 * @param buf buffer with the sequence of bytes to be parsed 247 * @param cursor defines the bounds and current position of the buffer 248 */ 249 public void skipComment(final ByteSequence buf, final ParserCursor cursor) { 250 if (cursor.atEnd()) { 251 return; 252 } 253 int pos = cursor.getPos(); 254 int indexFrom = cursor.getPos(); 255 int indexTo = cursor.getUpperBound(); 256 char current = (char) (buf.byteAt(pos) & 0xff); 257 if (current != '(') { 258 return; 259 } 260 pos++; 261 indexFrom++; 262 263 int level = 1; 264 boolean escaped = false; 265 for (int i = indexFrom; i < indexTo; i++, pos++) { 266 current = (char) (buf.byteAt(i) & 0xff); 267 if (escaped) { 268 escaped = false; 269 } else { 270 if (current == '\\') { 271 escaped = true; 272 } else if (current == '(') { 273 level++; 274 } else if (current == ')') { 275 level--; 276 } 277 } 278 if (level <= 0) { 279 pos++; 280 break; 281 } 282 } 283 cursor.updatePos(pos); 284 } 285 286 /** 287 * Skips semantically insignificant whitespace characters and comments and moves the cursor 288 * to the closest semantically significant non-whitespace character. 289 * Nested comments and escaped characters are recognized and handled appropriately. 290 * 291 * @param buf buffer with the sequence of bytes to be parsed 292 * @param cursor defines the bounds and current position of the buffer 293 */ 294 public void skipAllWhiteSpace(final ByteSequence buf, final ParserCursor cursor) { 295 while (!cursor.atEnd()) { 296 char current = (char) (buf.byteAt(cursor.getPos()) & 0xff); 297 if (CharsetUtil.isWhitespace(current)) { 298 skipWhiteSpace(buf, cursor); 299 } else if (current == '(') { 300 skipComment(buf, cursor); 301 } else { 302 break; 303 } 304 } 305 } 306 307 /** 308 * Transfers content into the destination buffer until a whitespace character, a comment, 309 * or any of the given delimiters is encountered. 310 * 311 * @param buf buffer with the sequence of bytes to be parsed 312 * @param cursor defines the bounds and current position of the buffer 313 * @param delimiters set of delimiting characters. Can be <code>null</code> if the value 314 * is delimited by a whitespace or a comment only. 315 * @param dst destination buffer 316 */ 317 public void copyContent(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters, 318 final StringBuilder dst) { 319 int pos = cursor.getPos(); 320 int indexFrom = cursor.getPos(); 321 int indexTo = cursor.getUpperBound(); 322 for (int i = indexFrom; i < indexTo; i++) { 323 char current = (char) (buf.byteAt(i) & 0xff); 324 if ((delimiters != null && delimiters.get(current)) 325 || CharsetUtil.isWhitespace(current) || current == '(') { 326 break; 327 } else { 328 pos++; 329 dst.append(current); 330 } 331 } 332 cursor.updatePos(pos); 333 } 334 335 /** 336 * Transfers content enclosed with quote marks into the destination buffer. 337 * 338 * @param buf buffer with the sequence of bytes to be parsed 339 * @param cursor defines the bounds and current position of the buffer 340 * @param dst destination buffer 341 */ 342 public void copyQuotedContent(final ByteSequence buf, final ParserCursor cursor, 343 final StringBuilder dst) { 344 if (cursor.atEnd()) { 345 return; 346 } 347 int pos = cursor.getPos(); 348 int indexFrom = cursor.getPos(); 349 int indexTo = cursor.getUpperBound(); 350 char current = (char) (buf.byteAt(pos) & 0xff); 351 if (current != '\"') { 352 return; 353 } 354 pos++; 355 indexFrom++; 356 boolean escaped = false; 357 for (int i = indexFrom; i < indexTo; i++, pos++) { 358 current = (char) (buf.byteAt(i) & 0xff); 359 if (escaped) { 360 if (current != '\"' && current != '\\') { 361 dst.append('\\'); 362 } 363 dst.append(current); 364 escaped = false; 365 } else { 366 if (current == '\"') { 367 pos++; 368 break; 369 } 370 if (current == '\\') { 371 escaped = true; 372 } else if (current != '\r' && current != '\n') { 373 dst.append(current); 374 } 375 } 376 } 377 cursor.updatePos(pos); 378 } 379 380}