001/****************************************************************
002 * Licensed to the Apache Software Foundation (ASF) under one   *
003 * or more contributor license agreements.  See the NOTICE file *
004 * distributed with this work for additional information        *
005 * regarding copyright ownership.  The ASF licenses this file   *
006 * to you under the Apache License, Version 2.0 (the            *
007 * "License"); you may not use this file except in compliance   *
008 * with the License.  You may obtain a copy of the License at   *
009 *                                                              *
010 *   http://www.apache.org/licenses/LICENSE-2.0                 *
011 *                                                              *
012 * Unless required by applicable law or agreed to in writing,   *
013 * software distributed under the License is distributed on an  *
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015 * KIND, either express or implied.  See the License for the    *
016 * specific language governing permissions and limitations      *
017 * under the License.                                           *
018 ****************************************************************/
019
020package org.apache.james.mime4j.stream;
021
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.nio.charset.Charset;
027import java.util.LinkedList;
028
029import org.apache.james.mime4j.MimeException;
030import org.apache.james.mime4j.codec.DecodeMonitor;
031import org.apache.james.mime4j.io.LineNumberInputStream;
032import org.apache.james.mime4j.io.LineNumberSource;
033import org.apache.james.mime4j.util.CharsetUtil;
034
035/**
036 * <p>
037 * Parses MIME (or RFC822) message streams of bytes or characters.
038 * The stream is converted into an event stream.
039 * <p>
040 * <p>
041 * Typical usage:
042 * </p>
043 * <pre>
044 *      MimeTokenStream stream = new MimeTokenStream();
045 *      InputStream instream = new FileInputStream("mime.msg");
046 *      try {
047 *          stream.parse(instream);
048 *          for (int state = stream.getState();
049 *              state != MimeTokenStream.T_END_OF_STREAM;
050 *              state = stream.next()) {
051 *              switch (state) {
052 *              case MimeTokenStream.T_BODY:
053 *                  System.out.println("Body detected, contents = "
054 *                  + stream.getInputStream() + ", header data = "
055 *                  + stream.getBodyDescriptor());
056 *                  break;
057 *              case MimeTokenStream.T_FIELD:
058 *                  System.out.println("Header field detected: "
059 *                  + stream.getField());
060 *                  break;
061 *              case MimeTokenStream.T_START_MULTIPART:
062 *                  System.out.println("Multipart message detexted,"
063 *                  + " header data = "
064 *                  + stream.getBodyDescriptor());
065 *              ...
066 *              }
067 *          }
068 *      } finally {
069 *          instream.close();
070 *      }
071 * </pre>
072 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
073 * method {@link #parse(InputStream)} resets the token streams internal
074 * state. However, they are definitely <em>not</em> thread safe. If you
075 * have a multi threaded application, then the suggested use is to have
076 * one instance per thread.</p>
077 */
078public class MimeTokenStream {
079
080    private final MimeConfig config;
081    private final DecodeMonitor monitor;
082    private final FieldBuilder fieldBuilder;
083    private final BodyDescriptorBuilder bodyDescBuilder;
084    private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>();
085
086    private EntityState state = EntityState.T_END_OF_STREAM;
087    private EntityStateMachine currentStateMachine;
088    private RecursionMode recursionMode = RecursionMode.M_RECURSE;
089    private MimeEntity rootentity;
090
091    /**
092     * Constructs a standard (lax) stream.
093     * Optional validation events will be logged only.
094     * Use {@link MimeConfig#setStrictParsing(boolean)} to turn on strict
095     * parsing mode and pass the config object to
096     * {@link MimeTokenStream#MimeTokenStream(MimeConfig)} to create
097     * a stream that strictly validates the input.
098     */
099    public MimeTokenStream() {
100        this(null);
101    }
102
103    public MimeTokenStream(final MimeConfig config) {
104        this(config, null, null, null);
105    }
106
107    public MimeTokenStream(
108            final MimeConfig config,
109            final BodyDescriptorBuilder bodyDescBuilder) {
110        this(config, null, null, bodyDescBuilder);
111    }
112
113    public MimeTokenStream(
114            final MimeConfig config,
115            final DecodeMonitor monitor,
116            final BodyDescriptorBuilder bodyDescBuilder) {
117        this(config, monitor, null, bodyDescBuilder);
118    }
119
120    public MimeTokenStream(
121            final MimeConfig config,
122            final DecodeMonitor monitor,
123            final FieldBuilder fieldBuilder,
124            final BodyDescriptorBuilder bodyDescBuilder) {
125        super();
126        this.config = config != null ? config : new MimeConfig();
127        this.fieldBuilder = fieldBuilder != null ? fieldBuilder :
128            new DefaultFieldBuilder(this.config.getMaxHeaderLen());
129        this.monitor = monitor != null ? monitor :
130            (this.config.isStrictParsing() ? DecodeMonitor.STRICT : DecodeMonitor.SILENT);
131        this.bodyDescBuilder = bodyDescBuilder != null ? bodyDescBuilder :
132            new FallbackBodyDescriptorBuilder();
133    }
134
135    /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
136     * If the {@code MimeTokenStream} has already been in use, resets the streams
137     * internal state.
138     */
139    public void parse(InputStream stream) {
140        doParse(stream, EntityState.T_START_MESSAGE);
141    }
142
143    /**
144     * <p>Instructs the {@code MimeTokenStream} to parse the given content with
145     * the content type. The message stream is assumed to have no message header
146     * and is expected to begin with a message body. This can be the case when
147     * the message content is transmitted using a different transport protocol
148     * such as HTTP.</p>
149     * <p>If the {@code MimeTokenStream} has already been in use, resets the
150     * streams internal state.</p>
151     * @return a parsed Field representing the input contentType
152     */
153    public Field parseHeadless(InputStream stream, String contentType) {
154        if (contentType == null) {
155            throw new IllegalArgumentException("Content type may not be null");
156        }
157        Field newContentType;
158        try {
159            RawField rawContentType = new RawField("Content-Type", contentType);
160            newContentType = bodyDescBuilder.addField(rawContentType);
161            if (newContentType == null) newContentType = rawContentType;
162        } catch (MimeException ex) {
163            // should never happen
164            throw new IllegalArgumentException(ex.getMessage());
165        }
166
167        doParse(stream, EntityState.T_END_HEADER);
168        try {
169            next();
170        } catch (IOException e) {
171            // Should never happend: the first next after END_HEADER does not produce IO
172            throw new IllegalStateException(e);
173        } catch (MimeException e) {
174            // This should never happen
175            throw new IllegalStateException(e);
176        }
177        return newContentType;
178    }
179
180    private void doParse(InputStream stream, EntityState start) {
181        LineNumberSource lineSource = null;
182        if (config.isCountLineNumbers()) {
183            LineNumberInputStream lineInput = new LineNumberInputStream(stream);
184            lineSource = lineInput;
185            stream = lineInput;
186        }
187
188        rootentity = new MimeEntity(
189                lineSource,
190                stream,
191                config,
192                start,
193                EntityState.T_END_MESSAGE,
194                monitor,
195                fieldBuilder,
196                bodyDescBuilder);
197
198        rootentity.setRecursionMode(recursionMode);
199        currentStateMachine = rootentity;
200        entities.clear();
201        entities.add(currentStateMachine);
202        state = currentStateMachine.getState();
203    }
204
205    /**
206     * Determines if this parser is currently in raw mode.
207     *
208     * @return <code>true</code> if in raw mode, <code>false</code>
209     *         otherwise.
210     * @see #setRecursionMode(RecursionMode)
211     */
212    public boolean isRaw() {
213        return recursionMode == RecursionMode.M_RAW;
214    }
215
216    /**
217     * Gets the current recursion mode.
218     * The recursion mode specifies the approach taken to parsing parts.
219     * {@link RecursionMode#M_RAW}  mode does not parse the part at all.
220     * {@link RecursionMode#M_RECURSE} mode recursively parses each mail
221     * when an <code>message/rfc822</code> part is encountered;
222     * {@link RecursionMode#M_NO_RECURSE} does not.
223     * @return {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or
224     *   {@link RecursionMode#M_NO_RECURSE}
225     */
226    public RecursionMode getRecursionMode() {
227        return recursionMode;
228    }
229
230    /**
231     * Sets the current recursion.
232     * The recursion mode specifies the approach taken to parsing parts.
233     * {@link RecursionMode#M_RAW}  mode does not parse the part at all.
234     * {@link RecursionMode#M_RECURSE} mode recursively parses each mail
235     * when an <code>message/rfc822</code> part is encountered;
236     * {@link RecursionMode#M_NO_RECURSE} does not.
237     * @param mode {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or
238     *   {@link RecursionMode#M_NO_RECURSE}
239     */
240    public void setRecursionMode(RecursionMode mode) {
241        recursionMode = mode;
242        if (currentStateMachine != null) {
243            currentStateMachine.setRecursionMode(mode);
244        }
245    }
246
247    /**
248     * Finishes the parsing and stops reading lines.
249     * NOTE: No more lines will be parsed but the parser
250     * will still trigger 'end' events to match previously
251     * triggered 'start' events.
252     */
253    public void stop() {
254        rootentity.stop();
255    }
256
257    /**
258     * Returns the current state.
259     */
260    public EntityState getState() {
261        return state;
262    }
263
264    /**
265     * This method returns the raw entity, preamble, or epilogue contents.
266     * <p/>
267     * This method is valid, if {@link #getState()} returns either of
268     * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or
269     * {@link EntityState#T_EPILOGUE}.
270     *
271     * @return Data stream, depending on the current state.
272     * @throws IllegalStateException {@link #getState()} returns an
273     *   invalid value.
274     */
275    public InputStream getInputStream() {
276        return currentStateMachine.getContentStream();
277    }
278
279    /**
280     * This method returns a transfer decoded stream based on the MIME
281     * fields with the standard defaults.
282     * <p/>
283     * This method is valid, if {@link #getState()} returns either of
284     * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or
285     * {@link EntityState#T_EPILOGUE}.
286     *
287     * @return Data stream, depending on the current state.
288     * @throws IllegalStateException {@link #getState()} returns an
289     *   invalid value.
290     */
291    public InputStream getDecodedInputStream() {
292        return currentStateMachine.getDecodedContentStream();
293    }
294
295    /**
296     * Gets a reader configured for the current body or body part.
297     * The reader will return a transfer and charset decoded
298     * stream of characters based on the MIME fields with the standard
299     * defaults.
300     * This is a conveniance method and relies on {@link #getInputStream()}.
301     * Consult the javadoc for that method for known limitations.
302     *
303     * @return <code>Reader</code>, not null
304     * @see #getInputStream
305     * @throws IllegalStateException {@link #getState()} returns an
306     *   invalid value
307     * @throws UnsupportedCharsetException if there is no JVM support
308     * for decoding the charset
309     * @throws IllegalCharsetNameException if the charset name specified
310     * in the mime type is illegal
311     */
312    public Reader getReader() {
313        final BodyDescriptor bodyDescriptor = getBodyDescriptor();
314        final String mimeCharset = bodyDescriptor.getCharset();
315        final Charset charset;
316        if (mimeCharset == null || "".equals(mimeCharset)) {
317            charset = CharsetUtil.US_ASCII;
318        } else {
319            charset = Charset.forName(mimeCharset);
320        }
321        final InputStream instream = getDecodedInputStream();
322        return new InputStreamReader(instream, charset);
323    }
324
325    /**
326     * <p>Gets a descriptor for the current entity.
327     * This method is valid if {@link #getState()} returns:</p>
328     * <ul>
329     * <li>{@link EntityState#T_BODY}</li>
330     * <li>{@link EntityState#T_START_MULTIPART}</li>
331     * <li>{@link EntityState#T_EPILOGUE}</li>
332     * <li>{@link EntityState#T_PREAMBLE}</li>
333     * </ul>
334     * @return <code>BodyDescriptor</code>, not nulls
335     */
336    public BodyDescriptor getBodyDescriptor() {
337        return currentStateMachine.getBodyDescriptor();
338    }
339
340    /**
341     * This method is valid, if {@link #getState()} returns {@link EntityState#T_FIELD}.
342     * @return String with the fields raw contents.
343     * @throws IllegalStateException {@link #getState()} returns another
344     *   value than {@link EntityState#T_FIELD}.
345     */
346    public Field getField() {
347        return currentStateMachine.getField();
348    }
349
350    /**
351     * This method advances the token stream to the next token.
352     * @throws IllegalStateException The method has been called, although
353     *   {@link #getState()} was already {@link EntityState#T_END_OF_STREAM}.
354     */
355    public EntityState next() throws IOException, MimeException {
356        if (state == EntityState.T_END_OF_STREAM  ||  currentStateMachine == null) {
357            throw new IllegalStateException("No more tokens are available.");
358        }
359        while (currentStateMachine != null) {
360            EntityStateMachine next = currentStateMachine.advance();
361            if (next != null) {
362                entities.add(next);
363                currentStateMachine = next;
364            }
365            state = currentStateMachine.getState();
366            if (state != EntityState.T_END_OF_STREAM) {
367                return state;
368            }
369            entities.removeLast();
370            if (entities.isEmpty()) {
371                currentStateMachine = null;
372            } else {
373                currentStateMachine = entities.getLast();
374                currentStateMachine.setRecursionMode(recursionMode);
375            }
376        }
377        state = EntityState.T_END_OF_STREAM;
378        return state;
379    }
380
381    /**
382     * Renders a state as a string suitable for logging.
383     * @param state
384     * @return rendered as string, not null
385     */
386    public static final String stateToString(EntityState state) {
387        return MimeEntity.stateToString(state);
388    }
389
390
391    public MimeConfig getConfig() {
392        return config;
393    }
394}