{-# OPTIONS_HADDOCK hide #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Language.Haskell.Exts.Lexer
-- Copyright   :  (c) The GHC Team, 1997-2000
--                (c) Niklas Broberg, 2004-2009
-- License     :  BSD-style (see the file LICENSE.txt)
--
-- Maintainer  :  Niklas Broberg, d00nibro@chalmers.se
-- Stability   :  stable
-- Portability :  portable
--
-- Lexer for Haskell, with some extensions.
--
-----------------------------------------------------------------------------

-- ToDo: Introduce different tokens for decimal, octal and hexadecimal (?)
-- ToDo: FloatTok should have three parts (integer part, fraction, exponent) (?)
-- ToDo: Use a lexical analyser generator (e.g. Alex)?

module Language.Haskell.Exts.Lexer (Token(..), showToken, lexer) where

import Language.Haskell.Exts.ParseMonad
import Language.Haskell.Exts.SrcLoc
import Language.Haskell.Exts.Comments
import Language.Haskell.Exts.Extension
import Language.Haskell.Exts.ExtScheme

import Data.Char
import Data.Ratio
import Data.List (intersperse)
import Control.Monad (when)

-- import Debug.Trace (trace)

data Token
        = VarId String
        | QVarId (String,String)
        | IDupVarId (String)        -- duplicable implicit parameter
        | ILinVarId (String)        -- linear implicit parameter
        | ConId String
        | QConId (String,String)
        | DVarId [String]       -- to enable varid's with '-' in them
        | VarSym String
        | ConSym String
        | QVarSym (String,String)
        | QConSym (String,String)
        | IntTok (Integer, String)
        | FloatTok (Rational, String)
        | Character (Char, String)
        | StringTok (String, String)
        | IntTokHash (Integer, String)        -- 1#
        | WordTokHash (Integer, String)       -- 1##
        | FloatTokHash (Rational, String)     -- 1.0#
        | DoubleTokHash (Rational, String)    -- 1.0##
        | CharacterHash (Char, String)        -- c#
        | StringHash (String, String)         -- "Hello world!"#

-- Symbols

        | LeftParen
        | RightParen
        | LeftHashParen
        | RightHashParen
        | LeftCurlyBar
        | RightCurlyBar
        | SemiColon
        | LeftCurly
        | RightCurly
        | VRightCurly           -- a virtual close brace
        | LeftSquare
        | RightSquare
        | Comma
        | Underscore
        | BackQuote

-- Reserved operators

        | Dot           -- reserved for use with 'forall x . x'
        | DotDot
        | Colon
        | DoubleColon
        | Equals
        | Backslash
        | Bar
        | LeftArrow
        | RightArrow
        | At
        | Tilde
        | DoubleArrow
        | Minus
        | Exclamation
        | Star
        | LeftArrowTail         -- -<
        | RightArrowTail        -- >-
        | LeftDblArrowTail      -- -<<
        | RightDblArrowTail     -- >>-

-- Template Haskell
        | THExpQuote            -- [| or [e|
        | THPatQuote            -- [p|
        | THDecQuote            -- [d|
        | THTypQuote            -- [t|
        | THCloseQuote          -- |]
        | THIdEscape (String)   -- $x
        | THParenEscape         -- $(
        | THVarQuote            -- 'x (but without the x)
        | THTyQuote             -- ''T (but without the T)
        | THQuasiQuote (String,String)  -- [$...|...]

-- HaRP
        | RPGuardOpen       -- (|
        | RPGuardClose      -- |)
        | RPCAt             -- @:

-- Hsx
        | XCodeTagOpen      -- <%
        | XCodeTagClose     -- %>
        | XStdTagOpen       -- <
        | XStdTagClose      -- >
        | XCloseTagOpen     -- </
        | XEmptyTagClose    -- />
        | XChildTagOpen     -- <%> (note that close doesn't exist, it's XCloseTagOpen followed by XCodeTagClose)
        | XPCDATA String
        | XRPatOpen             -- <[
        | XRPatClose            -- ]>

-- Pragmas

        | PragmaEnd                     -- #-}
        | RULES
        | INLINE Bool
        | INLINE_CONLIKE
        | SPECIALISE
        | SPECIALISE_INLINE Bool
        | SOURCE
        | DEPRECATED
        | WARNING
        | SCC
        | GENERATED
        | CORE
        | UNPACK
        | OPTIONS (Maybe String,String)
--        | CFILES  String
--        | INCLUDE String
        | LANGUAGE
        | ANN

-- Reserved Ids

        | KW_As
        | KW_By         -- transform list comprehensions
        | KW_Case
        | KW_Class
        | KW_Data
        | KW_Default
        | KW_Deriving
        | KW_Do
        | KW_MDo
        | KW_Else
        | KW_Family     -- indexed type families
        | KW_Forall     -- universal/existential types
        | KW_Group      -- transform list comprehensions
        | KW_Hiding
        | KW_If
        | KW_Import
        | KW_In
        | KW_Infix
        | KW_InfixL
        | KW_InfixR
        | KW_Instance
        | KW_Let
        | KW_Module
        | KW_NewType
        | KW_Of
        | KW_Proc       -- arrows
        | KW_Rec        -- arrows
        | KW_Then
        | KW_Type
        | KW_Using      -- transform list comprehensions
        | KW_Where
        | KW_Qualified

                -- FFI
        | KW_Foreign
        | KW_Export
        | KW_Safe
        | KW_Unsafe
        | KW_Threadsafe
        | KW_StdCall
        | KW_CCall

        | EOF
        deriving (Eq,Show)

reserved_ops :: [(String,(Token, Maybe ExtScheme))]
reserved_ops = [
 ( "..", (DotDot,       Nothing) ),
 ( ":",  (Colon,        Nothing) ),
 ( "::", (DoubleColon,  Nothing) ),
 ( "=",  (Equals,       Nothing) ),
 ( "\\", (Backslash,    Nothing) ),
 ( "|",  (Bar,          Nothing) ),
 ( "<-", (LeftArrow,    Nothing) ),
 ( "->", (RightArrow,   Nothing) ),
 ( "@",  (At,           Nothing) ),
 ( "@:", (RPCAt,        Just (Any [RegularPatterns])) ),
 ( "~",  (Tilde,        Nothing) ),
 ( "=>", (DoubleArrow,  Nothing) ),
 ( "*",  (Star,         Just (Any [KindSignatures])) ),
 -- Arrows notation
 ( "-<",  (LeftArrowTail,       Just (Any [Arrows])) ),
 ( ">-",  (RightArrowTail,      Just (Any [Arrows])) ),
 ( "-<<", (LeftDblArrowTail,    Just (Any [Arrows])) ),
 ( ">>-", (RightDblArrowTail,   Just (Any [Arrows])) ),
 -- Unicode notation
 ( "\x2190",    (LeftArrow,     Just (Any  [UnicodeSyntax])) ),
 ( "\x2192",    (RightArrow,    Just (Any  [UnicodeSyntax])) ),
 ( "\x21d2",    (DoubleArrow,   Just (Any  [UnicodeSyntax])) ),
 ( "\x2237",    (DoubleColon,   Just (Any  [UnicodeSyntax])) ),
 ( "\x2919",    (LeftArrowTail,     Just (All [UnicodeSyntax, Arrows])) ),
 ( "\x291a",    (RightArrowTail,    Just (All [UnicodeSyntax, Arrows])) ),
 ( "\x291b",    (LeftDblArrowTail,  Just (All [UnicodeSyntax, Arrows])) ),
 ( "\x291c",    (RightDblArrowTail, Just (All [UnicodeSyntax, Arrows])) ),
 ( "\x2605",    (Star,              Just (All [UnicodeSyntax, KindSignatures])) )
 ]

special_varops :: [(String,(Token, Maybe ExtScheme))]
special_varops = [
 -- the dot is only a special symbol together with forall, but can still be used as function composition
 ( ".",  (Dot,          Just (Any [ExplicitForall, ExistentialQuantification])) ),
 ( "-",  (Minus,        Nothing) ),
 ( "!",  (Exclamation,  Nothing) )
 ]
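
-- Note (illustrative): whether an entry above actually yields its reserved
-- token is decided in lexStdToken by checking the attached ExtScheme against
-- the enabled extensions. With ExplicitForall on, "." lexes as Dot; without
-- it the same character falls back to VarSym ".", so ordinary function
-- composition is unaffected.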

reserved_ids :: [(String,(Token, Maybe ExtScheme))]
reserved_ids = [
 ( "_",         (Underscore,    Nothing) ),
 ( "by",        (KW_By,         Just (Any [TransformListComp])) ),
 ( "case",      (KW_Case,       Nothing) ),
 ( "class",     (KW_Class,      Nothing) ),
 ( "data",      (KW_Data,       Nothing) ),
 ( "default",   (KW_Default,    Nothing) ),
 ( "deriving",  (KW_Deriving,   Nothing) ),
 ( "do",        (KW_Do,         Nothing) ),
 ( "else",      (KW_Else,       Nothing) ),
 ( "family",    (KW_Family,     Just (Any [TypeFamilies])) ),        -- indexed type families
 ( "forall",    (KW_Forall,     Just (Any [ExplicitForall, ExistentialQuantification])) ),    -- universal/existential quantification
 ( "group",     (KW_Group,      Just (Any [TransformListComp])) ),
 ( "if",        (KW_If,         Nothing) ),
 ( "import",    (KW_Import,     Nothing) ),
 ( "in",        (KW_In,         Nothing) ),
 ( "infix",     (KW_Infix,      Nothing) ),
 ( "infixl",    (KW_InfixL,     Nothing) ),
 ( "infixr",    (KW_InfixR,     Nothing) ),
 ( "instance",  (KW_Instance,   Nothing) ),
 ( "let",       (KW_Let,        Nothing) ),
 ( "mdo",       (KW_MDo,        Just (Any [RecursiveDo])) ),
 ( "module",    (KW_Module,     Nothing) ),
 ( "newtype",   (KW_NewType,    Nothing) ),
 ( "of",        (KW_Of,         Nothing) ),
 ( "proc",      (KW_Proc,       Just (Any [Arrows])) ),
 ( "rec",       (KW_Rec,        Just (Any [Arrows])) ),
 ( "then",      (KW_Then,       Nothing) ),
 ( "type",      (KW_Type,       Nothing) ),
 ( "using",     (KW_Using,      Just (Any [TransformListComp])) ),
 ( "where",     (KW_Where,      Nothing) ),

-- FFI
 ( "foreign",   (KW_Foreign,    Just (Any [ForeignFunctionInterface])) ),

-- Unicode
 ( "\x2200",    (KW_Forall,     Just (All [UnicodeSyntax, ExplicitForall])) )
 ]


special_varids :: [(String,(Token, Maybe ExtScheme))]
special_varids = [
 ( "as",        (KW_As,         Nothing) ),
 ( "qualified", (KW_Qualified,  Nothing) ),
 ( "hiding",    (KW_Hiding,     Nothing) ),

-- FFI
 ( "export",     (KW_Export,        Just (Any [ForeignFunctionInterface])) ),
 ( "safe",       (KW_Safe,          Just (Any [ForeignFunctionInterface])) ),
 ( "unsafe",     (KW_Unsafe,        Just (Any [ForeignFunctionInterface])) ),
 ( "threadsafe", (KW_Threadsafe,    Just (Any [ForeignFunctionInterface])) ),
 ( "stdcall",    (KW_StdCall,       Just (Any [ForeignFunctionInterface])) ),
 ( "ccall",      (KW_CCall,         Just (Any [ForeignFunctionInterface])) )
 ]

pragmas :: [(String,Token)]
pragmas = [
 ( "rules",             RULES           ),
 ( "inline",            INLINE True     ),
 ( "noinline",          INLINE False    ),
 ( "notinline",         INLINE False    ),
 ( "specialise",        SPECIALISE      ),
 ( "specialize",        SPECIALISE      ),
 ( "source",            SOURCE          ),
 ( "deprecated",        DEPRECATED      ),
 ( "warning",           WARNING         ),
 ( "ann",               ANN             ),
 ( "scc",               SCC             ),
 ( "generated",         GENERATED       ),
 ( "core",              CORE            ),
 ( "unpack",            UNPACK          ),
 ( "language",          LANGUAGE        ),
 ( "options",           OPTIONS undefined ) -- we'll tweak it before use - promise!
-- ( "cfiles",            CFILES  undefined ), -- same here...
-- ( "include",           INCLUDE undefined )  -- ...and here!
 ]

isIdent, isHSymbol :: Char -> Bool
isIdent   c = isAlpha c || isDigit c || c == '\'' || c == '_'

isHSymbol c = c `elem` ":!#%&*./?@\\-" || ((isSymbol c || isPunctuation c) && not (c `elem` "(),;[]`{}_\"'"))

matchChar :: Char -> String -> Lex a ()
matchChar c msg = do
    s <- getInput
    if null s || head s /= c then fail msg else discard 1

-- The top-level lexer.
-- We need to know whether we are at the beginning of the line to decide
-- whether to insert layout tokens.
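--
-- A rough usage sketch (the continuation-passing shape suggests a Happy-style
-- threaded lexer; the exact parser hookup is an assumption):
--
--   lexer (\(Loc sp tok) -> ... continue parsing with tok at sp ...)
--
-- Each call hands exactly one located token to its continuation.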

lexer :: (Loc Token -> P a) -> P a
lexer = runL topLexer

topLexer :: Lex a (Loc Token)
topLexer = do
    b <- pullCtxtFlag
    if b then -- trace (show cf ++ ": " ++ show VRightCurly) $
              setBOL >> getSrcLocL >>= \l -> return (Loc (mkSrcSpan l l) VRightCurly) -- the lex context state flags that we must do an empty {} - UGLY
     else do
        bol <- checkBOL
        (bol, ws) <- lexWhiteSpace bol
        -- take care of whitespace in PCDATA
        ec <- getExtContext
        case ec of
         -- if there was no linebreak, and we are lexing PCDATA,
         -- then we want to care about the whitespace.
         -- We don't bother to test for XmlSyntax, since we
         -- couldn't end up in ChildCtxt otherwise.
         Just ChildCtxt | not bol && ws -> getSrcLocL >>= \l -> return $ Loc (mkSrcSpan l l) $ XPCDATA " "
         _ -> do startToken
                 sl <- getSrcLocL
                 t <- if bol then lexBOL    -- >>= \t -> trace ("BOL: " ++ show t) (return t)
                             else lexToken  -- >>= \t -> trace (show t) (return t)
                 el <- getSrcLocL
                 return $ Loc (mkSrcSpan sl el) t

lexWhiteSpace :: Bool -> Lex a (Bool, Bool)
lexWhiteSpace bol = do
    s <- getInput
    ignL <- ignoreLinePragmasL
    case s of
        -- If we find a recognised pragma, we don't want to treat it as a comment.
        '{':'-':'#':rest | isRecognisedPragma rest -> return (bol, False)
                         | isLinePragma rest && not ignL -> do 
                            (l, fn) <- lexLinePragma
                            setSrcLineL l
                            setLineFilenameL fn
                            lexWhiteSpace True
        '{':'-':_ -> do
            loc <- getSrcLocL
            discard 2
            (bol, c) <- lexNestedComment bol ""
            loc2 <- getSrcLocL
            pushComment $ Comment True (mkSrcSpan loc loc2) (reverse c)
            (bol, _) <- lexWhiteSpace bol
            return (bol, True)
        '-':'-':s | all (== '-') (takeWhile isHSymbol s) -> do
            loc    <- getSrcLocL
            discard 2
            dashes <- lexWhile (== '-')
            rest   <- lexWhile (/= '\n')
            s' <- getInput
            loc2 <- getSrcLocL
            let com = Comment False (mkSrcSpan loc loc2) $ dashes ++ rest
            case s' of
                [] -> pushComment com >> return (False, True)
                _ -> do
                    pushComment com
                    lexNewline
                    lexWhiteSpace True
                    return (True, True)
        '\n':_ -> do
            lexNewline
            lexWhiteSpace True
            return (True, True)
        '\t':_ -> do
            lexTab
            (bol, _) <- lexWhiteSpace bol
            return (bol, True)
        c:_ | isSpace c -> do
            discard 1
            (bol, _) <- lexWhiteSpace bol
            return (bol, True)
        _ -> return (bol, False)

isRecognisedPragma, isLinePragma :: String -> Bool
isRecognisedPragma str = let pragma = map toLower . takeWhile isAlphaNum . dropWhile isSpace $ str
                          in case lookup pragma pragmas of
                              Nothing -> False
                              _       -> True

isLinePragma str = let pragma = map toLower . takeWhile isAlphaNum . dropWhile isSpace $ str
                    in case pragma of
                        "line"  -> True
                        _       -> False

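-- For example, a pragma such as  {-# LINE 42 "Foo.hs" #-}  makes this return
-- (42, "Foo.hs"), which lexWhiteSpace uses (via setSrcLineL/setLineFilenameL)
-- to reset the reported position.  (The file name here is just illustrative.)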
lexLinePragma :: Lex a (Int, String)
lexLinePragma = do
    discard 3   -- {-#
    lexWhile isSpace
    discard 4   -- LINE
    lexWhile isSpace
    i <- lexWhile isDigit
    lexWhile isSpace
    matchChar '"' "Improperly formatted LINE pragma"
    fn <- lexWhile (/= '"')
    matchChar '"' "Impossible - lexLinePragma"
    lexWhile isSpace
    mapM_ (flip matchChar "Improperly formatted LINE pragma") "#-}"
    lexNewline
    return (read i, fn)

lexNestedComment :: Bool -> String -> Lex a (Bool, String)
lexNestedComment bol str = do
    s <- getInput
    case s of
        '-':'}':_ -> discard 2 >> return (bol, str)
        '{':'-':_ -> do
            discard 2
            (bol, c) <- lexNestedComment bol ("-{" ++ str) -- rest of the subcomment
            lexNestedComment bol ("}-" ++ c  ) -- rest of this comment
        '\t':_    -> lexTab >> lexNestedComment bol ('\t':str)
        '\n':_    -> lexNewline >> lexNestedComment True ('\n':str)
        c:_       -> discard 1 >> lexNestedComment bol (c:str)
        []        -> fail "Unterminated nested comment"

-- When we are lexing the first token of a line, check whether we need to
-- insert virtual semicolons or close braces due to layout.
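--
-- As a concrete (sketched) reading of the cases below, relative to the column
-- of the enclosing layout context:
--   a token indented less       (LT) produces a virtual close brace,
--   a token at the same column  (EQ) produces a virtual semicolon,
--   a token indented more       (GT) is lexed as an ordinary token.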

lexBOL :: Lex a Token
lexBOL = do
    pos <- getOffside
    -- trace ("Off: " ++ (show pos)) $ do
    case pos of
        LT -> do
                -- trace "layout: inserting '}'\n" $
            -- Set col to 0, indicating that we're still at the
            -- beginning of the line, in case we need a semi-colon too.
            -- Also pop the context here, so that we don't insert
            -- another close brace before the parser can pop it.
            setBOL
            popContextL "lexBOL"
            return VRightCurly
        EQ ->
            -- trace "layout: inserting ';'\n" $
            return SemiColon
        GT -> lexToken

lexToken :: Lex a Token
lexToken = do
    ec <- getExtContext
    -- we don't bother to check XmlSyntax since we couldn't
    -- have ended up in a non-Nothing context if it wasn't
    -- enabled.
    case ec of
     Just HarpCtxt     -> lexHarpToken
     Just TagCtxt      -> lexTagCtxt
     Just CloseTagCtxt -> lexCloseTagCtxt
     Just ChildCtxt    -> lexChildCtxt
     Just CodeTagCtxt  -> lexCodeTagCtxt
     _         -> lexStdToken


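-- An approximate illustration (XmlSyntax enabled): the fragment <a>hi</a>
-- lexes roughly as
--
--   XStdTagOpen, VarId "a", XStdTagClose, XPCDATA "hi",
--   XCloseTagOpen, VarId "a", XStdTagClose
--
-- with the extension-context stack selecting which of the lexers below
-- handles each piece.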
lexChildCtxt :: Lex a Token
lexChildCtxt = do
    -- if we ever end up here, then XmlSyntax must be on.
    s <- getInput
    case s of
        '<':'%':'>':_ -> do discard 3
                            pushExtContextL ChildCtxt
                            return XChildTagOpen
        '<':'%':_ -> do discard 2
                        pushExtContextL CodeTagCtxt
                        return XCodeTagOpen
        '<':'/':_ -> do discard 2
                        popExtContextL "lexChildCtxt"
                        pushExtContextL CloseTagCtxt
                        return XCloseTagOpen
        '<':'[':_ -> do discard 2
                        pushExtContextL HarpCtxt
                        return XRPatOpen
        '<':_     -> do discard 1
                        pushExtContextL TagCtxt
                        return XStdTagOpen
        _     -> lexPCDATA


lexPCDATA :: Lex a Token
lexPCDATA = do
    -- if we ever end up here, then XmlSyntax must be on.
    s <- getInput
    case s of
        [] -> return EOF
        _  -> case s of
            '\n':_ -> do
                x <- lexNewline >> lexPCDATA
                case x of
                 XPCDATA p -> return $ XPCDATA $ '\n':p
                 EOF -> return EOF
            '<':_ -> return $ XPCDATA ""
            _ -> do let pcd = takeWhile (\c -> not $ elem c "<\n") s
                        l = length pcd
                    discard l
                    x <- lexPCDATA
                    case x of
                     XPCDATA pcd' -> return $ XPCDATA $ pcd ++ pcd'
                     EOF -> return EOF


lexCodeTagCtxt :: Lex a Token
lexCodeTagCtxt = do
    -- if we ever end up here, then XmlSyntax must be on.
    s <- getInput
    case s of
        '%':'>':_ -> do discard 2
                        popExtContextL "lexCodeTagContext"
                        return XCodeTagClose
        _     -> lexStdToken

lexCloseTagCtxt :: Lex a Token
lexCloseTagCtxt = do
    -- if we ever end up here, then XmlSyntax must be on.
    s <- getInput
    case s of
        '%':'>':_ -> do discard 2
                        popExtContextL "lexCloseTagCtxt"
                        return XCodeTagClose
        '>':_     -> do discard 1
                        popExtContextL "lexCloseTagCtxt"
                        return XStdTagClose
        _     -> lexStdToken

lexTagCtxt :: Lex a Token
lexTagCtxt = do
    -- if we ever end up here, then XmlSyntax must be on.
    s <- getInput
    case s of
        '/':'>':_ -> do discard 2
                        popExtContextL "lexTagCtxt: Empty tag"
                        return XEmptyTagClose
        '>':_     -> do discard 1
                        popExtContextL "lexTagCtxt: Standard tag"
                        pushExtContextL ChildCtxt
                        return XStdTagClose
        _     -> lexStdToken

lexHarpToken :: Lex a Token
lexHarpToken = do
    -- if we ever end up here, then RegularPatterns must be on.
    s <- getInput
    case s of
        ']':'>':_ -> do discard 2
                        popExtContextL "lexHarpToken"
                        return XRPatClose
        _     -> lexStdToken

lexStdToken :: Lex a Token
lexStdToken = do
    s <- getInput
    exts <- getExtensionsL
    case s of
        [] -> return EOF

        '0':c:d:_ | toLower c == 'o' && isOctDigit d -> do
                        discard 2
                        (n, str) <- lexOctal
                        return (IntTok (n, '0':c:str))
                  | toLower c == 'x' && isHexDigit d -> do
                        discard 2
                        (n, str) <- lexHexadecimal
                        return (IntTok (n, '0':c:str))

        -- implicit parameters
        '?':c:_ | isLower c && ImplicitParams `elem` exts -> do
                        discard 1
                        id <- lexWhile isIdent
                        return $ IDupVarId id

        '%':c:_ | isLower c && ImplicitParams `elem` exts -> do
                        discard 1
                        id <- lexWhile isIdent
                        return $ ILinVarId id
        -- end implicit parameters

        -- harp
        '(':'|':c:_  | isHSymbol c -> discard 1 >> return LeftParen
        '(':'|':_ | RegularPatterns `elem` exts ->
                     do discard 2
                        return RPGuardOpen
        '|':')':_ | RegularPatterns `elem` exts ->
                     do discard 2
                        return RPGuardClose
        {- This is handled by the reserved_ops above.
        '@':':':_ | RegularPatterns `elem` exts ->
                     do discard 2
                        return RPCAt -}

        -- template haskell
        '[':'|':_ | TemplateHaskell `elem` exts -> do
                discard 2
                return $ THExpQuote

        '[':c:'|':_ | c == 'e' && TemplateHaskell `elem` exts -> do
                        discard 3
                        return $ THExpQuote
                    | c == 'p' && TemplateHaskell `elem` exts -> do
                        discard 3
                        return THPatQuote
                    | c == 'd' && TemplateHaskell `elem` exts -> do
                        discard 3
                        return THDecQuote
                    | c == 't' && TemplateHaskell `elem` exts -> do
                        discard 3
                        return THTypQuote
        '[':'$':c:_ | isLower c && QuasiQuotes `elem` exts ->
                        discard 2 >> lexQuasiQuote

        '|':']':_ | TemplateHaskell `elem` exts -> do
                        discard 2
                        return THCloseQuote

        '$':c:_ | isLower c && TemplateHaskell `elem` exts -> do
                        discard 1
                        id <- lexWhile isIdent
                        return $ THIdEscape id
                | c == '(' && TemplateHaskell `elem` exts -> do
                        discard 2
                        return THParenEscape
        -- end template haskell

        -- hsx
        '<':'%':c:_ | XmlSyntax `elem` exts -> do 
                        case c of
                         '>' -> do discard 3
                                   pushExtContextL ChildCtxt
                                   return XChildTagOpen
                         _   -> do discard 2
                                   pushExtContextL CodeTagCtxt
                                   return XCodeTagOpen
        '<':c:_ | isAlpha c && XmlSyntax `elem` exts -> do
                        discard 1
                        pushExtContextL TagCtxt
                        return XStdTagOpen
        -- end hsx

        '(':'#':_ | UnboxedTuples `elem` exts -> do discard 2 >> return LeftHashParen

        '#':')':_ | UnboxedTuples `elem` exts -> do discard 2 >> return RightHashParen

        '{':'|':_ | Generics `elem` exts -> do discard 2 >> return LeftCurlyBar

        '|':'}':_ | Generics `elem` exts -> do discard 2 >> return RightCurlyBar

        -- pragmas

        '{':'-':'#':_ -> do discard 3 >> lexPragmaStart

        '#':'-':'}':_ -> do discard 3 >> return PragmaEnd

        c:_ | isDigit c -> lexDecimalOrFloat

            | isUpper c -> lexConIdOrQual ""

            | isLower c || c == '_' -> do
                    idents <- lexIdents
                    case idents of
                     [ident] -> case lookup ident (reserved_ids ++ special_varids) of
                                 Just (keyword, scheme) -> do
                                    -- check if an extension keyword is enabled
                                    if isEnabled scheme exts
                                     then flagKW keyword >> return keyword
                                     else return $ VarId ident
                                 Nothing -> return $ VarId ident
                     _ -> return $ DVarId idents

            | isHSymbol c -> do
                    sym <- lexWhile isHSymbol
                    return $ case lookup sym (reserved_ops ++ special_varops) of
                              Just (t , scheme) ->
                                -- check if an extension op is enabled
                                if isEnabled scheme exts
                                 then t
                                 else case c of
                                        ':' -> ConSym sym
                                        _   -> VarSym sym
                              Nothing -> case c of
                                          ':' -> ConSym sym
                                          _   -> VarSym sym

            | otherwise -> do
                    discard 1
                    case c of

                        -- First the special symbols
                        '(' ->  return LeftParen
                        ')' ->  return RightParen
                        ',' ->  return Comma
                        ';' ->  return SemiColon
                        '[' ->  return LeftSquare
                        ']' ->  return RightSquare
                        '`' ->  return BackQuote
                        '{' -> do
                            pushContextL NoLayout
                            return LeftCurly
                        '}' -> do
                            popContextL "lexStdToken"
                            return RightCurly

                        '\'' -> lexCharacter
                        '"' ->  lexString

                        _ ->    fail ("Illegal character \'" ++ show c ++ "\'\n")

      where lexIdents :: Lex a [String]
            lexIdents = do
                ident <- lexWhile isIdent
                s <- getInput
                exts <- getExtensionsL
                case s of
                 -- This is the only way we can get more than one ident in the list
                 -- and it requires XmlSyntax to be on.
                 '-':c:_ | XmlSyntax `elem` exts && isAlpha c -> do
                        discard 1
                        idents <- lexIdents
                        return $ ident : idents
                 '#':_ | MagicHash `elem` exts -> do
                        discard 1
                        return [ident ++ "#"]
                 _ -> return [ident]

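            -- Expected shape (QuasiQuotes enabled), for illustration with a
            -- hypothetical quoter name:
            --   [$sql| select 1 |]  lexes to  THQuasiQuote ("sql", " select 1 ")
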
            lexQuasiQuote :: Lex a Token
            lexQuasiQuote = do
                -- We've seen and dropped [$ already
                ident <- lexWhile isIdent
                matchChar '|' "Malformed quasi-quote quoter"
                body <- lexQQBody
                return $ THQuasiQuote (ident, body)

            lexQQBody :: Lex a String
            lexQQBody = do
                s <- getInput
                case s of
                  '\\':']':_ -> do discard 2
                                   str <- lexQQBody
                                   return (']':str)
                  '\\':'|':_ -> do discard 2
                                   str <- lexQQBody
                                   return ('|':str)
                  '|':']':_  -> discard 2 >> return ""
                  '|':_ -> do discard 1
                              str <- lexQQBody
                              return ('|':str)
                  ']':_ -> do discard 1
                              str <- lexQQBody
                              return (']':str)
                  '\\':_ -> do discard 1
                               str <- lexQQBody
                               return ('\\':str)
                  '\n':_ -> do lexNewline
                               str <- lexQQBody
                               return ('\n':str)
                  []     -> fail "Unexpected end of input while lexing quasi-quoter"
                  _ -> do str <- lexWhile (not . (`elem` "\\|\n"))
                          rest <- lexQQBody
                          return (str++rest)

lexPragmaStart :: Lex a Token
lexPragmaStart = do
    lexWhile isSpace
    pr <- lexWhile isAlphaNum
    case lookup (map toLower pr) pragmas of
     Just (INLINE True) -> do
            s <- getInput
            case map toLower s of
             ' ':'c':'o':'n':'l':'i':'k':'e':_  -> do
                      discard 8
                      return $ INLINE_CONLIKE
             _ -> return $ INLINE True
     Just SPECIALISE -> do
            s <- getInput
            case dropWhile isSpace $ map toLower s of
             'i':'n':'l':'i':'n':'e':_ -> do
                      lexWhile isSpace
                      discard 6
                      return $ SPECIALISE_INLINE True
             'n':'o':'i':'n':'l':'i':'n':'e':_ -> do
                        lexWhile isSpace
                        discard 8
                        return $ SPECIALISE_INLINE False
             'n':'o':'t':'i':'n':'l':'i':'n':'e':_ -> do
                        lexWhile isSpace
                        discard 9
                        return $ SPECIALISE_INLINE False
             _ -> return SPECIALISE

     Just (OPTIONS _) -> do     -- see, I promised we'd mask out the 'undefined'
            s <- getInput
            case s of
             '_':_  -> do
                discard 1
                com <- lexWhile isIdent
                rest <- lexRawPragma
                return $ OPTIONS (Just com, rest)
             x:_ | isSpace x -> do
                rest <- lexRawPragma
                return $ OPTIONS (Nothing, rest)
             _ -> fail "Malformed Options pragma"
{-     Just (CFILES _) -> do
            rest <- lexRawPragma
            return $ CFILES rest
     Just (INCLUDE _) -> do
            rest <- lexRawPragma
            return $ INCLUDE rest -}
     Just p ->  return p

     _      -> fail "Internal error: Unrecognised recognised pragma"
                  -- do rawStr <- lexRawPragma
                  -- return $ PragmaUnknown (pr, rawStr) -- no support for unrecognized pragmas, treat as comment
                  -- discard 3 -- #-}
                  -- topLexer -- we just discard it as a comment for now and restart -}

lexRawPragma :: Lex a String
lexRawPragma = do
    rpr <- lexRawPragmaAux
    return $ dropWhile isSpace rpr
 where lexRawPragmaAux = do
        rpr <- lexWhile (/='#')
        s <- getInput
        case s of
         '#':'-':'}':_  -> return rpr
         "" -> fail "End-of-file inside pragma"
         _ -> do
            discard 1
            rpr' <- lexRawPragma
            return $ rpr ++ '#':rpr'

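-- Illustrative results (the raw text is kept next to the parsed value):
--   "123"    -> IntTok      (123, "123")
--   "1.5e2"  -> FloatTok    (150 % 1, "1.5e2")
--   "123#"   -> IntTokHash  (123, "123")      (MagicHash only)
--   "123##"  -> WordTokHash (123, "123")      (MagicHash only)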
lexDecimalOrFloat :: Lex a Token
lexDecimalOrFloat = do
    ds <- lexWhile isDigit
    rest <- getInput
    exts <- getExtensionsL
    case rest of
        ('.':d:_) | isDigit d -> do
                discard 1
                frac <- lexWhile isDigit
                let num = parseInteger 10 (ds ++ frac)
                    decimals = toInteger (length frac)
                (exponent, estr) <- do
                    rest2 <- getInput
                    case rest2 of
                        'e':_ -> lexExponent
                        'E':_ -> lexExponent
                        _     -> return (0,"")
                con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
                return $ con ((num%1) * 10^^(exponent - decimals), ds ++ '.':frac ++ estr)
        e:_ | toLower e == 'e' -> do
                (exponent, estr) <- lexExponent
                con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash)
                return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds ++ estr)
        '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, ds))
        '#':_     | MagicHash `elem` exts -> discard 1 >> return (IntTokHash  (parseInteger 10 ds, ds))
        _         ->              return (IntTok      (parseInteger 10 ds, ds))

    where
    lexExponent :: Lex a (Integer, String)
    lexExponent = do
        (e:r) <- getInput
        discard 1   -- 'e' or 'E'
        case r of
         '+':d:_ | isDigit d -> do
            discard 1
            (n, str) <- lexDecimal
            return (n, e:'+':str)
         '-':d:_ | isDigit d -> do
            discard 1
            (n, str) <- lexDecimal
            return (negate n, e:'-':str)
         d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, e:str)
         _ -> fail "Float with missing exponent"

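-- lexHash chooses a token constructor from a trailing hash (MagicHash only):
-- e.g. lexHash FloatTok FloatTokHash (Right DoubleTokHash) yields
-- DoubleTokHash on "##", FloatTokHash on a single "#", and FloatTok
-- otherwise; the Left form is used where a double hash is never legal
-- (character literals).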
lexHash :: (b -> Token) -> (b -> Token) -> Either String (b -> Token) -> Lex a (b -> Token)
lexHash a b c = do
    exts <- getExtensionsL
    if MagicHash `elem` exts
     then do
        r <- getInput
        case r of
         '#':'#':_ -> case c of
                       Right c -> discard 2 >> return c
                       Left s  -> fail s
         '#':_     -> discard 1 >> return b
         _         ->              return a
     else return a

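-- Illustrative outcomes:
--   "Map"            -> ConId "Map"
--   "Data.Map"       -> QConId ("Data", "Map")
--   "Data.Map.empty" -> QVarId ("Data.Map", "empty")
--   "Prelude.+"      -> QVarSym ("Prelude", "+")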
lexConIdOrQual :: String -> Lex a Token
lexConIdOrQual qual = do
        con <- lexWhile isIdent
        let conid | null qual = ConId con
                  | otherwise = QConId (qual,con)
            qual' | null qual = con
                  | otherwise = qual ++ '.':con
        just_a_conid <- alternative (return conid)
        rest <- getInput
        exts <- getExtensionsL
        case rest of
          '.':c:_
             | isLower c || c == '_' -> do  -- qualified varid?
                    discard 1
                    ident <- lexWhile isIdent
                    s <- getInput
                    exts <- getExtensionsL
                    ident' <- case s of
                               '#':_ | MagicHash `elem` exts -> discard 1 >> return (ident ++ "#")
                               _ -> return ident
                    case lookup ident' reserved_ids of
                       -- cannot qualify a reserved word
                       Just (_,scheme) | isEnabled scheme exts  -> just_a_conid
                       _ -> return (QVarId (qual', ident'))

             | isUpper c -> do      -- qualified conid?
                    discard 1
                    lexConIdOrQual qual'

             | isHSymbol c -> do    -- qualified symbol?
                    discard 1
                    sym <- lexWhile isHSymbol
                    exts <- getExtensionsL
                    case lookup sym reserved_ops of
                        -- cannot qualify a reserved operator
                        Just (_,scheme) | isEnabled scheme exts -> just_a_conid
                        _        -> return $ case c of
                                              ':' -> QConSym (qual', sym)
                                              _   -> QVarSym (qual', sym)

          '#':c:_
            | not (isHSymbol c) && not (isIdent c) && MagicHash `elem` exts -> do
                discard 1
                case conid of
                 ConId con -> return $ ConId $ con ++ "#"
                 QConId (q,con) -> return $ QConId (q,con ++ "#")
          _ ->  return conid -- not a qualified thing

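-- Illustrative cases (the opening ' has already been consumed):
--   'a'  -> Character ('a', "a")
--   '\n' -> Character ('\n', "\\n")
--   ''   followed by a type name (TemplateHaskell) -> THTyQuote
--   'x   with no closing quote   (TemplateHaskell) -> THVarQuote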
lexCharacter :: Lex a Token
lexCharacter = do   -- We need to keep track of not only character constants but also TH 'x and ''T
        -- We've seen ' so far
        s <- getInput
        exts <- getExtensionsL
        case s of
         '\'':_ | TemplateHaskell `elem` exts -> discard 1 >> return THTyQuote
         '\\':_ -> do
                    (c,raw) <- lexEscape
                    matchQuote
                    con <- lexHash Character CharacterHash
                            (Left "Double hash not available for character literals")
                    return (con (c, '\\':raw))
         c:'\'':_ -> do
                    discard 2
                    con <- lexHash Character CharacterHash
                            (Left "Double hash not available for character literals")
                    return (con (c, [c]))
         _ | TemplateHaskell `elem` exts -> return THVarQuote
         _ -> fail "Improper character constant or misplaced \'"

    where matchQuote = matchChar '\'' "Improperly terminated character constant"


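-- The decoded value and the raw text are accumulated side by side, and string
-- gaps / the null escape are honoured. For example, written as source text:
--   "ab\&c"      -> StringTok ("abc",  "ab\\&c")
--   "ab\   \cd"  -> StringTok ("abcd", "ab\\   \\cd")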
lexString :: Lex a Token
lexString = loop ("","")
    where
    loop (s,raw) = do
        r <- getInput
        exts <- getExtensionsL
        case r of
            '\\':'&':_ -> do
                    discard 2
                    loop (s, '&':'\\':raw)
            '\\':c:_ | isSpace c -> do
                        discard 1
                        wcs <- lexWhiteChars
                        matchChar '\\' "Illegal character in string gap"
                        loop (s, '\\':reverse wcs ++ '\\':raw)
                     | otherwise -> do
                        (ce, str) <- lexEscape
                        loop (ce:s, reverse str ++ '\\':raw)
            '"':'#':_ | MagicHash `elem` exts -> do
                        discard 2
                        return (StringHash (reverse s, reverse raw))
            '"':_ -> do
                discard 1
                return (StringTok (reverse s, reverse raw))
            c:_ -> do
                discard 1
                loop (c:s, c:raw)
            [] ->   fail "Improperly terminated string"

    lexWhiteChars :: Lex a String
    lexWhiteChars = do
        s <- getInput
        case s of
            '\n':_ -> do
                    lexNewline
                    wcs <- lexWhiteChars
                    return $ '\n':wcs
            '\t':_ -> do
                    lexTab
                    wcs <- lexWhiteChars
                    return $ '\t':wcs
            c:_ | isSpace c -> do
                    discard 1
                    wcs <- lexWhiteChars
                    return $ c:wcs
            _ -> return ""

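-- Illustrative escapes (the leading backslash is discarded here):
--   "\^G"  -> ('\a', "^G")
--   "\x41" -> ('A',  "x41")
--   "\100" -> ('d',  "100")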
lexEscape :: Lex a (Char, String)
lexEscape = do
    discard 1
    r <- getInput
    case r of

-- Production charesc from section B.2 (Note: \& is handled by caller)

        'a':_           -> discard 1 >> return ('\a', "a")
        'b':_           -> discard 1 >> return ('\b', "b")
        'f':_           -> discard 1 >> return ('\f', "f")
        'n':_           -> discard 1 >> return ('\n', "n")
        'r':_           -> discard 1 >> return ('\r', "r")
        't':_           -> discard 1 >> return ('\t', "t")
        'v':_           -> discard 1 >> return ('\v', "v")
        '\\':_          -> discard 1 >> return ('\\', "\\")
        '"':_           -> discard 1 >> return ('\"', "\"")
        '\'':_          -> discard 1 >> return ('\'', "\'")

-- Production ascii from section B.2

        '^':c:_         -> discard 2 >> cntrl c
        'N':'U':'L':_   -> discard 3 >> return ('\NUL', "NUL")
        'S':'O':'H':_   -> discard 3 >> return ('\SOH', "SOH")
        'S':'T':'X':_   -> discard 3 >> return ('\STX', "STX")
        'E':'T':'X':_   -> discard 3 >> return ('\ETX', "ETX")
        'E':'O':'T':_   -> discard 3 >> return ('\EOT', "EOT")
        'E':'N':'Q':_   -> discard 3 >> return ('\ENQ', "ENQ")
        'A':'C':'K':_   -> discard 3 >> return ('\ACK', "ACK")
        'B':'E':'L':_   -> discard 3 >> return ('\BEL', "BEL")
        'B':'S':_       -> discard 2 >> return ('\BS',  "BS")
        'H':'T':_       -> discard 2 >> return ('\HT',  "HT")
        'L':'F':_       -> discard 2 >> return ('\LF',  "LF")
        'V':'T':_       -> discard 2 >> return ('\VT',  "VT")
        'F':'F':_       -> discard 2 >> return ('\FF',  "FF")
        'C':'R':_       -> discard 2 >> return ('\CR',  "CR")
        'S':'O':_       -> discard 2 >> return ('\SO',  "SO")
        'S':'I':_       -> discard 2 >> return ('\SI',  "SI")
        'D':'L':'E':_   -> discard 3 >> return ('\DLE', "DLE")
        'D':'C':'1':_   -> discard 3 >> return ('\DC1', "DC1")
        'D':'C':'2':_   -> discard 3 >> return ('\DC2', "DC2")
        'D':'C':'3':_   -> discard 3 >> return ('\DC3', "DC3")
        'D':'C':'4':_   -> discard 3 >> return ('\DC4', "DC4")
        'N':'A':'K':_   -> discard 3 >> return ('\NAK', "NAK")
        'S':'Y':'N':_   -> discard 3 >> return ('\SYN', "SYN")
        'E':'T':'B':_   -> discard 3 >> return ('\ETB', "ETB")
        'C':'A':'N':_   -> discard 3 >> return ('\CAN', "CAN")
        'E':'M':_       -> discard 2 >> return ('\EM',  "EM")
        'S':'U':'B':_   -> discard 3 >> return ('\SUB', "SUB")
        'E':'S':'C':_   -> discard 3 >> return ('\ESC', "ESC")
        'F':'S':_       -> discard 2 >> return ('\FS',  "FS")
        'G':'S':_       -> discard 2 >> return ('\GS',  "GS")
        'R':'S':_       -> discard 2 >> return ('\RS',  "RS")
        'U':'S':_       -> discard 2 >> return ('\US',  "US")
        'S':'P':_       -> discard 2 >> return ('\SP',  "SP")
        'D':'E':'L':_   -> discard 3 >> return ('\DEL', "DEL")

-- Escaped numbers

        'o':c:_ | isOctDigit c -> do
                    discard 1
                    (n, raw) <- lexOctal
                    n <- checkChar n
                    return (n, 'o':raw)
        'x':c:_ | isHexDigit c -> do
                    discard 1
                    (n, raw) <- lexHexadecimal
                    n <- checkChar n
                    return (n, 'x':raw)
        c:_ | isDigit c -> do
                    (n, raw) <- lexDecimal
                    n <- checkChar n
                    return (n, raw)

        _       -> fail "Illegal escape sequence"

    where
    checkChar n | n <= 0x10FFFF = return (chr (fromInteger n))
    checkChar _                 = fail "Character constant out of range"

-- Production cntrl from section B.2

    cntrl :: Char -> Lex a (Char, String)
    cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), '^':c:[])
    cntrl _                        = fail "Illegal control character"

-- assumes at least one octal digit
lexOctal :: Lex a (Integer, String)
lexOctal = do
    ds <- lexWhile isOctDigit
    return (parseInteger 8 ds, ds)

-- assumes at least one hexadecimal digit
lexHexadecimal :: Lex a (Integer, String)
lexHexadecimal = do
    ds <- lexWhile isHexDigit
    return (parseInteger 16 ds, ds)

-- assumes at least one decimal digit
lexDecimal :: Lex a (Integer, String)
lexDecimal = do
    ds <- lexWhile isDigit
    return (parseInteger 10 ds, ds)

-- Stolen from Hugs's Prelude
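-- A worked example of the fold: parseInteger 16 "ff"
--   = foldl1 (\n d -> n * 16 + d) [15, 15]  =  15 * 16 + 15  =  255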
parseInteger :: Integer -> String -> Integer
parseInteger radix ds =
    foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds)

flagKW :: Token -> Lex a ()
flagKW t = when (t `elem` [KW_Do, KW_MDo]) flagDo

------------------------------------------------------------------
-- "Pretty" printing for tokens

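-- Best-effort rendering back to source text, e.g.
--   showToken (QVarId ("Data.Map", "empty")) == "Data.Map.empty"
--   showToken (IntTok (255, "0xff"))         == "0xff"
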
showToken :: Token -> String
showToken t = case t of
  VarId s           -> s
  QVarId (q,s)      -> q ++ '.':s
  IDupVarId s       -> '?':s
  ILinVarId s       -> '%':s
  ConId s           -> s
  QConId (q,s)      -> q ++ '.':s
  DVarId ss         -> concat $ intersperse "-" ss
  VarSym s          -> s
  ConSym s          -> s
  QVarSym (q,s)     -> q ++ '.':s
  QConSym (q,s)     -> q ++ '.':s
  IntTok (_, s)         -> s
  FloatTok (_, s)       -> s
  Character (_, s)      -> '\'':s ++ "'"
  StringTok (_, s)      -> '"':s ++ "\""
  IntTokHash (_, s)     -> s ++ "#"
  WordTokHash (_, s)    -> s ++ "##"
  FloatTokHash (_, s)   -> s ++ "#"
  DoubleTokHash (_, s)  -> s ++ "##"
  CharacterHash (_, s)  -> '\'':s ++ "'#"
  StringHash (_, s)     -> '"':s ++ "\"#"
  LeftParen         -> "("
  RightParen        -> ")"
  LeftHashParen     -> "(#"
  RightHashParen    -> "#)"
  LeftCurlyBar      -> "{|"
  RightCurlyBar     -> "|}"
  SemiColon         -> ";"
  LeftCurly         -> "{"
  RightCurly        -> "}"
  VRightCurly       -> "virtual }"
  LeftSquare        -> "["
  RightSquare       -> "]"
  Comma             -> ","
  Underscore        -> "_"
  BackQuote         -> "`"
  Dot               -> "."
  DotDot            -> ".."
  Colon             -> ":"
  DoubleColon       -> "::"
  Equals            -> "="
  Backslash         -> "\\"
  Bar               -> "|"
  LeftArrow         -> "<-"
  RightArrow        -> "->"
  At                -> "@"
  Tilde             -> "~"
  DoubleArrow       -> "=>"
  Minus             -> "-"
  Exclamation       -> "!"
  Star              -> "*"
  LeftArrowTail     -> "-<"
  RightArrowTail    -> ">-"
  LeftDblArrowTail  -> "-<<"
  RightDblArrowTail -> ">>-"
  THExpQuote        -> "[|"
  THPatQuote        -> "[p|"
  THDecQuote        -> "[d|"
  THTypQuote        -> "[t|"
  THCloseQuote      -> "|]"
  THIdEscape s      -> '$':s
  THParenEscape     -> "$("
  THVarQuote        -> "'"
  THTyQuote         -> "''"
  THQuasiQuote (n,q) -> "[$" ++ n ++ "|" ++ q ++ "]"
  RPGuardOpen       -> "(|"
  RPGuardClose      -> "|)"
  RPCAt             -> "@:"
  XCodeTagOpen      -> "<%"
  XCodeTagClose     -> "%>"
  XStdTagOpen       -> "<"
  XStdTagClose      -> ">"
  XCloseTagOpen     -> "</"
  XEmptyTagClose    -> "/>"
  XPCDATA s         -> "PCDATA " ++ s
  XRPatOpen         -> "<["
  XRPatClose        -> "]>"
  PragmaEnd         -> "#-}"
  RULES             -> "{-# RULES"
  INLINE b          -> "{-# " ++ if b then "INLINE" else "NOINLINE"
  INLINE_CONLIKE    -> "{-# " ++ "INLINE CONLIKE"
  SPECIALISE        -> "{-# SPECIALISE"
  SPECIALISE_INLINE b -> "{-# SPECIALISE " ++ if b then "INLINE" else "NOINLINE"
  SOURCE            -> "{-# SOURCE"
  DEPRECATED        -> "{-# DEPRECATED"
  WARNING           -> "{-# WARNING"
  SCC               -> "{-# SCC"
  GENERATED         -> "{-# GENERATED"
  CORE              -> "{-# CORE"
  UNPACK            -> "{-# UNPACK"
  OPTIONS (mt,_)    -> "{-# OPTIONS" ++ maybe "" (':':) mt ++ " ..."
--  CFILES  s         -> "{-# CFILES ..."
--  INCLUDE s         -> "{-# INCLUDE ..."
  LANGUAGE          -> "{-# LANGUAGE"
  ANN               -> "{-# ANN"
  KW_As         -> "as"
  KW_By         -> "by"
  KW_Case       -> "case"
  KW_Class      -> "class"
  KW_Data       -> "data"
  KW_Default    -> "default"
  KW_Deriving   -> "deriving"
  KW_Do         -> "do"
  KW_MDo        -> "mdo"
  KW_Else       -> "else"
  KW_Family     -> "family"
  KW_Forall     -> "forall"
  KW_Group      -> "group"
  KW_Hiding     -> "hiding"
  KW_If         -> "if"
  KW_Import     -> "import"
  KW_In         -> "in"
  KW_Infix      -> "infix"
  KW_InfixL     -> "infixl"
  KW_InfixR     -> "infixr"
  KW_Instance   -> "instance"
  KW_Let        -> "let"
  KW_Module     -> "module"
  KW_NewType    -> "newtype"
  KW_Of         -> "of"
  KW_Proc       -> "proc"
  KW_Rec        -> "rec"
  KW_Then       -> "then"
  KW_Type       -> "type"
  KW_Using      -> "using"
  KW_Where      -> "where"
  KW_Qualified  -> "qualified"
  KW_Foreign    -> "foreign"
  KW_Export     -> "export"
  KW_Safe       -> "safe"
  KW_Unsafe     -> "unsafe"
  KW_Threadsafe -> "threadsafe"
  KW_StdCall    -> "stdcall"
  KW_CCall      -> "ccall"

  EOF           -> "EOF"