lparse.lua

-- Lua parser. v0.5

-- Copyright 2004 by Rici Lake. Permission is granted to use this code under
-- the same terms and conditions as found in the Lua copyright notice at
-- http://www.lua.org/license.html.
--
-- I'd appreciate a credit if you use this code, but I don't insist. 
--
-- This is prerelease software, no interfaces are guaranteed.
-- Use at your own risk, etc.

-- Documentation is at the end :)
  
-- TODO
--
--  The reuse of TagData for this or that or the other was a cutting of corners,
--  which as usual has come back to haunt; the handling of "missing" is a
--  perfect illustration. So the interfaces will need to change to use a
--  three-part tag: type, original, scope-or-name.
--
--  But what it really ought to do is become a coroutine-based generator.
--  The only issue here is proper handling of deferred tags (comments,
--  whitespace, skips); none of these have scope/var info, so only
--  two tables should be necessary; we'd also need to keep all info
--  for the "current tag" because we can't release it until we're done
--  with it (which is when another tag becomes current, or we're done.)
--
--  Doing a parse requires setting up a ton of tables, functions with upvalues,
--  etc. While that may not be a Bad Thing, in and of itself, it ought to be
--  possible to reuse a parser once it has all been created.
--
--  The globals table ought to have at least one level of subkeys, which
--  means the parser has to get a bit cleverer; probably all that term parsing
--  stuff should just be ripped out and replaced with a simple and complete
--  precedence parser; set/use could then be done on the expression parse tree.
--  
--  Try to remember what the hell I was thinking about when I put "see note" in
--  the tag documentation (referring to scopes) and then write the note.
--
--  If statements end up working around the interface with parseChunk and 
--  Scopes. parseChunk sets the TagData of the terminating symbol to
--  currentScope and then pops currentScope. But if statements use a fake
--  scope to keep the clauses linked, so this doesn't really work. (But it
--  is probably right everywhere else.)
--
--  Try to modularise a bit. Symtab/scope now has no external references.
--  Lexing depends only on newTag. Tagging is still deeply entrenched.
--  Perhaps this could be solved by adding newTag to the blex structure.

--  Anchor scopes the same way variables are anchored. 

--  Put the filename or some such indicator in the file scope object, so that
--  cross-file links can be constructed.

--  Try to handle SLSLPP colourising better. Right now, $ (USER) is not
--  recognised at all. The "right" way to do it would be to have two separate
--  parse streams running in parallel, although even then it will be hard to
--  get the unpreprocessed stuff right if people are doing anything even
--  moderately complicated. There is an interesting attempt to do php
--  "properly", making the HTML and the php indentations independent --
--  I saw a reference to it on comp.compilers. Maybe worth looking at.

--  Longer term: To integrate with real-time editing, we should be able to
--  tokenise incrementally, line per line. The trick would be to save the
--  lexer state for each line (there are only four states) and keep the lines
--  as vectors of tokens. Retokenising would only be needed for lines that
--  are changed (presumably, the editor would let us know) and tokenising could
--  stop at the first unchanged line where the states match.

--  Add some kind of LuaDOC comment parsing

--  Make it easier to construct parse trees, not just lint/tag

-------------------------------------------------------------------------
-- Utilities
-------------------------------------------------------------------------
local util = import "libutil"
local setFrom, setMulti, words 
      = util.setFrom, util.setMulti, util.words

local function ALWAYS(thing) return function() return thing end end
local alwaysTrue = ALWAYS(true)
local alwaysEmpty = ALWAYS ""
local function alwaysNil() end

-------------------------------------------------------------------------
-- Symbol table stuff
-------------------------------------------------------------------------
local function newVar(name, scope, anchor)
  local self = {name = name, scope = scope, def = anchor, global = scope.global}
  scope.vars[name] = self
  return self
end

local function newScope(outer, isFunction)
  local self = {
    fnDepth = outer.fnDepth + (isFunction and 1 or 0),
    depth = outer.depth + 1,
    vars = setmetatable({}, {__index = outer.vars}),
    parent = outer
  }
  if isFunction then newVar("return", self) end
  return self
end

local function newSemiScope(scope)
  return {
    fnDepth = scope.fnDepth,
    depth = scope.depth,
    vars = setmetatable({}, {__index = scope.vars}),
    parent = scope.parent
  }
end

local function lookupVar(name, scope, isSet, globalScope)
  local rv = scope.vars[name]
  if rv then
    if scope.fnDepth ~= rv.scope.fnDepth then rv.closed = not rv.global end
   else
    rv = newVar(name, globalScope)
    -- This is the only place we create global variables; the badGlobal key
    -- is here in order to help with linting. This will have to change to
    -- handle multiple file linting, where reference before define is OK.
    if not isSet or scope.fnDepth > 1 then rv.badGlobal = true end
  end
  scope.vars[name] = rv
  if isSet then rv.set = true else rv.used = true end
  return rv
end

local function newGlobalScope(globals)
  local vars = {}
  local globalScope = {
    fnDepth = 0,
    depth = 0,
    global = true,
    vars = vars
  }
  for k in globals do vars[k] = newVar(k, globalScope, "STDLIB") end
  return globalScope
end

-------------------------------------------------------------------------
-- Lua token sets                                                      --
-------------------------------------------------------------------------
local keyword = setFrom [[
  and      break    do       else     elseif   end
  false    for      function if       in       local
  nil      not      or       repeat   return   then
  true     until    while    

  +        -        *        /        ^        =
  ~=       <=       >=       <        >        ==
  (        )        {        }        [        ]
  ;        :        ,        .        ..       ...
]]

local synch = setFrom [[
  break    do       else     elseif   end      for
  function if       local    repeat   return   until
  while    EOF
]]

local expectUntil  = setFrom [[until]]
local expectElseif = setFrom [[elseif else end]]
local expectEnd    = setFrom [[end]]
local expectEOF    = setFrom [[EOF]]

return function(pkg)
  function pkg.parse(lex, globals)
    local ntag, Tag, Tagval = 0, {}, {} 
    local ttype, token, lasttag
    local luapats
  
  -------------------------------------------------------------------------------
  -- variable scoping
  -------------------------------------------------------------------------------
    local globalScope = newGlobalScope(globals)
    local currentScope = newScope(globalScope, true)
    newVar("return", currentScope)
    
  -------------------------------------------------------------------------------
  -- tagging strategy for colourising.
  -------------------------------------------------------------------------------
  -- lasttag is the last "meaningful" tag; it is not set in the case of comments
  -- or whitespace. It's not a very clean interface, really, but then it's not a
  -- very clean concept either. This really ought to be replaced by a "commit"
  -- type interface, so we can return tags when we know nothing will change.
  --
  -- newTag does not set lasttag, which allows spaces and comments to insert
  -- themselves. "skip" is also inserted with newTag.
  -- tagAfter inserts a tag after lasttag; this allows "missing" and "endskip"
  -- to get inserted in the supposedly correct place. Both of these update
  -- lasttag ("endskip" does it manually, thereby breaking encapsulation). 
  
    local function newTag(tt, val)
      ntag = ntag + 1
      Tag[ntag], Tagval[ntag] = tt, val or lex.token()
      return ntag
    end
    
    local function tagAfter(tt, val)
      local idx = lasttag + 1
      for i = ntag, idx, -1 do Tag[i + 1], Tagval[i + 1] = Tag[i], Tagval[i] end
      Tag[idx], Tagval[idx], ntag = tt, val, ntag + 1
      return idx
    end
    
    local function setTag(tt)
      Tag[lasttag] = tt
    end
  
    local function tag() return Tag[lasttag] end
  
    -- hack, this is not the right way to do things
    local function setTagData(val)
      if Tag[lasttag] ~= "missing" then
        Tagval[lasttag] = val
      end
    end
  
    local function tagData() return Tagval[lasttag] end
    
    -- FIXME Should invent really set lasttag to where the missing is?
    local function invent(toke)
      lasttag = tagAfter("missing", toke)
    end
    
    local function badchar(self)
      if self.match1"." then
        newTag "invalid"
        return self.lex(luapats)
      end
    end
    
    local function next()
      ttype, token = lex.lex(luapats, badchar)
      if not ttype then ttype = "EOF" end
    end
  
    -- accept must return lasttag (to avoid making lasttag visible in too
    -- many places) which means that match also returns lasttag (or nil)
    local function accept() lasttag = newTag(ttype, token); next(); return lasttag end
    local function peek(tt) return ttype == tt end
    local function match(tt) if ttype == tt then return accept() end end
    local function mustMatch(tt) if ttype == tt then return accept() else invent(tt) end end
    local function matchSet(ttab) local tt = ttab[ttype]; if tt then accept(); return tt() end end
  
    -- FIXME should this be var or varset? I think var is correct.
    local function matchNewVar(scope)
      if ttype == "id" then
       lasttag = newTag("var", token)
       setTagData(newVar(token, scope, lasttag))
       next()
       return true
      end
    end
    
    -- This returns lasttag for the benefit of dotColonTab
    local function mustMatchKey()
      if ttype == "id" then lasttag = newTag("key", token); next()
                       else invent "<key>"
      end
      return lasttag
    end
  
  -- for ... , local ... and function ( ... ) are all slightly different.
  -- but we use this one for all of them anyway. Returns #var, has_...
    local function matchVarList(scope)
      if not matchNewVar(scope) then return 0, match "..." end
      local rv = 1
      while match "," do
        rv = rv + 1
        if not matchNewVar(scope) then
          if match "..." then return rv - 1, lasttag
                         else invent "<var>"
          end
        end
      end
      return rv
    end
    
    -- We need to test the tag because the "key" might have been invented.
    local function defKey() if tag() == "key" then setTag "keydef" end end
    local function useVar(isDef)
      setTag(isDef and "varset" or "var")
      setTagData(lookupVar(tagData(), currentScope, isDef, globalScope))
    end
  
  ----------------------------------------------------------------------------------------------------
  -- Lexer
  ----------------------------------------------------------------------------------------------------
  
  -- TODO: The whole point of this block is to create luapats; the only upvalues here are the
  --       constant table keyword and the function newTag. <keyword> could actually be defined
  --       here, since it is the only place it is used. (It also refers to a few utility
  --       functions). So: how do we parameterise this with newTag and export it as a 
  --       separate lexing package?
    do
      local function numberExtend(self)
        self.match1("^E[-+]?%d+")
        return "number"
      end
      
      local escapepats = {
         ["^\\%d%d?%d?"]           = ALWAYS "escape",
         ["^\\."]                  = ALWAYS "escape",
         ["^[^'\"\\\n]*"]          = ALWAYS "string",
         ["^([\"'])"]              = function(_, c) return "string", c end
      }
  
  -- Changing self.tokens to self.extends is most of what is needed to consolidate
  -- strings, if that is what is wanted. (Plus copying the error stuff from longExtend) 
      local function stringExtender(q)
        return function(self)
          newTag "string"
          for f, r in self.tokens(escapepats, alwaysNil) do
            if r == q then return "string"
                      else newTag(f)
            end
          end
          newTag("missing", q)
          return "string"
        end
      end
      
      local function longExtend(self, ttype)
        local depth = 1
        while depth > 0 do 
          self.match1("^[^%[%]]*")
          if      self.match1("^%[%[") then depth = depth + 1
           elseif self.match1("^%]%]") then depth = depth - 1
           elseif not self.match1(".") then
            newTag(ttype)
            newTag("missing", string.rep("]]", depth))
            -- Zap the token. cursor is one past the end of the input.
            self.setToken(-1)
            break
          end
        end
        return ttype
      end
      
      local function checkSymbol(self, op)
        if keyword[op] then return op, op
                       else newTag("invalid", op); return self.lex(luapats, badchar)
        end
      end
      
      -- defined local above  
      luapats = {
         ["^([%a_][%w_]*)"]        =  function(_, word) return (keyword[word] and word or "id"), word end,
         ["^%s+"]                  =  function(self)
                                        newTag "whitespace"
                                        return self.lex(luapats, badchar)
                                      end,
         ["^%-%-"]                 =  function(self)
                                        if self.match1("^%[%[") then
                                          longExtend(self, "comment")
                                         else self.match1 "[^\n]*"
                                         --else repeat self.match1 "[^\n]*" until not self.match1 "^%s*%-%-"
                                        end
                                        newTag "comment"
                                        return self.lex(luapats, badchar)
                                      end,
         ["^#"]                    =  function(self)
                                        self.match1 "[^\n]*"
                                        newTag "preprocessor"
                                        return self.lex(luapats, badchar)
                                      end,
         ["^%[%["]                 =  function(self) return longExtend(self, "string") end,
         ["^%d+%.?%d*"]            =  numberExtend,
         ["^%.%d+"]                =  numberExtend,
         ["^([(){}%[%]+%-*/^;:,])"]=  checkSymbol,
         ["^([<>=~]=?)"]           =  checkSymbol,
         ["^(%.+)"]                =  checkSymbol,
         ["^'"]                    =  stringExtender "'",
         ['^"']                    =  stringExtender '"',
      }
    end
  
  ----------------------------------------------------------------------------------------------------
  -- Expression parsing
  ----------------------------------------------------------------------------------------------------  
  
    -- Expression parsing has the following contexts:
    -- 1. (expr)   unop / string / { tC / function fB / number / const / id / ( expr ) / other
    --               1      3        3        3            3       3      2       2      need <expr>
    -- 2. (postfix) string / { tC / ( exprlist ) / . tag /  [ expr ] / : tag / binop / other
    --                2        2          2           2         2        4       1     accept
    -- 3. (binop)   binop  / other
    --                1      accept
    -- 4. (call)    string / { tC / ( exprlist ) / other
    --                 2       2         2         need ()
    --
    -- In the statement case, binops are not permitted, and state 1 never reappears, while state 3 reduces
    -- to accept. So we define states 2* and 4* (PostfixOnly and CallOnly).
    
    local binop = setFrom [[+ - * / ^ ~= <= >= < > == .. and or]]
    local unop  = setFrom [[- not]]
    local const = setFrom [[number string false nil true]]
    
    local exprValTab, exprBinopTab, exprPostTab, exprCallTab, exprPostOnlyTab, exprCallOnlyTab =
          {}, {}, {}, {}, {}, {}
  
    local function parseExpr() return matchSet(exprValTab) end
    -- The following are defined to allow for tailcalling.
    local function mustParseExpr()
      local fn = exprValTab[ttype]
      if fn then accept(); return fn()
            else invent "<expr>"
      end
    end
   
    local function parseBinop()
      local fn = exprBinopTab[ttype]
      if fn then accept(); return fn()
            else return true
      end
    end
   
    local function parsePostfix()
      local fn = exprPostTab[ttype]
      if fn then accept(); return fn()
            else return true
      end
    end
   
    local function parseCall()
      local fn = exprCallTab[ttype]
      if fn then accept(); return fn()
            else invent "()"; return parsePostfix()
      end
    end
  
  -- In Lua, the simple term production only occurs on the lhs of assignments
  -- and in call statements.
    local function parsePostfixOnly(orthis)
      local fn = exprPostOnlyTab[ttype]
      if fn then if orthis == "var" then useVar() end; accept(); return fn()
            else return orthis
      end
    end
    
    local function parseCallOnly()
      local fn = exprCallOnlyTab[ttype]
      if fn then accept(); return fn()
            else invent "()"; return parsePostfixOnly "call"
      end
    end
  
    local function parseSimpleTermStartingParen()
      mustParseExpr()
      mustMatch ")"
      return parsePostfixOnly "expr"
    end
    
    local function parseSimpleTermStartingId()
      return parsePostfixOnly "var"
    end
  
    local function parseExprList()
      if parseExpr() then while match "," do mustParseExpr() end end
      return true
    end
  
    local function mustParseExprList()
      if parseExpr() then while match "," do mustParseExpr() end
                     else invent "<expr>"
      end
    end
    
    local function parseTableConstructor()
      repeat
        local doit, isId = mustParseExpr, match "id"
        if isId then if peek "=" then setTag "keydef"; accept()
                                 else useVar(); doit = parsePostfix
                     end
                 elseif match "[" then mustParseExpr(); mustMatch "]"; mustMatch "="
                 elseif match "}" then return true
        end
        doit()
      until not (match "," or match ";")
      mustMatch "}"
      return true
    end

  ----------------------------------------------------------------------------------------------------
  -- Statement parsing
  ----------------------------------------------------------------------------------------------------  

    local stmtTab = {}
  
    -- The logic below is pretty specific; all the expect sets that do not contain
    -- end are singleton sets (expectUntil and expectEOF). Madly inventing block
    -- terminators might be a really bad idea, but it is hard to come up with anything
    -- more general.
    -- 
    -- Cases:
    -- 1.  "if foo then return x -- missing end -- rest of chunk"
    --     "if foo then return x -- extra statement -- end"
    -- 
    -- 2.  "repeat if x then if y then break end end -- extra end -- until nomore()"
    --     "repeat if x then if y then break end end -- missing until -- rest of chunk"
    --        
    -- In the first case, we invent the end, although it may not be the wisest thing;
    -- it seems like the more probable error. In the second case, we skip the end on
    -- the basis that it is likely to get us into less trouble (although it is possible
    -- that the error was repeat something while, which we're not going to come close
    -- to detecting.
    --
    -- If we hit an unexpected EOF, we *must* invent something. Unless we're in a 
    -- repeat block, we can get away with inventing an end, so we go with the
    -- same heuristic as before: hence the following function.
    
    local function inventSomething(expect)
      local invented = "end"
      if not expect[invented] then for k in expect do invented = k; break end end
      invent(invented)
      return invented
    end
  
    local function popScope(tt)
      setTagData(currentScope)
      currentScope = currentScope.parent
      return tt
    end
  
    -- "notend" here means something that is not a mandatory last statement, i.e. return or break.
    -- Perhaps that was a bad choice of terminology.
    local function parseChunk(expect)
      local endfound
      repeat
        for notend in matchSet, stmtTab do
          match ";"
          if notend then
            if endfound then return popScope(inventSomething(expect)) end
           else
            endfound = true
          end
        end
        -- we have found something that is not a statement. If we were looking for it, fine.
        -- Otherwise, we want to synch the parse; we use all statements plus all terminators
        -- as synchronisers. Possibly, we shouldn't use a terminator which is not on the stack.
        if expect[ttype] then
          local tt = ttype
          accept()
          return popScope(tt)
        end
        if ttype == "EOF" then return popScope(inventSomething(expect)) end
        newTag("skip", "")
        repeat accept() until synch[ttype]
        lasttag = tagAfter("endskip", "")
      until nil -- This repeat is terminated by returning.
    end
  
    -- This depends on the fact that the ellipsis and colon flags are actually tag indices
    local function parseFunctionBody(scope, colon)
      local nvar, ellipsis
      currentScope = scope
      if colon then 
        newVar("self", scope, colon)
      end
      if match "(" then
        nvar, ellipsis = matchVarList(currentScope)
        mustMatch ")"
       else invent "()"
      end
      if ellipsis then newVar("arg", scope, ellipsis) end
      parseChunk(expectEnd)
      return true
    end
  
  ----------------------------------------------------------------------------------------------------
  -- Fill in the tables now that everything is defined
  ----------------------------------------------------------------------------------------------------  
    for k in unop do exprValTab[k] = mustParseExpr end
    for k in const do exprValTab[k] = parseBinop end
    for k in binop do exprPostTab[k] = mustParseExpr; exprBinopTab[k] = mustParseExpr end
    do
      local function fakeEqual() setTag "meantequal"; return mustParseExpr() end
      exprPostTab["="] = fakeEqual
      exprBinopTab["="] = fakeEqual
    end
  
    exprValTab["{"] =         function() parseTableConstructor();         return parseBinop() end
    exprValTab["function"] =  function()
                                local scope = newScope(currentScope, true)
                                setTagData(scope)
                                parseFunctionBody(scope)
                                                                          return parseBinop() end
    exprValTab["("] =         function() mustParseExpr(); mustMatch ")";  return parsePostfix() end
    exprValTab["id"] =        function() useVar();                        return parsePostfix() end
    
    exprPostTab["string"] =                                                      parsePostfix
    exprPostTab["{"] =        function() parseTableConstructor();         return parsePostfix() end
    exprPostTab["("] =        function() parseExprList(); mustMatch ")";  return parsePostfix() end
    exprPostTab["["] =        function() mustParseExpr(); mustMatch "]";  return parsePostfix() end
    exprPostTab["."] =        function() mustMatchKey();                  return parsePostfix() end
    exprPostTab[":"] =        function() mustMatchKey();                  return parseCall() end
   
    exprCallTab["string"] =                                                      parsePostfix
    exprCallTab["{"] =        function() parseTableConstructor();         return parsePostfix() end 
    exprCallTab["("] =        function() parseExprList(); mustMatch ")";  return parsePostfix() end
  
    exprPostOnlyTab["string"] = function()                                return parsePostfixOnly "call" end  
    exprPostOnlyTab["{"] =    function() parseTableConstructor();         return parsePostfixOnly "call" end
    exprPostOnlyTab["("] =    function() parseExprList(); mustMatch ")";  return parsePostfixOnly "call" end
    exprPostOnlyTab["["] =    function() mustParseExpr(); mustMatch "]";  return parsePostfixOnly "index" end
    exprPostOnlyTab["."] =    function() mustMatchKey();                  return parsePostfixOnly "key" end
    exprPostOnlyTab[":"] =    function() mustMatchKey();                  return parseCallOnly() end
    
    exprCallOnlyTab["string"] = function()                                return parsePostfixOnly "call" end
    exprCallOnlyTab["{"] =    function() parseTableConstructor();         return parsePostfixOnly "call" end 
    exprCallOnlyTab["("] =    function() parseExprList(); mustMatch ")";  return parsePostfixOnly "call" end
  
    -- The logic here is byzantine at best. Sigh.
    -- parseSimpleTermStarting{Id, Paren} return one of expr, var, call, index, or key, depending
    -- on what the last postfix operator was. expr and var can only be returned if there was
    -- no postfix operator; the first is always an error. Since the last token will have been
    -- accept()'ed, lasttag refers to it, whatever it might be. This is only use in the cases of
    -- key and var. (index refers to the case [...]).
    do
      local callOrAssignTab
      
      local function parseAssignOrCall(termParser)
        if not callOrAssignTab[termParser()]() then
          while match "," do
            if match "id" then termParser = parseSimpleTermStartingId
             elseif match "(" then termParser = parseSimpleTermStartingParen
             else invent "= <val>"; return -- resynch
            end
            if callOrAssignTab[termParser()](true) then invent ".<key>" end
          end
          mustMatch "="
          mustParseExprList()
        end
        return true
      end
  
      -- return true if it is a call, possibly invented
      callOrAssignTab = {
        expr = function(wantsAssign) if not wantsAssign then invent "()" end; return true end,
        var = function() useVar(true) end,
        key = function() defKey() end,
        index = alwaysNil,
        call = alwaysTrue
      }
  
      stmtTab["("] = function() return parseAssignOrCall(parseSimpleTermStartingParen) end
      stmtTab["id"] = function() return parseAssignOrCall(parseSimpleTermStartingId) end
    end
      
    stmtTab["break"] = function()
      local s = currentScope.vars["break"]
      if s and s.scope.fnDepth == currentScope.fnDepth then
        setTagData(s.scope)
       else
        setTag "invalid"
      end
      return false
    end
    
    stmtTab["return"] = function()
      setTagData(currentScope.vars["return"].scope)
      parseExprList()
      return false
    end
  
    local function parseDo(scope)
      currentScope = scope or newScope(currentScope)
      setTagData(currentScope)
      parseChunk(expectEnd)
      return true
    end
    
    stmtTab["do"] = parseDo
    
    stmtTab["for"] = function()
      local scope = newScope(currentScope)
      newVar("break", scope)
      setTagData(scope)
      local nvar, ellipsis = matchVarList(scope)
      if ellipsis then setTag "invalid" end
      if nvar == 0 then invent "<var>"; nvar = 1 end
      if match "in" then setTagData(scope)
       elseif match "=" then
        setTagData(scope)
        if nvar ~= 1 then setTag "invalid" end
       else invent "in" -- take a wild guess
      end
      mustParseExprList()
      if mustMatch "do" then setTag "doclause" end
      return parseDo(scope)
    end
    
    stmtTab["while"] = function()
      local scope = newScope(currentScope)
      newVar("break", scope)
      setTagData(scope)
      mustParseExpr()
      if mustMatch "do" then setTag "doclause" end
      return parseDo(scope)
    end
  
    stmtTab["repeat"] = function()
      currentScope = newScope(currentScope)
      newVar("break", currentScope)
      setTagData(currentScope)
      parseChunk(expectUntil)
      mustParseExpr()
      return true
    end
  
    -- mustMatchKey returns the key's tag (even if the key was invented).
    local dotColonTab = {
      ["."] = function() mustMatchKey(); return false end,
      [":"] = mustMatchKey
    }
    
    stmtTab["function"] = function()
      local scope = newScope(currentScope, true)
      setTagData(scope)
      local hasColon, isIndex 
      mustMatch "id"
      useVar(not dotColonTab[ttype])
      for colonTag in matchSet, dotColonTab do
        isIndex = true
        if colonTag then hasColon = colonTag; break end
      end
      if isIndex then defKey() end
      return parseFunctionBody(scope, hasColon)
    end
  
    -- TODO Reread the todo at the beginning of the file.
    -- The problem here is that the setTagData might get applied to a missing tag, if error
    -- recovery has inserted one. So we really only want to setTagData if the tag was for real.
    -- But there is probably a better way to do that.
    stmtTab["if"] = function()
      local clause
      local idScope = newScope(currentScope)
      repeat
        setTagData(idScope)
        currentScope = idScope
        mustParseExpr()
        mustMatch "then"
        setTagData(idScope)
        currentScope = newSemiScope(idScope)
        clause = parseChunk(expectElseif)
      until clause ~= "elseif"
      if clause == "else" then
        setTagData(idScope)
        currentScope = newSemiScope(idScope)
        parseChunk(expectEnd)
      end
      setTagData(idScope)
      currentScope = idScope.parent
      return true
    end
  
    stmtTab["local"] = function()
      local defScope = newSemiScope(currentScope)
      setTagData(defScope)
      if match "function" then
        currentScope = newScope(defScope, true)
        setTagData(currentScope)
        matchNewVar(defScope)
        -- FIXME This was another call to newScope but I am almost certain that was wrong.
        return parseFunctionBody(currentScope)
       else
        local nvar, ellipsis = matchVarList(defScope)
        if ellipsis then setTag "invalid"  end
        if nvar == 0 then invent "<var>"; nvar = 1 end
        if match "=" then mustParseExprList() end
        currentScope = defScope
      end
      return true
    end

  ----------------------------------------------------------------------------------------------------
  -- Actually parse the input
  ----------------------------------------------------------------------------------------------------  

    next()
    parseChunk(expectEOF)
    Tag.n, Tagval.n = ntag, ntag
    return Tag, Tagval
  end

  local function fromTag(t, _) return t end
  local function fromData(_, td) return td end
  local function fromName(_, td) return td.name end
  
  local haveScope = setFrom [[
    break    do       else     elseif   end      for
    function if       in       local    repeat   return
    then     until    while    doclause EOF
  ]]
  
  local display = {}
  
  setMulti(display, fromData, words [[
     key      keydef     number   escape   string
     comment  preprocessor        whitespace
     invalid  meantequal missing  id
  ]])
  
  setMulti(display, fromTag, words [[
     +        -        *        /        ^        =
     ~=       <=       >=       <        >        ==
     (        )        {        }        [        ]
     ;        :        ,        .        ..       ...
     and      false    nil      not      or       true
     break    do       else     elseif   end      for
     function if       in       local    repeat   return
     then     until    while
  ]])
  
  setMulti(display, fromName, words [[var varset]])
  
  setMulti(display, alwaysEmpty, words [[EOF skip endskip]])
  
  function display.doclause() return "do" end
  
  setmetatable(display, {__index = function(_, k)
                                     error ("Internal error in luaparser: returned tag '"..tostring(k).."'")
                                   end})

  -- Do not call these within a resynchronisation sequence!  
  function pkg.textFor(t, td) return display[t](t, td) end
  function pkg.scopeFor(t, td) return haveScope[t] and td end

  -- Although this is only for debugging, you never know when they might not be useful.
  
  function pkg.show(Tag, TagData)
    for i = 1, Tag.n do
      io.write(string.format("%s: %q\t%q\n", Tag[i], pkg.textFor(Tag[i], TagData[i]), tostring(TagData[i])))
    end
  end

  return pkg
end

--
-- This parser was written for colourising/linting, and takes a rather SAX like approach, which
-- is possibly less convenient for semantic parsing. It should be possible to completely
-- reconstruct the input from the results of parsing.
--
-- It is called with a lexer (created by blex) and a globals table.
--
-- Two vectors are returned, tag and tagdata, both indexed by tag number. tag is the tagtype;
-- tagdata is some piece of relevant information. All tokens have "text"; you can get this
-- with .textFor(tag[i], tagdata[i]). Many tokens have a reference to a scope; the function
-- .scopeFor(tag[i], tagdata[i]) will return the scope if it is a token-type with a scope
-- annotation.
--
--
-- Note: within a skip range, these functions will not work; the text is always tagdata[i]
--       and there is no scope even if the tagtype "should" have on. These interfaces are
--       under review in an attempt to make this less error-prone.
--
-- Herewith a complete list of tags and tagdatas:
--
-- Lua punctuation, tag and tagdata is identical
--    +        -        *        /        ^        =
--    ~=       <=       >=       <        >        ==
--    (        )        {        }        [        ]
--    ;        :        ,        .        ..       ...
-- Lua keywords whose tag and tagdata are identical
--    and      false    nil      not      or     true
-- Lua keywords whose tagdata is a scope
--    break    do       else     elseif   end      for
--    function if       in       local    repeat   return
--    then     until    while
--                   scope objects have the following keys:
--                     .depth     scope nesting depth (global = 0)
--                     .fnDepth   function nesting depth
--                     .global    true if a (the) global scope
--                     .parent    the parent scope (see note)
--                     .vars      a table of vars defined in this scope
-- Pseudo keywords whose tagdata is a scope
--    doclause       used to distinguish between the do statement
--                   and the do clause in for and while statements
--    EOF            the last tag
-- Variables, tagdata is a var object.
--    var      varset
--                   varset is used if the var is being assigned to.
--                   A var object is a table with the following keys:
--                     .name      variable name
--                     .scope     scope in which declared
--                     .used      true if the value has been accessed
--                     .set       true if the value has been changed
--                                (local foo=3 sets neither used nor set)
--                     .closed    true if the variable is referenced in
--                                an interior function scope.
--                     .def       The tag index for the declaration of local variables,
--                                including implicit local variables (self and arg).
--                                WARNING: Subject to change.
--                                .def is currently not always a number; global variables have
--                                .def set to "STDLIB" or nil; "return" and "break" pseudovariables
--                                have .def set to nil.
--                     .global    true if the variable is global.
-- Constant types whose tagdata is the constant
--    key      keydef   number
--                   key is used for barewords which are actually table keys (other than global
--                   variables). keydef is used instead if the key is being used in an assignment of
--                   some kind (including table construction).
--                   A keydef tag might be the target of a var object (in the case of implicit "self")
--                   This is not flagged, so all keydefs should be considered potential targets.
-- Strings
--    escape         A \-escape sequence
--    string         the entire string for long strings, or a part of a short string. In both cases,
--                   this includes the string delimiter (', " or [[ ]])
--
--                   If there is missing delimiter, a missing token will be inserted (with the
--                   missing delimiter in its tagdata) followed by a string token with an empty
--                   tagdata.
--
--                   In Lua, it is valid to have two strings in a row, but there is no way of
--                   distinguishing this in the parser output. This might be considered a bug.
-- Syntactic non-semantic objects whose tagdata is the input
--    comment  preprocessor  whitespace
-- Error markers
--    meantequal     The input was = where it probably should have been ==. The tagdata is =
--    missing        one or more pseudo-tokens inserted to make the parse work.
--                   Under some circumstances, it may be necessary to insert whitespace
--                   before or after a missing token.
--    invalid        an unrecognised character, a sequence of at least four dots, three dots
--                   in a context where ellipses are not permitted, an "=" in a "for" statement
--                   with more than one control variable, or a break outside of a loop construct.
--                   The tagdata is the original input.
--                   In the second last case, it is not clear that it is the "=" that was the
--                   error, but the parser is too lazy to go back and find the erroneous
--                   control variables.
-- Parse Resynchronisation
--    endskip  id       skip
--                   Parse resynchronisation takes place when things get really confused.
--                   During resynchronisation,  tokens are skipped until a synchronising symbol
--                   is found; this is generally a statement keyword.
--                   skip and endskip markers (with empty tagdata) surround the resynchronisation
--                   sequence. All tags inside the sequence have their input in the tagdata.
--                   During resynchronisation, it is impossible to distinguish between variables and
--                   keys, so both of them are tagged as id.
-- Things which if you find them indicate that the parser is broken
--    call     expr     index
--    id (outside of a skip sequence)
--    (or anything else)
----------------------------------------------------------------------------------------------------

Produced by TNT, the Lua-linter. TNT/0.5 Copyright (C) 2004 Rici Lake