lparse.lua

-- Lua parser. v0.5

-- Copyright 2004 by Rici Lake. Permission is granted to use this code under
-- the same terms and conditions as found in the Lua copyright notice at
-- http://www.lua.org/license.html.
--
-- I'd appreciate a credit if you use this code, but I don't insist. 
--
-- This is prerelease software, no interfaces are guaranteed.
-- Use at your own risk, etc.

-- Documentation is at the end :)
  
-- TODO
--
--  The reuse of TagData for this or that or the other was a cutting of corners,
--  which as usual has come back to haunt; the handling of "missing" is a
--  perfect illustration. So the interfaces will need to change to use a
--  three-part tag: type, original, scope-or-name.
--
--  But what it really ought to do is become a coroutine-based generator.
--  The only issue here is proper handling of deferred tags (comments,
--  whitespace, skips); none of these have scope/var info, so only
--  two tables should be necessary; we'd also need to keep all info
--  for the "current tag" because we can't release it until we're done
--  with it (which is when another tag becomes current, or we're done.)
--
--  Doing a parse requires setting up a ton of tables, functions with upvalues,
--  etc. While that may not be a Bad Thing, in and of itself, it ought to be
--  possible to reuse a parser once it has all been created.
--
--  The globals table ought to have at least one level of subkeys, which
--  means the parser has to get a bit cleverer; probably all that term parsing
--  stuff should just be ripped out and replaced with a simple and complete
--  precedence parser; set/use could then be done on the expression parse tree.
--  
--  Try to remember what the hell I was thinking about when I put "see note" in
--  the tag documentation (referring to scopes) and then write the note.
--
--  If statements end up working around the interface with parseChunk and 
--  Scopes. parseChunk sets the TagData of the terminating symbol to
--  currentScope and then pops currentScope. But if statements use a fake
--  scope to keep the clauses linked, so this doesn't really work. (But it
--  is probably right everywhere else.)
--
--  Try to modularise a bit. Symtab/scope now has no external references.
--  Lexing depends only on newTag. Tagging is still deeply entrenched.
--  Perhaps this could be solved by adding newTag to the blex structure.

--  Anchor scopes the same way variables are anchored. 

--  Put the filename or some such indicator in the file scope object, so that
--  cross-file links can be constructed.

--  Try to handle SLSLPP colourising better. Right now, $ (USER) is not
--  recognised at all. The "right" way to do it would be to have two separate
--  parse streams running in parallel, although even then it will be hard to
--  get the unpreprocessed stuff right if people are doing anything even
--  moderately complicated. There is an interesting attempt to do php
--  "properly", making the HTML and the php indentations independent --
--  I saw a reference to it on comp.compilers. Maybe worth looking at.

--  Longer term: To integrate with real-time editing, we should be able to
--  tokenise incrementally, line per line. The trick would be to save the
--  lexer state for each line (there are only four states) and keep the lines
--  as vectors of tokens. Retokenising would only be needed for lines that
--  are changed (presumably, the editor would let us know) and tokenising could
--  stop at the first unchanged line where the states match.

--  Add some kind of LuaDOC comment parsing

--  Make it easier to construct parse trees, not just lint/tag

-------------------------------------------------------------------------
-- Utilities
-------------------------------------------------------------------------
local util = import "libutil"
local setFrom, setMulti, words 
      = util.setFrom, util.setMulti, util.words

local function ALWAYS(thing) return function() return thing end end
local alwaysTrue = ALWAYS(true)
local alwaysEmpty = ALWAYS ""
local function alwaysNil() end

-------------------------------------------------------------------------
-- Symbol table stuff
-------------------------------------------------------------------------
local function newVar(name, scope, anchor)
  local self = {name = name, scope = scope, def = anchor, global = scope.global}
  scope.vars[name] = self
  return self
end

local function newScope(outer, isFunction)
  local self = {
    fnDepth = outer.fnDepth + (isFunction and 1 or 0),
    depth = outer.depth + 1,
    vars = setmetatable({}, {__index = outer.vars}),
    parent = outer
  }
  if isFunction then newVar("return", self) end
  return self
end

local function newSemiScope(scope)
  return {
    fnDepth = scope.fnDepth,
    depth = scope.depth,
    vars = setmetatable({}, {__index = scope.vars}),
    parent = scope.parent
  }
end

local function lookupVar(name, scope, isSet, globalScope)
  local rv = scope.vars[name]
  if rv then
    if scope.fnDepth ~= rv.scope.fnDepth then rv.closed = not rv.global end
   else
    rv = newVar(name, globalScope)
    -- This is the only place we create global variables; the badGlobal key
    -- is here in order to help with linting. This will have to change to
    -- handle multiple file linting, where reference before define is OK.
    if not isSet or scope.fnDepth > 1 then rv.badGlobal = true end
  end
  scope.vars[name] = rv
  if isSet then rv.set = true else rv.used = true end
  return rv
end

local function newGlobalScope(globals)
  local vars = {}
  local globalScope = {
    fnDepth = 0,
    depth = 0,
    global = true,
    vars = vars
  }
  for k in globals do vars[k] = newVar(k, globalScope, "STDLIB") end
  return globalScope
end

-------------------------------------------------------------------------
-- Lua token sets                                                      --
-------------------------------------------------------------------------
local keyword = setFrom [[
  and      break    do       else     elseif   end
  false    for      function if       in       local
  nil      not      or       repeat   return   then
  true     until    while    

  +        -        *        /        ^        =
  ~=       <=       >=       <        >        ==
  (        )        {        }        [        ]
  ;        :        ,        .        ..       ...
]]

local synch = setFrom [[
  break    do       else     elseif   end      for
  function if       local    repeat   return   until
  while    EOF
]]

local expectUntil  = setFrom [[until]]
local expectElseif = setFrom [[elseif else end]]
local expectEnd    = setFrom [[end]]
local expectEOF    = setFrom [[EOF]]

return function(pkg)
  function pkg.parse(lex, globals)
    local ntag, Tag, Tagval = 0, {}, {} 
    local ttype, token, lasttag
    local luapats
  
  -------------------------------------------------------------------------------
  -- variable scoping
  -------------------------------------------------------------------------------
    local globalScope = newGlobalScope(globals)
    local currentScope = newScope(globalScope, true)
    newVar("return", currentScope)
    
  -------------------------------------------------------------------------------
  -- tagging strategy for colourising.
  -------------------------------------------------------------------------------
  -- lasttag is the last "meaningful" tag; it is not set in the case of comments
  -- or whitespace. It's not a very clean interface, really, but then it's not a
  -- very clean concept either. This really ought to be replaced by a "commit"
  -- type interface, so we can return tags when we know nothing will change.
  --
  -- newTag does not set lasttag, which allows spaces and comments to insert
  -- themselves. "skip" is also inserted with newTag.
  -- tagAfter inserts a tag after lasttag; this allows "missing" and "endskip"
  -- to get inserted in the supposedly correct place. Both of these update
  -- lasttag ("endskip" does it manually, thereby breaking encapsulation). 
  
    local function newTag(tt, val)
      ntag = ntag + 1
      Tag[ntag], Tagval[ntag] = tt, val or lex.token()
      return ntag
    end
    
    local function tagAfter(tt, val)
      local idx = lasttag + 1
      for i = ntag, idx, -1 do Tag[i + 1], Tagval[i + 1] = Tag[i], Tagval[i] end
      Tag[idx], Tagval[idx], ntag = tt, val, ntag + 1
      return idx
    end
    
    local function setTag(tt)
      Tag[lasttag] = tt
    end
  
    local function tag() return Tag[lasttag] end
  
    -- hack, this is not the right way to do things
    local function setTagData(val)
      if Tag[lasttag] ~= "missing" then
        Tagval[lasttag] = val
      end
    end
  
    local function tagData() return Tagval[lasttag] end
    
    -- FIXME Should invent really set lasttag to where the missing is?
    local function invent(toke)
      lasttag = tagAfter("missing", toke)
    end
    
    local function badchar(self)
      if self.match1"." then
        newTag "invalid"
        return self.lex(luapats)
      end
    end
    
    local function next()
      ttype, token = lex.lex(luapats, badchar)
      if not ttype then ttype = "EOF" end
    end
  
    -- accept must return lasttag (to avoid making lasttag visible in too
    -- many places) which means that match also returns lasttag (or nil)
    local function accept() lasttag = newTag(ttype, token); next(); return lasttag end
    local function peek(tt) return ttype == tt end
    local function match(tt) if ttype == tt then return accept() end end
    local function mustMatch(tt) if ttype == tt then return accept() else invent(tt) end end
    local function matchSet(ttab) local tt = ttab[ttype]; if tt then accept(); return tt() end end
  
    -- FIXME should this be var or varset? I think var is correct.
    local function matchNewVar(scope)
      if ttype == "id" then
       lasttag = newTag("var", token)
       setTagData(newVar(token, scope, lasttag))
       next()
       return true
      end
    end
    
    -- This returns lasttag for the benefit of dotColonTab
    local function mustMatchKey()
      if ttype == "id" then lasttag = newTag("key", token); next()
                       else invent "<key>"
      end
      return lasttag
    end
  
  -- for ... , local ... and function ( ... ) are all slightly different.
  -- but we use this one for all of them anyway. Returns #var, has_...
    local function matchVarList(scope)
      if not matchNewVar(scope) then return 0, match "..." end
      local rv = 1
      while match "," do
        rv = rv + 1
        if not matchNewVar(scope) then
          if match "..." then return rv - 1, lasttag
                         else invent "<var>"
          end
        end
      end
      return rv
    end
    
    -- We need to test the tag because the "key" might have been invented.
    local function defKey() if tag() == "key" then setTag "keydef" end end
    local function useVar(isDef)
      setTag(isDef and "varset" or "var")
      setTagData(lookupVar(tagData(), currentScope, isDef, globalScope))
    end
  
  ----------------------------------------------------------------------------------------------------
  -- Lexer
  ----------------------------------------------------------------------------------------------------
  
  -- TODO: The whole point of this block is to create luapats; the only upvalues here are the
  --       constant table keyword and the function newTag. <keyword> could actually be defined
  --       here, since it is the only place it is used. (It also refers to a few utility
  --       functions). So: how do we parameterise this with newTag and export it as a 
  --       separate lexing package?
    do
      local function numberExtend(self)
        self.match1("^E[-+]?%d+")
        return "number"
      end
      
      local escapepats = {
         ["^\\%d%d?%d?"]           = ALWAYS "escape",
         ["^\\."]                  = ALWAYS "escape",
         ["^[^'\"\\\n]*"]          = ALWAYS "string",
         ["^([\"'])"]              = function(_, c) return "string", c end
      }
  
  -- Changing self.tokens to self.extends is most of what is needed to consolidate
  -- strings, if that is what is wanted. (Plus copying the error stuff from longExtend) 
      local function stringExtender(q)
        return function(self)
          newTag "string"
          for f, r in self.tokens(escapepats, alwaysNil) do
            if r == q then return "string"
                      else newTag(f)
            end
          end
          newTag("missing", q)
          return "string"
        end
      end
      
      local function longExtend(self, ttype)
        local depth = 1
        while depth > 0 do 
          self.match1("^[^%[%]]*")
          if      self.match1("^%[%[") then depth = depth + 1
           elseif self.match1("^%]%]") then depth = depth - 1
           elseif not self.match1(".") then
            newTag(ttype)
            newTag("missing", string.rep("]]", depth))
            -- Zap the token. cursor is one past the end of the input.
            self.setToken(-1)
            break
          end
        end
        return ttype
      end
      
      local function checkSymbol(self, op)
        if keyword[op] then return op, op
                       else newTag("invalid", op); return self.lex(luapats, badchar)
        end
      end
      
      -- defined local above  
      luapats = {
         ["^([%a_][%w_]*)"]        =  function(_, word) return (keyword[word] and word or "id"), word end,
         ["^%s+"]                  =  function(self)
                                        newTag "whitespace"
                                        return self.lex(luapats, badchar)
                                      end,
         ["^%-%-"]                 =  function(self)
                                        if self.match1("^%[%[") then
                                          longExtend(self, "comment")
                                         else self.match1 "[^\n]*"
                                         --else repeat self.match1 "[^\n]*" until not self.match1 "^%s*%-%-"
                                        end
                                        newTag "comment"
                                        return self.lex(luapats, badchar)
                                      end,
         ["^#"]                    =  function(self)
                                        self.match1 "[^\n]*"
                                        newTag "preprocessor"
                                        return self.lex(luapats, badchar)
                                      end,
         ["^%[%["]                 =  function(self) return longExtend(self, "string") end,
         ["^%d+%.?%d*"]            =  numberExtend,
         ["^%.%d+"]                =  numberExtend,
         ["^([(){}%[%]+%-*/^;:,])"]=  checkSymbol,
         ["^([<>=~]=?)"]           =  checkSymbol,
         ["^(%.+)"]                =  checkSymbol,
         ["^'"]                    =  stringExtender "'",
         ['^"']                    =  stringExtender '"',
      }
    end
  
  ----------------------------------------------------------------------------------------------------
  -- Expression parsing
  ----------------------------------------------------------------------------------------------------  
  
    -- Expression parsing has the following contexts:
    -- 1. (expr)   unop / string / { tC / function fB / number / const / id / ( expr ) / other
    --               1      3        3        3            3       3      2       2      need <expr>
    -- 2. (postfix) string / { tC / ( exprlist ) / . tag /  [ expr ] / : tag / binop / other
    --                2        2          2           2         2        4       1     accept
    -- 3. (binop)   binop  / other
    --                1      accept
    -- 4. (call)    string / { tC / ( exprlist ) / other
    --                 2       2         2         need ()
    --
    -- In the statement case, binops are not permitted, and state 1 never reappears, while state 3 reduces
    -- to accept. So we define states 2* and 4* (PostfixOnly and CallOnly).
    
    local binop = setFrom [[+ - * / ^ ~= <= >= < > == .. and or]]
    local unop  = setFrom [[- not]]
    local const = setFrom [[number string false nil true]]
    
    local exprValTab, exprBinopTab, exprPostTab, exprCallTab, exprPostOnlyTab, exprCallOnlyTab =
          {}, {}, {}, {}, {}, {}
  
    local function parseExpr() return matchSet(exprValTab) end
    -- The following are defined to allow for tailcalling.
    local function mustParseExpr()
      local fn = exprValTab[ttype]
      if fn then accept(); return fn()
            else invent "<expr>"
      end
    end
   
    local function parseBinop()
      local fn = exprBinopTab[ttype]
      if fn then accept(); return fn()
            else return true
      end
    end
   
    local function parsePostfix()
      local fn = exprPostTab[ttype]
      if fn then accept(); return fn()
            else return true
      end
    end
   
    local function parseCall()
      local fn = exprCallTab[ttype]
      if fn then accept(); return fn()
            else invent "()"; return parsePostfix()
      end
    end
  
  -- In Lua, the simple term production only occurs on the lhs of assignments
  -- and in call statements.
    local function parsePostfixOnly(orthis)
      local fn = exprPostOnlyTab[ttype]
      if fn then if orthis == "var" then useVar() end; accept(); return fn()
            else return orthis
      end
    end
    
    local function parseCallOnly()
      local fn = exprCallOnlyTab[ttype]
      if fn then accept(); return fn()
            else invent "()"; return parsePostfixOnly "call"
      end
    end
  
    local function parseSimpleTermStartingParen()
      mustParseExpr()
      mustMatch ")"
      return parsePostfixOnly "expr"
    end
    
    local function parseSimpleTermStartingId()
      return parsePostfixOnly "var"
    end
  
    local function parseExprList()
      if parseExpr() then while match "," do mustParseExpr() end end
      return true
    end
  
    local function mustParseExprList()
      if parseExpr() then while match "," do mustParseExpr() end
                     else invent "<expr>"
      end
    end
    
    local function parseTableConstructor()
      repeat
        local doit, isId = mustParseExpr, match "id"
        if isId then if peek "=" then setTag "keydef"; accept()
                                 else useVar(); doit = parsePostfix
                     end
                 elseif match "[" then mustParseExpr(); mustMatch "]"; mustMatch "="
                 elseif match "}" then return true
        end
        doit()
      until not (match "," or match ";")
      mustMatch "}"
      return true
    end

  ----------------------------------------------------------------------------------------------------
  -- Statement parsing
  ----------------------------------------------------------------------------------------------------  

    local stmtTab = {}
  
    -- The logic below is pretty specific; all the expect sets that do not contain
    -- end are singleton sets (expectUntil and expectEOF). Madly inventing block
    -- terminators might be a really bad idea, but it is hard to come up with anything
    -- more general.
    -- 
    -- Cases:
    -- 1.  "if foo then return x -- missing end -- rest of chunk"
    --     "if foo then return x -- extra statement -- end"
    -- 
    -- 2.  "repeat if x then if y then break end end -- extra end -- until nomore()"
    --     "repeat if x then if y then break end end -- missing until -- rest of chunk"
    --        
    -- In the first case, we invent the end, although it may not be the wisest thing;
    -- it seems like the more probable error. In the second case, we skip the end on
    -- the basis that it is likely to get us into less trouble (although it is possible
    -- that the error was repeat something while, which we're not going to come close
    -- to detecting.
    --
    -- If we hit an unexpected EOF, we *must* invent something. Unless we're in a 
    -- repeat block, we can get away with inventing an end, so we go with the
    -- same heuristic as before: hence the following function.
    
    local function inventSomething(expect)
      local invented = "end"
      if not expect[invented] then for k in expect do invented = k; break end end
      invent(invented)
      return invented
    end
  
    local function popScope(tt)
      setTagData(currentScope)
      currentScope = currentScope.parent
      return tt
    end
  
    -- "notend" here means something that is not a mandatory last statement, i.e. return or break.
    -- Perhaps that was a bad choice of terminology.
    local function parseChunk(expect)
      local endfound
      repeat
        for notend in matchSet, stmtTab do
          match ";"
          if notend then
            if endfound then return popScope(inventSomething(expect)) end
           else
            endfound = true
          end
        end
        -- we have found something that is not a statement. If we were looking for it, fine.
        -- Otherwise, we want to synch the parse; we use all statements plus all terminators
        -- as synchronisers. Possibly, we shouldn't use a terminator which is not on the stack.
        if expect[ttype] then
          local tt = ttype
          accept()
          return popScope(tt)
        end
        if ttype == "EOF" then return popScope(inventSomething(expect)) end
        newTag("skip", "")
        repeat accept() until synch[ttype]
        lasttag = tagAfter("endskip", "")
      until nil -- This repeat is terminated by returning.
    end
  
    -- This depends on the fact that the ellipsis and colon flags are actually tag indices
    local function parseFunctionBody(scope, colon)
      local nvar, ellipsis
      currentScope = scope
      if colon then 
        newVar("self", scope, colon)
      end
      if match "(" then
        nvar, ellipsis = matchVarList(currentScope)
        mustMatch ")"
       else invent "()"
      end
      if ellipsis then newVar("arg", scope, ellipsis) end
      parseChunk(expectEnd)
      return true
    end
  
  ----------------------------------------------------------------------------------------------------
  -- Fill in the tables now that everything is defined
  ----------------------------------------------------------------------------------------------------  
    for k in unop do exprValTab[k] = mustParseExpr end
    for k in const do exprValTab[k] = parseBinop end
    for k in binop do exprPostTab[k] = mustParseExpr; exprBinopTab[k] = mustParseExpr end
    do
      local function fakeEqual() setTag "meantequal"; return mustParseExpr() end
      exprPostTab["="] = fakeEqual
      exprBinopTab["="] = fakeEqual
    end
  
    exprValTab["{"] =         function() parseTableConstructor();         return parseBinop() end
    exprValTab["function"] =  function()
                                local scope = newScope(currentScope, true)
                                setTagData(scope)
                                parseFunctionBody(scope)
                                                                          return parseBinop() end
    exprValTab["("] =         function() mustParseExpr(); mustMatch ")";  return parsePostfix() end
    exprValTab["id"] =        function() useVar();                        return parsePostfix() end
    
    exprPostTab["string"] =                                                      parsePostfix
    exprPostTab["{"] =        function() parseTableConstructor();         return parsePostfix() end
    exprPostTab["("] =        function() parseExprList(); mustMatch ")";  return parsePostfix() end
    exprPostTab["["] =        function() mustParseExpr(); mustMatch "]";  return parsePostfix() end
    exprPostTab["."] =        function() mustMatchKey();                  return parsePostfix() end
    exprPostTab[":"] =        function() mustMatchKey();                  return parseCall() end
   
    exprCallTab["string"] =                                                      parsePostfix
    exprCallTab["{"] =        function() parseTableConstructor();         return parsePostfix() end 
    exprCallTab["("] =        function() parseExprList(); mustMatch ")";  return parsePostfix() end
  
    exprPostOnlyTab["string"] = function()                                return parsePostfixOnly "call" end  
    exprPostOnlyTab["{"] =    function() parseTableConstructor();         return parsePostfixOnly "call" end
    exprPostOnlyTab["("] =    function() parseExprList(); mustMatch ")";  return parsePostfixOnly "call" end
    exprPostOnlyTab["["] =    function() mustParseExpr(); mustMatch "]";  return parsePostfixOnly "index" end
    exprPostOnlyTab["."] =    function() mustMatchKey();                  return parsePostfixOnly "key" end
    exprPostOnlyTab[":"] =    function() mustMatchKey();                  return parseCallOnly() end
    
    exprCallOnlyTab["string"] = function()                                return parsePostfixOnly "call" end
    exprCallOnlyTab["{"] =    function() parseTableConstructor();         return parsePostfixOnly "call" end 
    exprCallOnlyTab["("] =    function() parseExprList(); mustMatch ")";  return parsePostfixOnly "call" end
  
    -- The logic here is byzantine at best. Sigh.
    -- parseSimpleTermStarting{Id, Paren} return one of expr, var, call, index, or key, depending
    -- on what the last postfix operator was. expr and var can only be returned if there was
    -- no postfix operator; the first is always an error. Since the last token will have been
    -- accept()'ed, lasttag refers to it, whatever it might be. This is only use in the cases of
    -- key and var. (index refers to the case [...]).
    do
      local callOrAssignTab
      
      local function parseAssignOrCall(termParser)
        if not callOrAssignTab[termParser()]() then
          while match "," do
            if match "id" then termParser = parseSimpleTermStartingId
             elseif match "(" then termParser = parseSimpleTermStartingParen
             else invent "= <val>"; return -- resynch
            end
            if callOrAssignTab[termParser()](true) then invent ".<key>" end
          end
          mustMatch "="
          mustParseExprList()
        end
        return true
      end
  
      -- return true if it is a call, possibly invented
      callOrAssignTab = {
        expr = function(wantsAssign) if not wantsAssign then invent "()" end; return true end,
        var = function() useVar(true) end,
        key = function() defKey() end,
        index = alwaysNil,
        call = alwaysTrue
      }
  
      stmtTab["("] = function() return parseAssignOrCall(parseSimpleTermStartingParen) end
      stmtTab["id"] = function() return parseAssignOrCall(parseSimpleTermStartingId) end
    end
      
    stmtTab["break"] = function()
      local s = currentScope