-- libquery.lua

-- Miscellaneous HTML/HTTP Utilities

-- copyright 2004 by Rici Lake. Permission is granted to use this code under
-- the same terms and conditions as found in the Lua copyright notice at
-- http://www.lua.org/license.html.
--
-- I'd appreciate a credit if you use this code, but I don't insist. 
--
-- This is prerelease software, no interfaces are guaranteed.
-- Use at your own risk, etc.

--TODO
--  This really needs to get split into several libraries.
--  I'm just not sure what should go where.
-- 
--  There should be a better way to handle cgiGetQuery which is compatible
--  with the getQuery you would use if it were not a CGI environment.
--
--  Write test units for the unicode stuff and see if it actually works

local string = import "string"
local strfind, gfind, gsub, strchar, strbyte, strsub, strlen 
      = string.find, string.gfind, string.gsub, string.char, string.byte,
        string.sub, string.len
      
local table = import "table"
local tinsert = table.insert

-- there really ought to be a wrapper for modf as well as fmod      
local math = import "math"
local mod = math.mod

local util = import "libutil"
local tableFrom = util.tableFrom

-- Replacement text looked up by query.entify for each of the characters
-- & < and >.
-- NOTE(review): the "key => value" text is parsed by libutil's tableFrom,
-- which is not visible here; confirm how it treats whitespace.
local entity = tableFrom   [[   & => &amp;  < => &lt;   > => &gt;]]
-- Named entities looked up by unentify1 when a reference has no "#";
-- query.addEntity stores additional names in this table.
local unentity = tableFrom [[ amp => &     lt => <     gt => >   quot => "]]

-- Parse a string of hexadecimal digits into a number (nil if it is not
-- valid hexadecimal).
local function X(str)
  local value = tonumber(str, 16)
  return value
end

-- Upper bound accepted by query.isUnicode (U+10FFFF).
local lastUtf8 = X"10FFFF"
-- Inclusive bounds of the range that query.isUnicode rejects
-- (U+D800..U+DFFF, the surrogates).
local firstSurrogate = X"D800"
local lastSurrogate = X"DFFF"
-- Thresholds used by toUtf8: values up to elevenBits are written as two
-- bytes, values up to sixteenBits as three bytes.
local elevenBits = X"7FF"
local sixteenBits = X"FFFF"

-- offsetN is what fromUtf8 subtracts when decoding an N-byte sequence:
-- the lead-byte marker (C0, E0 or F0) scaled to the lead byte's position,
-- plus the 0x80 marker of every continuation byte scaled to its position.
local offset2 = X"C0" * 64 + X"80"
local offset3 = X"E0" * 4096 + X"80" * (64 + 1)
local offset4 = X"F0" * 262144 + X"80" * (4096 + 64 + 1)

 
return function(query)

  ------ HTML headers and so on ---
  
  -- Build an XML declaration string. enc names the document encoding;
  -- when it is nil or false, "ISO-8859-1" is used.
  function query.xmldecl(enc)
    local encoding = enc or "ISO-8859-1"
    return [[<?xml version="1.0" encoding="]] .. encoding .. [["?>]]
  end

--[[
  XHTML 1.1 documents should have an XML declaration. The default is
  UTF-8 or UTF-16. They also must "designate the XHTML namespace":
--]]
  -- Namespace attribute text, ready to be placed inside the <html> tag.
  query.xhtmlnamespace = [[xmlns="http://www.w3.org/1999/xhtml"]]

--[[
  Undoubtedly someone understands this better than I do. I have been
  too intimidated to actually use XHTML 1.1 up to now. But allegedly
  this will work
--]]

  -- DOCTYPE declaration for XHTML 1.1 (the string ends with a newline).
  query.xhtml11 =
[[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
     "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
]]

--[[
    XHTML 1.0 DTDs are the same as HTML 4.01 DTDs "except for changes
    due to the differences between XML and SGML". But the identifiers
    seem to be more consistent, too.
    
    In theory, these also ought to have a xmlns declaration in the
    <html> element.
--]]

  -- DOCTYPE declaration for XHTML 1.0 Strict.
  query.xhtml10strict =
[[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
]]

  -- DOCTYPE declaration for XHTML 1.0 Transitional.
  query.xhtml10transitional =
[[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
]]

  -- DOCTYPE declaration for XHTML 1.0 Frameset.
  query.xhtml10frameset = 
[[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
]]
  
--[[
    "This is HTML 4.01 Strict DTD, which excludes the presentation 
     attributes and elements that W3C expects to phase out as 
     support for style sheets matures. Authors should use the Strict
     DTD when possible, but may use the Transitional DTD when support
     for presentation attribute and elements is required."
--]]

  -- DOCTYPE declaration for HTML 4.01 Strict.
  query.html401strict = 
[[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
            "http://www.w3.org/TR/html4/strict.dtd">
]]

--[[
    "This is the HTML 4.01 Transitional DTD, which includes
     presentation attributes and elements that W3C expects to phase out
     as support for style sheets matures. Authors should use the Strict
     DTD when possible, but may use the Transitional DTD when support
     for presentation attribute and elements is required."
--]]
  
  -- DOCTYPE declaration for HTML 4.01 Transitional.
  query.html401transitional = 
[[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
     "http://www.w3.org/TR/html4/loose.dtd">
]]

--[[
    "This is the HTML 4.01 Frameset DTD, which should be
     used for documents with frames. This DTD is identical
     to the HTML 4.01 Transitional DTD except for the
     content model of the "HTML" element: in frameset 
     documents, the "FRAMESET" element replaces the "BODY" 
     element."
--]]

  -- DOCTYPE declaration for HTML 4.01 Frameset.
  query.html401frameset = 
[[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
            "http://www.w3.org/TR/html4/frameset.dtd">
]]

  ------ QUOTERS AND DEQUOTERS -----
  -- Why are there so many quoting formats, I want to know????
  -- TODO: rewrite all of these in C.
  
  -- Change pluses to spaces (queries)
  -- I wanted to call this "nonplus" (The query was nonplussed) but
  -- "un" is more consistent.
  
  -- Returns s with every "+" replaced by a space. Only the rewritten
  -- string is returned; gsub's substitution count is dropped.
  local function unplus(s)
    local spaced = gsub(s, "+", " ")
    return spaced
  end
  query.unplus = unplus
  
  -- Change %xx to octet (urls)
  
  -- Returns s with each %xx escape replaced by the octet it names.
  -- Raises an error (via assert) if a "%" is not followed by exactly two
  -- hexadecimal digits.
  local function unpercent(s)
    local function decode(hex)
      assert(strfind(hex, "^%x%x$"),
         "400 Bad Request, invalid %-escape in URL")
      return strchar(tonumber(hex, 16))
    end
    -- the pattern grabs up to two characters after the "%" so that short
    -- or malformed escapes reach the assert instead of being skipped
    local decoded = gsub(s, "%%(.?.?)", decode)
    return decoded
  end
  query.unpercent = unpercent
  
  -- Both of the above (queries)
  -- Note: You cannot use unescape if the query has no = in it.
  -- Decodes a query component: pluses become spaces first, then %xx
  -- escapes are decoded (so an escaped "+", %2B, survives as a plus).
  local function unescape(s)
    local spaced = unplus(s)
    return unpercent(spaced)
  end
  query.unescape = unescape

  ------ UNICODE STUFF -----
  
  -- Convert an integer to a UTF-8 sequence, without checking for
  -- invalid codes.
  -- This originally had calls to floor scattered about but it is
  -- not necessary: string.char does a "C" conversion from float to int,
  -- which is a truncate towards zero operation; i must be non-negative,
  -- so that is the same as floor.
  -- Encodes the non-negative integer i as a UTF-8 byte string of one to
  -- four bytes. No validity check is made on i. The divisions are left
  -- fractional on purpose: strchar is relied upon to truncate them.
  local function toUtf8(i)
    if i <= 127 then
      return strchar(i)
    end
    -- lowest six bits, shared by every multi-byte form
    local last = mod(i, 64) + 128
    if i <= elevenBits then
      return strchar(i / 64 + 192, last)
    end
    -- next six bits up, shared by the three- and four-byte forms
    local middle = mod(i / 64, 64) + 128
    if i <= sixteenBits then
      return strchar(i / 4096 + 224, middle, last)
    end
    return strchar(i / 262144 + 240,
                   mod(i / 4096, 64) + 128,
                   middle,
                   last)
  end
  query.toUtf8 = toUtf8
  
  -- Converts a UTF-8 sequence to an integer.
  -- TODO: Depending on execution order, the last case could require
  --       26 bits of precision, which is too much for single-precision
  --       floats. It should work, though, because offset 4 has only 19
  --       bits of precision, and the final result has 21.
  --       Need to check it, though.
  -- Decodes str, which must be exactly one UTF-8 sequence, into its
  -- integer value. Returns nothing when str does not match any of the
  -- accepted forms; the patterns reject overlong forms, surrogates and
  -- values above U+10FFFF.
  local function fromUtf8(str)
    -- one byte: 0..127
    if strfind(str, "^[\1-\127%z]$") then
      return strbyte(str)
    end

    -- two bytes
    if strfind(str, "^[\194-\223][\128-\191]$") then
      local b1, b2 = strbyte(str, 1), strbyte(str, 2)
      return b1 * 64 + b2 - offset2
    end

    -- three bytes; lead bytes 224 and 237 restrict the second byte
    local three = strfind(str, "^[\225-\236\238\239][\128-\191][\128-\191]$")
               or strfind(str, "^\224[\160-\191][\128-\191]$")
               or strfind(str, "^\237[\128-\159][\128-\191]$")
    if three then
      local b1, b2, b3 = strbyte(str, 1), strbyte(str, 2), strbyte(str, 3)
      return b1 * 4096 + b2 * 64 + b3
             - offset3
    end

    -- four bytes; lead bytes 240 and 244 restrict the second byte
    local four = strfind(str, "^\240[\144-\191][\128-\191][\128-\191]$")
              or strfind(str, "^[\241\242\243][\128-\191][\128-\191][\128-\191]$")
              or strfind(str, "^\244[\128-\143][\128-\191][\128-\191]$")
    if four then
      local b1, b2 = strbyte(str, 1), strbyte(str, 2)
      local b3, b4 = strbyte(str, 3), strbyte(str, 4)
      -- the offset is subtracted from the lead term first, as in the
      -- original expression, to keep intermediate values small
      return (b1 * 262144 - offset4)
             + b2 * 4096 + b3 * 64 + b4
    end
  end
  query.fromUtf8 = fromUtf8
  
  -- Returns its argument if it is a valid Unicode code point
  -- Returns i when 0 <= i <= lastUtf8 and i is outside the surrogate
  -- range; returns false otherwise.
  function query.isUnicode(i)
    if not (i >= 0 and i <= lastUtf8) then
      return false
    end
    if i >= firstSurrogate and i <= lastSurrogate then
      return false
    end
    return i
  end
  
  -- Expands one entity or character reference.
  --   hash: "" for a named entity, "#" for a numeric reference
  --   ent:  the entity name, decimal digits, or "x"/"X" plus hex digits
  -- Returns the replacement text (UTF-8 for numeric references). When the
  -- reference is unknown, malformed, or names a value that is not a valid
  -- code point (a surrogate or anything above U+10FFFF), the original
  -- "&...;" text is returned unchanged. Without that range check an
  -- oversized reference such as "&#99999999;" makes string.char raise,
  -- and surrogates would be emitted as invalid UTF-8.
  local function unentify1(hash, ent)
    local rv
    if hash == "" then
      rv = unentity[ent]
    else
      local code
      if strfind(ent, "^[%d]+$") then
        code = tonumber(ent)
      elseif strfind(ent, "^[xX][%x]+$") then
        code = tonumber(strsub(ent, 2), 16)
      end
      -- code cannot be negative here: both patterns admit digits only
      if code and code <= lastUtf8
         and not (code >= firstSurrogate and code <= lastSurrogate) then
        rv = toUtf8(code)
      end
    end
    return rv or ("&" .. hash .. ent .. ";")
  end

  -- Partial implementation of making stuff safe for HTML. This assumes
  -- that no character set conversion needs to be done.

  -- Returns str with each & < and > replaced by the text stored for it in
  -- the entity table. Quote characters are not touched.
  function query.entify(str)
    local function lookup(ch)
      return entity[ch]
    end
    local escaped = gsub(str, "[&<>]", lookup)
    return escaped
  end
  
  -- If entity declarations are available, this can be used to construct an
  -- unentity table. The standard entity tables are at
  --  http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
  --  http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
  --  http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
  -- Don't use the ones from the html4 DTDs, the regex below won't work.
  
  -- Scans definition (the text of an entity declaration file) for lines of
  -- the form  <!ENTITY name "&#NNN;">  or  <!ENTITY name "&#xHHH;">  and
  -- registers each name in the unentity table with its expansion.
  -- A declaration is only recognised when it follows a newline.
  function query.addEntity(definition)
    for name, val in gfind(definition,
      '\n<!ENTITY%s+(%w+)%s+"&#([xX]?%x+);"') do
      -- unentify1 takes (hash, ent). The captured value is always a
      -- numeric reference, so pass "#" explicitly; calling it with val
      -- alone left ent nil and raised an error on the first match.
      unentity[name] = unentify1("#", val)
    end
  end

  -- This unentifies to UTF-8. If you needed ISO-8859-1, you are out
  -- of luck. Note: we only recognize 7-bit alphanumeric entity names,
  -- which as far as I know covers all the W3C-defined ones.

  -- Returns str with every "&name;" or "&#number;" reference passed
  -- through unentify1. The optional "#" and the alphanumeric body are
  -- handed to it as two separate captures.
  function query.unentify(str)
    local decoded = gsub(str, "&(#?)([%w]+);", unentify1)
    return decoded
  end
  
  ---- QUERY HANDLING ----
  
  -- Try to do the right thing in a CGI environment. The captive web
  -- server model is somewhat different.

  -- Query readers always return something, although it might be ""
  -- Reads the query from the QUERY_STRING environment variable; returns
  -- "" when the variable is not set.
  local function queryInEnvironment()
    local qs = os.getenv "QUERY_STRING"
    if qs then
      return qs
    end
    return ""
  end
  
  -- ignore chunked transfer encoding, hopefully no-one actually uses it.
  -- Reads a form-encoded query from the request body on standard input.
  -- Returns "" when CONTENT_LENGTH is missing, non-numeric or not
  -- positive, when the media type is not
  -- application/x-www-form-urlencoded, or when nothing can be read.
  -- The media type is compared case-insensitively and any parameters
  -- (e.g. "; charset=UTF-8") are ignored; an exact string comparison
  -- rejected such otherwise valid requests.
  local function queryInBody()
    local n = tonumber(os.getenv "CONTENT_LENGTH")
    if not n or n <= 0 then
      return ""
    end
    local ctype = os.getenv "CONTENT_TYPE" or ""
    -- keep only the type/subtype part, up to the first ";" or whitespace
    local _, _, media = strfind(ctype, "^%s*([^;%s]+)")
    if not media
       or string.lower(media) ~= "application/x-www-form-urlencoded" then
      return ""
    end
    return io.stdin:read(n) or ""
  end
         
  -- Maps a request method name to the function that fetches its query.
  local queryReaders = {
    GET = queryInEnvironment,
    HEAD = queryInEnvironment,
    POST = queryInBody,
  }
  
  -- This will return nil if the request wasn't a recognised method.
  -- Perhaps this isn't flexible enough, but what the heck.
  -- Fetches the query string for the current CGI request by dispatching
  -- on REQUEST_METHOD. Returns nil when the method has no reader.
  function query.cgiGetQuery()
    local method = os.getenv "REQUEST_METHOD" or ""
    local reader = queryReaders[method]
    if not reader then
      return nil
    end
    return (reader())
  end
  
  -- Iterator factory over the "&"-separated pairs of a query string.
  -- Each step returns the unescaped key and the unescaped value; a pair
  -- without "=" yields "" as its value. Empty pairs ("&&") are skipped.
  -- Raises "403 Forbidden, missing key in URL" when a pair has no key,
  -- i.e. it starts with "=".
  local function qpairs(query)
    local fn, state, pair = gfind(query, "[^&]+")
    return function()
      pair = fn(state, pair)
      if pair then
        -- The pattern is anchored so that the key must start the pair.
        -- Unanchored, "=foo" matched from its second character and was
        -- silently reported as key "foo" with an empty value.
        local _, _, key, val = strfind(pair, "^([^=]+)=?(.*)")
        assert(key, "403 Forbidden, missing key in URL")
        return unescape(key), unescape(val)
      end
    end
  end
  query.qpairs = qpairs
  
  -- if you know all your keys are scalar, you could use this
  -- Collects the key/value pairs of a query string into a new table.
  -- When a key occurs more than once, the last value wins.
  function query.asTable(query)
    local fields = {}
    for key, value in qpairs(query) do
      fields[key] = value
    end
    return fields
  end
  
  -- The original CGI spec says that if a query string has no = in it,
  -- then it is a list of escaped terms separated by +. This returns the
  -- list if it qualifies. Empty terms are ignored.
  -- If query contains no "=", returns a list of its "+"-separated terms
  -- with %xx escapes decoded (empty terms are dropped). Returns nothing
  -- when the query contains "=".
  function query.isIndex(query)
    -- plain find: "=" is matched literally
    if strfind(query, "=", 1, true) then
      return
    end
    local terms = {n = 0}
    for term in gfind(query, "([^+]+)") do
      tinsert(terms, unpercent(term))
    end
    return terms
  end
  
  
  ---- PATH STUFF ----
  
  -- The trouble with iterating paths is that we really need to
  -- split the unescaped path when we actually find what we want.
  
  -- Iterator step for query.segments. prev is the index where the
  -- previous segment ended. Returns the end index of the next segment and
  -- the segment with %xx escapes decoded. Once no segment is left it
  -- returns (length + 1, "") a single time, and nothing after that.
  local function nextseg(path, prev)
    -- skip any run of slashes, then take everything up to the next one
    local _, last, piece = strfind(path, "^/*([^/]+)", prev + 1)
    if last then
      return last, unpercent(piece)
    end
    local len = strlen(path)
    if prev <= len then
      return len + 1, ""
    end
  end
  
  -- iterates over segments in a path, returns index, segment where
  -- index is the end of the segment. Use xstring.divide to break the
  -- path into two pieces if desired. 
  -- Generic-for entry point: yields (index, segment) for each segment of
  -- path by driving nextseg, starting before the first character.
  function query.segments(path)
    local start = 0
    return nextseg, path, start
  end

  -- this particular validation is more designed for files than
  -- anything else... basically, if it starts with a . or has a 
  -- / or nil in it, we toss it.
  -- Returns seg when it does not begin with "." and contains neither a
  -- "/" nor a zero byte; returns false otherwise.
  function query.fileSegmentOK(seg)
    if strsub(seg, 1, 1) == "." then
      return false
    end
    if strfind(seg, "[%z/]") then
      return false
    end
    return seg
  end
  
  return query
end

-- Produced by TNT, the Lua-linter. TNT/0.5 Copyright (C) 2004 Rici Lake