-- Miscellaneous HTML/HTTP Utilities
-- copyright 2004 by Rici Lake. Permission is granted to use this code under
-- the same terms and conditions as found in the Lua copyright notice at
-- http://www.lua.org/license.html.
--
-- I'd appreciate a credit if you use this code, but I don't insist.
--
-- This is prerelease software, no interfaces are guaranteed.
-- Use at your own risk, etc.

--TODO
-- This really needs to get split into several libraries.
-- I'm just not sure what should go where.
--
-- There should be a better way to handle cgiGetQuery which is compatible
-- with the getQuery you would use if it were not a CGI environment.
--
-- Write test units for the unicode stuff and see if it actually works

local string = import "string"
local strfind, gfind, gsub, strchar, strbyte, strsub, strlen =
  string.find, string.gfind, string.gsub, string.char,
  string.byte, string.sub, string.len
local strlower = string.lower

local table = import "table"
local tinsert = table.insert

-- there really ought to be a wrapper for modf as well as fmod
local math = import "math"
local mod = math.mod

local util = import "libutil"
local tableFrom = util.tableFrom

-- Characters that query.entify escapes, mapped to their entity references.
-- NOTE(review): assumes tableFrom accepts one "key => value" pair per line;
-- confirm against libutil.
local entity = tableFrom [[
  & => &amp;
  < => &lt;
  > => &gt;]]

-- Named entities recognised by query.unentify, mapped to their replacement
-- text. query.addEntity adds further names to this table.
local unentity = tableFrom [[
  amp => &
  lt => <
  gt => >
  quot => "]]

-- Parse a hexadecimal literal, so the constants below can be written in hex.
local function X(str) return tonumber(str, 16) end

local lastUtf8 = X"10FFFF"
local firstSurrogate = X"D800"
local lastSurrogate = X"DFFF"
local elevenBits = X"7FF"
local sixteenBits = X"FFFF"

-- Combined lead-byte/continuation-byte marker bits for 2, 3 and 4 byte
-- sequences; fromUtf8 subtracts them after weighting the raw bytes.
local offset2 = X"C0" * 64 + X"80"
local offset3 = X"E0" * 4096 + X"80" * (64 + 1)
local offset4 = X"F0" * 262144 + X"80" * (4096 + 64 + 1)

-- The module value is a function: it installs every utility below as a
-- field of the table `query` and returns that same table.
return function(query)

------ HTML headers and so on ---

  -- Returns an XML declaration for encoding `enc` (default "ISO-8859-1").
  function query.xmldecl(enc)
    return [[<?xml version="1.0" encoding="]]
           .. (enc or "ISO-8859-1")
           .. [["?>]]
  end

  -- NOTE(review): the whitespace inside the DOCTYPE literals below is kept
  -- exactly as found; it may originally have been line breaks -- confirm.

  --[[ XHTML 1.1 documents should have an XML declaration.
       The default is UTF-8 or UTF-16.
       They also must "designate the XHTML namespace":
  --]]
  query.xhtmlnamespace = [[xmlns="http://www.w3.org/1999/xhtml"]]

  --[[ Undoubtedly someone understands this better than I do. I have
       been too intimidated to actually use XHTML 1.1 up to now. But
       allegedly this will work
  --]]
  query.xhtml11 = [[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> ]]

  --[[ XHTML 1.0 DTDs are the same as HTML 4.01 DTDs "except for changes
       due to the differences between XML and SGML". But the identifiers
       seem to be more consistent, too. In theory, these also ought to
       have a xmlns declaration in the <html> element.
  --]]
  query.xhtml10strict = [[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> ]]

  query.xhtml10transitional = [[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> ]]

  query.xhtml10frameset = [[<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"> ]]

  --[[ "This is HTML 4.01 Strict DTD, which excludes the presentation
       attributes and elements that W3C expects to phase out as
       support for style sheets matures. Authors should use the Strict
       DTD when possible, but may use the Transitional DTD when support
       for presentation attribute and elements is required."
  --]]
  query.html401strict = [[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> ]]

  --[[ "This is the HTML 4.01 Transitional DTD, which includes
       presentation attributes and elements that W3C expects to phase out
       as support for style sheets matures. Authors should use the Strict
       DTD when possible, but may use the Transitional DTD when support
       for presentation attribute and elements is required."
  --]]
  query.html401transitional = [[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> ]]

  --[[ "This is the HTML 4.01 Frameset DTD, which should be
       used for documents with frames. This DTD is identical
       to the HTML 4.01 Transitional DTD except for the
       content model of the "HTML" element: in frameset
       documents, the "FRAMESET" element replaces the "BODY"
       element."
  --]]
  query.html401frameset = [[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd"> ]]

------ QUOTERS AND DEQUOTERS -----
  -- Why are there so many quoting formats, I want to know????
  -- TODO: rewrite all of these in C.

  -- Change pluses to spaces (queries)
  -- I wanted to call this "nonplus" (The query was nonplussed) but
  -- "un" is more consistent.
  local function unplus(s)
    -- "+" is a pattern quantifier, so it is escaped to match literally.
    -- The outer parentheses drop gsub's second result (the match count).
    return (gsub(s, "%+", " "))
  end
  query.unplus = unplus

  -- Change %xx to octet (urls)
  -- Raises an error if a "%" is not followed by exactly two hex digits.
  local function unpercent(s)
    return (gsub(s, "%%(.?.?)", function(x)
      assert(strfind(x, "^%x%x$"),
             "400 Bad Request, invalid %-escape in URL")
      return strchar(tonumber(x, 16))
    end))
  end
  query.unpercent = unpercent

  -- Both of the above (queries)
  -- Note: You cannot use unescape if the query has no = in it.
  local function unescape(s)
    return unpercent(unplus(s))
  end
  query.unescape = unescape

------ UNICODE STUFF -----

  -- Convert an integer to a UTF-8 sequence, without checking for
  -- invalid codes.
  -- This originally had calls to floor scattered about but it is
  -- not necessary: string.char does a "C" conversion from float to int,
  -- which is a truncate towards zero operation; i must be non-negative,
  -- so that is the same as floor.
  local function toUtf8(i)
    if i <= 127 then
      return strchar(i)
    elseif i <= elevenBits then
      return strchar(i / 64 + 192,
                     mod(i, 64) + 128)
    elseif i <= sixteenBits then
      return strchar(i / 4096 + 224,
                     mod(i / 64, 64) + 128,
                     mod(i, 64) + 128)
    else
      return strchar(i / 262144 + 240,
                     mod(i / 4096, 64) + 128,
                     mod(i / 64, 64) + 128,
                     mod(i, 64) + 128)
    end
  end
  query.toUtf8 = toUtf8

  -- Converts a UTF-8 sequence to an integer.
  -- `str` must be exactly one sequence. The patterns reject overlong
  -- forms, encoded surrogates and values above U+10FFFF; for anything
  -- that does not match, nothing (nil) is returned.
  -- TODO: Depending on execution order, the last case could require
  -- 26 bits of precision, which is too much for single-precision
  -- floats. It should work, though, because offset 4 has only 19
  -- bits of precision, and the final result has 21.
  -- Need to check it, though.
  local function fromUtf8(str)
    if strfind(str, "^[\1-\127%z]$") then
      return strbyte(str)
    elseif strfind(str, "^[\194-\223][\128-\191]$") then
      return strbyte(str, 1) * 64 + strbyte(str, 2) - offset2
    elseif strfind(str, "^[\225-\236\238\239][\128-\191][\128-\191]$")
        or strfind(str, "^\224[\160-\191][\128-\191]$")
        or strfind(str, "^\237[\128-\159][\128-\191]$") then
      return strbyte(str, 1) * 4096 + strbyte(str, 2) * 64
             + strbyte(str, 3) - offset3
    elseif strfind(str, "^\240[\144-\191][\128-\191][\128-\191]$")
        or strfind(str, "^[\241\242\243][\128-\191][\128-\191][\128-\191]$")
        or strfind(str, "^\244[\128-\143][\128-\191][\128-\191]$") then
      return (strbyte(str, 1) * 262144 - offset4)
             + strbyte(str, 2) * 4096 + strbyte(str, 3) * 64
             + strbyte(str, 4)
    end
  end
  query.fromUtf8 = fromUtf8

  -- Returns its argument if it is a valid Unicode code point
  -- (0 .. 0x10FFFF, excluding the surrogate range), otherwise false.
  local function isUnicode(i)
    return i >= 0 and i <= lastUtf8
           and not (i >= firstSurrogate and i <= lastSurrogate)
           and i
  end
  query.isUnicode = isUnicode

  -- Replacement callback for one "&...;" reference.
  --   hash: "#" for a numeric reference, "" for a named one
  --   ent:  the text between "&" (or "&#") and ";"
  -- Returns the replacement text. A reference that is unknown, malformed,
  -- or numerically not a valid code point is returned unchanged, so bad
  -- input can neither raise inside string.char nor produce invalid UTF-8.
  local function unentify1(hash, ent)
    local rv
    if hash == "" then
      rv = unentity[ent]
    else
      local code
      if strfind(ent, "^[%d]+$") then
        code = tonumber(ent)
      elseif strfind(ent, "^[xX][%x]+$") then
        code = tonumber(strsub(ent, 2), 16)
      end
      if code and isUnicode(code) then
        rv = toUtf8(code)
      end
    end
    return rv or ("&" .. hash .. ent .. ";")
  end

  -- Partial implementation of making stuff safe for HTML. This assumes
  -- that no character set conversion needs to be done.
  function query.entify(str)
    return (gsub(str, "[&<>]", function(w) return entity[w] end))
  end

  -- If entity declarations are available, this can be used to construct an
  -- unentity table. The standard entity tables are at
  --   http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
  --   http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
  --   http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
  -- Don't use the ones from the html4 DTDs, the regex below won't work.
  function query.addEntity(definition)
    for name, val in gfind(definition,
                           '\n<!ENTITY%s+(%w+)%s+"&#([xX]?%x+);"') do
      -- The captured value is the body of a numeric reference, so it is
      -- resolved as one: "#" goes in the hash slot and the digits in ent.
      unentity[name] = unentify1("#", val)
    end
  end

  -- This unentifies to UTF-8. If you needed ISO-8859-1, you are out
  -- of luck. Note: we only recognize 7-bit alphanumeric entity names,
  -- which as far as I know covers all the W3C-defined ones.
  function query.unentify(str)
    return (gsub(str, "&(#?)([%w]+);", unentify1))
  end

---- QUERY HANDLING ----

  -- Try to do the right thing in a CGI environment. The captive web
  -- server model is somewhat different.

  -- Query readers always return something, although it might be ""
  local function queryInEnvironment()
    return os.getenv "QUERY_STRING" or ""
  end

  -- Reads a form-encoded request body of CONTENT_LENGTH bytes from stdin.
  -- Returns "" when there is no length, a zero length, another content
  -- type, or nothing to read.
  -- ignore chunked transfer encoding, hopefully no-one actually uses it.
  local function queryInBody()
    local n = tonumber(os.getenv "CONTENT_LENGTH")
    if not n or n == 0 then return "" end
    -- Compare only the media type: CONTENT_TYPE may carry parameters
    -- (e.g. "; charset=UTF-8") and media types are case-insensitive.
    local ctype = os.getenv "CONTENT_TYPE" or ""
    local _, _, mediatype = strfind(ctype, "^%s*([^;%s]*)")
    if strlower(mediatype) ~= "application/x-www-form-urlencoded" then
      return ""
    end
    return io.stdin:read(n) or ""
  end

  -- Maps a request method to the function that fetches its query string.
  local queryReaders = {
    HEAD = queryInEnvironment,
    GET = queryInEnvironment,
    POST = queryInBody
  }

  -- This will return nil if the request wasn't a recognised method.
  -- Perhaps this isn't flexible enough, but what the heck.
  function query.cgiGetQuery()
    local reader = queryReaders[os.getenv "REQUEST_METHOD" or ""]
    return reader and reader()
  end

  -- Returns an iterator over the "&"-separated pairs of a query string,
  -- yielding the unescaped key and value of each pair; a pair without "="
  -- has the value "". Raises an error for a pair that has no key.
  local function qpairs(query)
    local fn, state, pair = gfind(query, "[^&]+")
    return function()
      pair = fn(state, pair)
      if pair then
        -- Anchored at the start of the pair, so that "=foo" is rejected
        -- below rather than being read as the key "foo".
        local _, _, key, val = strfind(pair, "^([^=]+)=?(.*)")
        -- the only thing that will trigger this error is "[&?]=foo"
        assert(key, "403 Forbidden, missing key in URL")
        return unescape(key), unescape(val)
      end
    end
  end
  query.qpairs = qpairs

  -- if you know all your keys are scalar, you could use this
  -- (when a key occurs more than once, the last value wins)
  function query.asTable(query)
    local rv = {}
    for k, v in qpairs(query) do rv[k] = v end
    return rv
  end

  -- The original CGI spec says that if a query string has no = in it,
  -- then it is a list of escaped terms separated by +. This returns the
  -- list if it qualifies. Empty terms are ignored.
  -- Returns nothing (nil) when the query contains an "=".
  function query.isIndex(query)
    if not strfind(query, "=", 1, true) then
      local rv = {n = 0}
      for word in gfind(query, "([^+]+)") do
        tinsert(rv, unpercent(word))
      end
      return rv
    end
  end

---- PATH STUFF ----

  -- The trouble with iterating paths is that we really need to
  -- split the unescaped path when we actually find what we want.

  -- Iterator step for query.segments: `prev` is the index at which the
  -- previous segment ended (0 to start). Returns the end index of the
  -- next segment and the unescaped segment.
  local function nextseg(path, prev)
    local s, e, seg = strfind(path, "^/*([^/]+)", prev + 1)
    if e then return e, unpercent(seg) end
    -- this will fail when we're past the last segment, whether or not
    -- there was a trailing / (either no slash or no letters). The first
    -- time we return "", and the second time we return nil.
    local n = strlen(path)
    if prev <= n then return n + 1, "" end
  end

  -- iterates over segments in a path, returns index, segment where
  -- index is the end of the segment. Use xstring.divide to break the
  -- path into two pieces if desired.
  function query.segments(path)
    return nextseg, path, 0
  end

  -- this particular validation is more designed for files than
  -- anything else... basically, if it starts with a . or has a
  -- / or nil in it, we toss it.
  -- Returns the segment itself when it is acceptable, false otherwise.
  function query.fileSegmentOK(seg)
    return strsub(seg, 1, 1) ~= "."
           and not strfind(seg, "[%z/]")
           and seg
  end

  return query
end
-- Produced by TNT, the Lua-linter. TNT/0.5 Copyright (C) 2004 Rici Lake