diff options
author | dkf <donal.k.fellows@manchester.ac.uk> | 2017-07-04 14:31:08 (GMT) |
---|---|---|
committer | dkf <donal.k.fellows@manchester.ac.uk> | 2017-07-04 14:31:08 (GMT) |
commit | 5459743af6d0f557809020375ee706351344c0d5 (patch) | |
tree | 049832b38f9369369fc43e1f728365c42395b186 | |
parent | e8f49ee58eebcc3998603d1a6051d9cb5863bdfa (diff) | |
download | tcl-5459743af6d0f557809020375ee706351344c0d5.zip tcl-5459743af6d0f557809020375ee706351344c0d5.tar.gz tcl-5459743af6d0f557809020375ee706351344c0d5.tar.bz2 |
Add in the URL parsing as a general service.
-rw-r--r-- | library/http3/http.tcl | 191 |
1 files changed, 184 insertions, 7 deletions
diff --git a/library/http3/http.tcl b/library/http3/http.tcl index a5c93e2..bf97267 100644 --- a/library/http3/http.tcl +++ b/library/http3/http.tcl @@ -12,6 +12,7 @@ namespace eval ::http { -proxyfilter callback -proxyhost string -proxyport integer + -strict boolean -urlencoding encoding -useragent string } @@ -32,7 +33,6 @@ namespace eval ::http { -queryblocksize integer -querychannel channel -queryprogress callback - -strict boolean -timeout integer -type {string {^[^\s/]+/+[^\s/+]$} "MIME type"} -validate boolean @@ -51,7 +51,7 @@ namespace eval ::http { oo::class create Context { variable config - variable strict socketmap urltypes encodings charset keepalive + variable socketmap urltypes encodings charset keepalive variable connectionclass counter constructor {} { @@ -71,6 +71,7 @@ namespace eval ::http { # This follows the de-facto layout of user-agent strings in # current browsers. Safe interpreters do not have # ::tcl_platform(os) or ::tcl_platform(osVersion). + if {[interp issafe]} { set platform "Windows; U; Windows NT 10.0" } else { @@ -95,7 +96,6 @@ namespace eval ::http { set encodings [string tolower [encoding names]] set charset "iso8859-1" set keepalive 0 - set strict 1 set counter 0 } @@ -207,15 +207,150 @@ namespace eval ::http { return [list $proxyhost $proxyport] } } + + method parseURL {url} { + # Validate URL, determine the server host and port, and check + # proxy case Recognize user:pass@host URLs also, although we do + # not do anything with that info yet. + + # URLs have basically four parts. + # + # First, before the colon, is the protocol scheme (e.g. http). + # + # Second, for HTTP-like protocols, is the authority. The authority + # is preceded by // and lasts up to (but not including) the + # following / or ? and it identifies up to four parts, of which + # only one, the host, is required (if an authority is present at + # all). All other parts of the authority (user name, password, + # port number) are optional. 
+ # + # Third is the resource name, which is split into two parts at a ? + # The first part (from the single "/" up to "?") is the path, + # and the second part (from that "?" up to "#") is the + # query. *HOWEVER*, we do not need to separate them; we send the + # whole lot to the server. Both path and query are allowed to + # be missing, including their delimiting character. + # + # Fourth is the fragment identifier, which is everything after the + # first "#" in the URL. The fragment identifier MUST NOT be + # sent to the server and indeed, we don't bother to validate it + # (it could be an error to pass it in here, but it's cheap to + # strip). + # + # An example of a URL that has all the parts: + # + # http://joe:xyzzy@www.bogus.net:8000/foo/bar.tml?q=foo#changes + # + # The "http" is the protocol, the user is "joe", the password is + # "xyzzy", the host is "www.bogus.net", the port is "8000", the + # path is "/foo/bar.tml", the query is "q=foo", and the fragment + # is "changes". + # + # Note that the RE actually combines the user and password parts, + # as recommended in RFC 3986. Indeed, that RFC states that putting + # passwords in URLs is a Really Bad Idea, something with which I + # would agree utterly. + # + # From a validation perspective, we need to ensure that the parts + # of the URL that are going to the server are correctly encoded. + # This is only done if $config(-strict) is true. + + set URLmatcher {(?x) # this is _expanded_ syntax + ^ + (?: (\w+) : ) ? # <protocol scheme> + (?: // + (?: + ( + [^@/\#?]+ # <userinfo part of authority> + ) @ + )? + ( # <host part of authority> + [^/:\#?]+ | # host name or IPv4 address + \[ [^/\#?]+ \] # IPv6 address in square brackets + ) + (?: : (\d+) )? # <port part of authority> + )? + ( [/\?] [^\#]*)? # <path> (including query) + (?: \# (.*) )? 
# <fragment> + $ + } + + # Phase one: parse + if {![regexp -- $URLmatcher $url -> \ + proto user host port srvurl fragment]} { + return -code error "unsupported URL: $url" + } + # Phase two: validate + set host [string trim $host {[]}]; # strip square brackets from IPv6 address + if {$host eq ""} { + # Caller has to provide a host name; we do not have a "default + # host" that would enable us to handle relative URLs. + return -code error "Missing host part: $url" + # Note that we don't check the hostname for validity here; if + # it's invalid, we'll simply fail to resolve it later on. + } + if {$port ne "" && $port > 65535} { + return -code error "invalid port number: $port" + } + # The user identification and resource identification parts of the + # URL can have encoded characters in them; take care! + if {$user ne ""} { + # Check for validity according to RFC 3986, Appendix A + set validityRE {(?xi) + ^ + (?: [-\w.~!$&'()*+,;=:] | %[0-9a-f][0-9a-f] )+ + $ + } + if {$config(-strict) && ![regexp -- $validityRE $user]} { + # Provide a better error message in this error case + if {[regexp {(?i)%(?![0-9a-f][0-9a-f]).?.?} $user bad]} { + return -code error \ + "illegal encoding character usage \"$bad\" in URL user" + } + return -code error "illegal characters in URL user" + } + } + if {$srvurl ne ""} { + # RFC 3986 allows empty paths (not even a /), but servers + # return 400 if the path in the HTTP request doesn't start + # with / , so add it here if needed. + if {[string index $srvurl 0] ne "/"} { + set srvurl /$srvurl + } + # Check for validity according to RFC 3986, Appendix A + set validityRE {(?xi) + ^ + # Path part (already must start with / character) + (?: [-\w.~!$&'()*+,;=:@/] | %[0-9a-f][0-9a-f] )* + # Query part (optional, permits ? characters) + (?: \? (?: [-\w.~!$&'()*+,;=:@/?] | %[0-9a-f][0-9a-f] )* )? 
+ $ + } + if {$config(-strict) && ![regexp -- $validityRE $srvurl]} { + # Provide a better error message in this error case + if {[regexp {(?i)%(?![0-9a-f][0-9a-f])..} $srvurl bad]} { + return -code error \ + "illegal encoding character usage \"$bad\" in URL path" + } + return -code error "illegal characters in URL path" + } + } + + return [list $proto $user $host $port $srvurl \ + [string trimleft $fragment "#"]] + } } oo::class create Connection { - variable cfg http + variable cfg urlTypes http variable binary state meta coding currentsize totalsize querylength variable queryoffset type body status httpline connection charset + variable theURL + constructor {context url defaults options} { interp alias {} [namespace current]::Context {} $context - my eval upvar 0 [info object namespace $context]::config http + set ns [info object namespace $context] + my eval upvar 0 ${ns}::config http ${ns}::urlTypes urlTypes foreach {opt value} $defaults { set cfg($opt) $value } @@ -240,6 +375,48 @@ namespace eval ::http { set httpline "" set connection close set charset $http(-charset) + + if {[info exists cfg(-querychannel)]&&[info exists cfg(-query)]} { + return -code error \ + "can't use -query and -querychannel options together" + } + + lassign [Context parseURL $url] proto user host port srvurl + if {$srvurl eq ""} { + set srvurl "/" + } + if {$proto eq ""} { + set proto "http" + } + set lower [string tolower $proto] + if {![info exists urlTypes($lower)]} { + return -code error "unsupported URL type \"$proto\"" + } + lassign $urlTypes($lower) defport defcmd + if {$port eq ""} { + set port $defport + } + + # Check for the proxy's opinion + catch { + if {[llength $http(-proxyfilter)]} { + lassign [{*}$http(-proxyfilter) $host] phost pport + } + } + + # OK, now reassemble into a full URL + set url ${proto}:// + if {$user ne ""} { + append url $user + append url @ + } + append url $host + if {$port != $defport} { + append url : $port + } + append url $srvurl + # Don't append 
the fragment! + set theURL $url } destructor { @@ -361,9 +538,9 @@ namespace eval ::http { } proc Validate(callback) {option value} { - if {![string is list $value] || [llength $value] == 0} { + if {![string is list $value]} { return -code error \ - "bad value for $option ($value), must be non-empty callback" + "bad value for $option ($value), must be command prefix" } } |