summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordkf <donal.k.fellows@manchester.ac.uk>2017-07-04 14:31:08 (GMT)
committerdkf <donal.k.fellows@manchester.ac.uk>2017-07-04 14:31:08 (GMT)
commit5459743af6d0f557809020375ee706351344c0d5 (patch)
tree049832b38f9369369fc43e1f728365c42395b186
parente8f49ee58eebcc3998603d1a6051d9cb5863bdfa (diff)
downloadtcl-5459743af6d0f557809020375ee706351344c0d5.zip
tcl-5459743af6d0f557809020375ee706351344c0d5.tar.gz
tcl-5459743af6d0f557809020375ee706351344c0d5.tar.bz2
Add in the URL parsing as a general service.
-rw-r--r--library/http3/http.tcl191
1 files changed, 184 insertions, 7 deletions
diff --git a/library/http3/http.tcl b/library/http3/http.tcl
index a5c93e2..bf97267 100644
--- a/library/http3/http.tcl
+++ b/library/http3/http.tcl
@@ -12,6 +12,7 @@ namespace eval ::http {
-proxyfilter callback
-proxyhost string
-proxyport integer
+ -strict boolean
-urlencoding encoding
-useragent string
}
@@ -32,7 +33,6 @@ namespace eval ::http {
-queryblocksize integer
-querychannel channel
-queryprogress callback
- -strict boolean
-timeout integer
-type {string {^[^\s/]+/+[^\s/+]$} "MIME type"}
-validate boolean
@@ -51,7 +51,7 @@ namespace eval ::http {
oo::class create Context {
variable config
- variable strict socketmap urltypes encodings charset keepalive
+ variable socketmap urltypes encodings charset keepalive
variable connectionclass counter
constructor {} {
@@ -71,6 +71,7 @@ namespace eval ::http {
# This follows the de-facto layout of user-agent strings in
# current browsers. Safe interpreters do not have
# ::tcl_platform(os) or ::tcl_platform(osVersion).
+
if {[interp issafe]} {
set platform "Windows; U; Windows NT 10.0"
} else {
@@ -95,7 +96,6 @@ namespace eval ::http {
set encodings [string tolower [encoding names]]
set charset "iso8859-1"
set keepalive 0
- set strict 1
set counter 0
}
@@ -207,15 +207,150 @@ namespace eval ::http {
return [list $proxyhost $proxyport]
}
}
+
+ method parseURL {url} {
+ # Validate URL, determine the server host and port, and check
+ # proxy case Recognize user:pass@host URLs also, although we do
+ # not do anything with that info yet.
+
+ # URLs have basically four parts.
+ #
+ # First, before the colon, is the protocol scheme (e.g. http).
+ #
+ # Second, for HTTP-like protocols, is the authority. The authority
+ # is preceded by // and lasts up to (but not including) the
+ # following / or ? and it identifies up to four parts, of which
+ # only one, the host, is required (if an authority is present at
+ # all). All other parts of the authority (user name, password,
+ # port number) are optional.
+ #
+ # Third is the resource name, which is split into two parts at a ?
+ # The first part (from the single "/" up to "?") is the path,
+ # and the second part (from that "?" up to "#") is the
+ # query. *HOWEVER*, we do not need to separate them; we send the
+ # whole lot to the server. Both, path and query are allowed to
+ # be missing, including their delimiting character.
+ #
+ # Fourth is the fragment identifier, which is everything after the
+ # firsts "#" in the URL. The fragment identifier MUST NOT be
+ # sent to the server and indeed, we don't bother to validate it
+ # (it could be an error to pass it in here, but it's cheap to
+ # strip).
+ #
+ # An example of a URL that has all the parts:
+ #
+ # http://joe:xyzzy@www.bogus.net:8000/foo/bar.tml?q=foo#changes
+ #
+ # The "http" is the protocol, the user is "joe", the password is
+ # "xyzzy", the host is "www.bogus.net", the port is "8000", the
+ # path is "/foo/bar.tml", the query is "q=foo", and the fragment
+ # is "changes".
+ #
+ # Note that the RE actually combines the user and password parts,
+ # as recommended in RFC 3986. Indeed, that RFC states that putting
+ # passwords in URLs is a Really Bad Idea, something with which I
+ # would agree utterly.
+ #
+ # From a validation perspective, we need to ensure that the parts
+ # of the URL that are going to the server are correctly encoded.
+ # This is only done if $config(-strict) is true.
+
+ set URLmatcher {(?x) # this is _expanded_ syntax
+ ^
+ (?: (\w+) : ) ? # <protocol scheme>
+ (?: //
+ (?:
+ (
+ [^@/\#?]+ # <userinfo part of authority>
+ ) @
+ )?
+ ( # <host part of authority>
+ [^/:\#?]+ | # host name or IPv4 address
+ \[ [^/\#?]+ \] # IPv6 address in square brackets
+ )
+ (?: : (\d+) )? # <port part of authority>
+ )?
+ ( [/\?] [^\#]*)? # <path> (including query)
+ (?: \# (.*) )? # <fragment>
+ $
+ }
+
+ # Phase one: parse
+ if {![regexp -- $URLmatcher $url -> \
+ proto user host port srvurl fragment]} {
+ return -code error "unsupported URL: $url"
+ }
+ # Phase two: validate
+ set host [string trim $host {[]}]; # strip square brackets from IPv6 address
+ if {$host eq ""} {
+ # Caller has to provide a host name; we do not have a "default
+ # host" that would enable us to handle relative URLs.
+ return -code error "Missing host part: $url"
+ # Note that we don't check the hostname for validity here; if
+ # it's invalid, we'll simply fail to resolve it later on.
+ }
+ if {$port ne "" && $port > 65535} {
+ return -code error "invalid port number: $port"
+ }
+ # The user identification and resource identification parts of the
+ # URL can have encoded characters in them; take care!
+ if {$user ne ""} {
+ # Check for validity according to RFC 3986, Appendix A
+ set validityRE {(?xi)
+ ^
+ (?: [-\w.~!$&'()*+,;=:] | %[0-9a-f][0-9a-f] )+
+ $
+ }
+ if {$config(-strict) && ![regexp -- $validityRE $user]} {
+ # Provide a better error message in this error case
+ if {[regexp {(?i)%(?![0-9a-f][0-9a-f]).?.?} $user bad]} {
+ return -code error \
+ "illegal encoding character usage \"$bad\" in URL user"
+ }
+ return -code error "illegal characters in URL user"
+ }
+ }
+ if {$srvurl ne ""} {
+ # RFC 3986 allows empty paths (not even a /), but servers
+ # return 400 if the path in the HTTP request doesn't start
+ # with / , so add it here if needed.
+ if {[string index $srvurl 0] ne "/"} {
+ set srvurl /$srvurl
+ }
+ # Check for validity according to RFC 3986, Appendix A
+ set validityRE {(?xi)
+ ^
+ # Path part (already must start with / character)
+ (?: [-\w.~!$&'()*+,;=:@/] | %[0-9a-f][0-9a-f] )*
+ # Query part (optional, permits ? characters)
+ (?: \? (?: [-\w.~!$&'()*+,;=:@/?] | %[0-9a-f][0-9a-f] )* )?
+ $
+ }
+ if {$config(-strict) && ![regexp -- $validityRE $srvurl]} {
+ # Provide a better error message in this error case
+ if {[regexp {(?i)%(?![0-9a-f][0-9a-f])..} $srvurl bad]} {
+ return -code error \
+ "illegal encoding character usage \"$bad\" in URL path"
+ }
+ return -code error "illegal characters in URL path"
+ }
+ }
+
+ return [list $proto $user $host $port $srvurl \
+ [string trimleft $fragment "#"]]
+ }
}
oo::class create Connection {
- variable cfg http
+ variable cfg urlTypes http
variable binary state meta coding currentsize totalsize querylength
variable queryoffset type body status httpline connection charset
+ variable theURL
+
constructor {context url defaults options} {
interp alias {} [namespace current]::Context {} $context
- my eval upvar 0 [info object namespace $context]::config http
+ set ns [info object namespace $context]
+ my eval upvar 0 ${ns}::config http ${ns}::urlTypes urlTypes
foreach {opt value} $defaults {
set cfg($opt) $value
}
@@ -240,6 +375,48 @@ namespace eval ::http {
set httpline ""
set connection close
set charset $http(-charset)
+
+ if {[info exists cfg(-querychannel)]&&[info exists cfg(-query)]} {
+ return -code error \
+ "can't use -query and -querychannel options together"
+ }
+
+ lassign [Context parseURL $url] proto user host port srvurl
+ if {$srvurl eq ""} {
+ set srvurl "/"
+ }
+ if {$proto eq ""} {
+ set proto "http"
+ }
+ set lower [string tolower $proto]
+ if {![info exists urlTypes($lower)]} {
+ return -code error "unsupported URL type \"$proto\""
+ }
+ lassign $urlTypes($lower) defport defcmd
+ if {$port eq ""} {
+ set port $defport
+ }
+
+ # Check for the proxy's opinion
+ catch {
+ if {[llength $http(-proxyfilter)]} {
+ lassign [{*}$http(-proxyfilter) $host] phost pport
+ }
+ }
+
+ # OK, now reassemble into a full URL
+ set url ${proto}://
+ if {$user ne ""} {
+ append url $user
+ append url @
+ }
+ append url $host
+ if {$port != $defport} {
+ append url : $port
+ }
+ append url $srvurl
+ # Don't append the fragment!
+ set theURL $url
}
destructor {
@@ -361,9 +538,9 @@ namespace eval ::http {
}
proc Validate(callback) {option value} {
- if {![string is list $value] || [llength $value] == 0} {
+ if {![string is list $value]} {
return -code error \
- "bad value for $option ($value), must be non-empty callback"
+ "bad value for $option ($value), must be command prefix"
}
}