summaryrefslogtreecommitdiffstats
path: root/tcllib/examples/htmlparse/webviewer.tcl
blob: f7394f3e4acb4a5b098ace25fee53385d886c119 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env tclsh
## -*- tcl -*-
# webviewer.tcl - Copyright (C) 2004 Pat Thoyts <patthoyts@users.sourceforge.net>
#
# This is a sample application to demonstrate the use of the htmlparse package.
#
# Given the URL of a web page, this application will display just the text of
# the page - that is the contents of header, paragraph and pre tags.
#
# As an aside, this also illustrates the use of the autoproxy package to 
# cope with http proxy servers (if present) and handles HTTP redirections and
# so on.
#
# Usage: webviewer.tcl http://tip.tcl.tk/2
#
# $Id: webviewer.tcl,v 1.2 2009/01/30 04:18:14 andreas_kupries Exp $

package require htmlparse;              # tcllib
package require http;                   # tcl
package require autoproxy;              # tcllib
autoproxy::init

# -------------------------------------------------------------------------
# The driver.
# - Fetch the page
# - parse it to extract the text
# - sort out the html escaped chars
# - eliminate excessive newlines
#
proc webview {url} {
    set html [fetchurl $url]
    if {[string length $html] > 0} {
        variable parsed ""
        htmlparse::parse -cmd [list parser [namespace current]::parsed] $html
        set parsed [htmlparse::mapEscapes $parsed]
        set parsed [regsub -all -line "\n{2,}" $parsed "\n\n"]
        Display $parsed
    } else {
        Error "error: no data available from \"$url\""
    }
}

# -------------------------------------------------------------------------
# This implements our text extracting parser. This will pretty much turn 
# an HTML page into an outline-mode text file.
#
proc parser {outvar tag end attr text} {
    upvar \#0 $outvar out
    set tag [string tolower $tag]
    set end [string length $end]
    if {$end == 0} {
        if {[string equal "hmstart" $tag]} {
            set out ""
        } elseif {[regexp {h(\d+)} $tag -> level]} {
            append out "\n\n" [string repeat * $level] " " $text
        } elseif {[lsearch -exact {p pre td} $tag] != -1} {
            append out "\n" $text
        } elseif {[lsearch -exact {a span i b} $tag] != -1} { 
            append out $text
        }
    }
}

# -------------------------------------------------------------------------
# Fetch the target page and cope with HTTP problems. This
# deals with server errors and proxy authentication failure
# and handles HTTP redirection.
#
proc fetchurl {url} {
    set html ""
    set err ""
    set tok [http::geturl $url -timeout 30000]
    if {[string equal [http::status $tok] "ok"]} {
        if {[http::ncode $tok] >= 500} {
            set err "server error: [http::code $tok]"
        } elseif {[http::ncode $tok] >= 400} {
            set err "authentication error: [http::code $tok]"
        } elseif {[http::ncode $tok] >= 300} {
            upvar \#0 $tok state
            array set meta $state(meta)
            if {[info exists meta(Location)]} {
                return [fetchurl $meta(Location)]
            } else {
                set err [http::code $tok]
            }
        } else {
            set html [http::data $tok]
        }
    } else {
        set err [http::error $tok]
    }
    http::cleanup $tok

    if {[string length $err] > 0} {
        Error $err
    }
    return $html
}

# -------------------------------------------------------------------------
# Abstract out the display functions so we can run this using either wish or
# tclsh. This makes life easier on windows where the default is to use wish
# for tcl files.
#
proc Display {msg} {
    if {[string length [package provide Tk]] > 0} {
        toplevel .dlg -class Dialog
        wm title .dlg "webview output."
        text .dlg.txt -yscrollcommand {.dlg.sb set}
        scrollbar .dlg.sb -command {.dlg.txt yview}
        button .dlg.b -command {destroy .dlg} -text Exit -underline 1
        .dlg.txt insert 0.0 $msg
        bind .dlg <Control-F2> {console show}
        bind .dlg <Escape> {.dlg.b invoke}
        grid .dlg.txt .dlg.sb -sticky news
        grid .dlg.b  - -sticky e -pady {3 0} -ipadx 4
        grid rowconfigure .dlg 0 -weight 1
        grid columnconfigure .dlg 0 -weight 1
        tkwait window .dlg
    } else {
        puts $msg
    }
}

proc Error {msg} {
    if {[string length [package provide Tk]] > 0} {
        tk_messageBox -title "webviewer error" -icon error -message $msg
    } else {
        puts stderr $msg
    }
    exit 1
}

# -------------------------------------------------------------------------

if {!$tcl_interactive} {
    if {[string length [package provide Tk]] > 0} {
        wm withdraw .
        if {$argc < 1} {
            toplevel .dlg -class Dialog
            wm title .dlg "Enter URL"
            label .dlg.l -text "Enter a URL"
            entry .dlg.e -textvariable argv -width 40
            button .dlg.ok -text OK -default active -command {destroy .dlg}
            button .dlg.ca -text Cancel -command {set ::argv ""; destroy .dlg}
            bind .dlg <Return> {.dlg.ok invoke}
            bind .dlg <Escape> {.dlg.ca invoke}
            bind .dlg <Control-F2> {console show}
            grid .dlg.l - -sticky nws
            grid .dlg.e - -sticky news
            grid .dlg.ok .dlg.ca -sticky news
            tkwait window .dlg
            if {[llength $argv] < 1} {
                exit 1
            }
        }
    } else {
    
        if {$argc != 1} {
            Error "usage: webviewer URL"
        }

    }
    eval [linsert $argv 0 webview]
}