diff options
author | Fred Drake <fdrake@acm.org> | 1999-01-08 15:25:29 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 1999-01-08 15:25:29 (GMT) |
commit | 3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8 (patch) | |
tree | 0074eae07130a12e34af18ec6c354367f653c19e /Doc/tools/html2texi.pl | |
parent | 41498435ba1f2901e2e3498d7f7f5c773a602568 (diff) | |
download | cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.zip cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.tar.gz cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.tar.bz2 |
New conversion tools for HTML->info from Michael Ernst
<mernst@cs.washington.edu>.
Thanks!
Diffstat (limited to 'Doc/tools/html2texi.pl')
-rwxr-xr-x | Doc/tools/html2texi.pl | 1666 |
1 files changed, 1666 insertions, 0 deletions
diff --git a/Doc/tools/html2texi.pl b/Doc/tools/html2texi.pl new file mode 100755 index 0000000..cf3595f --- /dev/null +++ b/Doc/tools/html2texi.pl @@ -0,0 +1,1666 @@ +#! /usr/bin/env perl -w +# html2texi.pl -- Convert HTML documentation to Texinfo format +# Michael Ernst <mernst@cs.washington.edu> +# Time-stamp: <1998-09-10 12:52:38 mernst> + +# This program converts HTML documentation trees into Texinfo format. +# Given the name of a main (or contents) HTML file, it processes that file, +# and other files (transitively) referenced by it, into a Texinfo file +# (whose name is chosen from the file or directory name of the argument). +# For instance: +# html2texi.pl api/index.pl +# produces file "api.texi". + +# Texinfo format can be easily converted to Info format (for browsing in +# Emacs or the standalone Info browser), to a printed manual, or to HTML. +# Thus, html2texi.pl permits conversion of HTML files to Info format, and +# secondarily enables producing printed versions of Web page hierarchies. + +# Unlike HTML, Info format is searchable. Since Info is integrated into +# Emacs, one can read documentation without starting a separate Web +# browser. Additionally, Info browsers (including Emacs) contain +# convenient features missing from Web browsers, such as easy index lookup +# and mouse-free browsing. + +# Limitations: +# html2texi.pl is currently tuned to latex2html output, but should be +# extensible to arbitrary HTML documents. It will be most useful for HTML +# with a hierarchical structure and an index. The HTML tree to be +# traversed must be on local disk, rather than being accessed via HTTP. +# This script requires the use of "checkargs.pm". To eliminate that +# dependence, replace calls to check_args* by @_ (which is always the last +# argument to those functions). +# Also see the "to do" section, below. +# Comments, suggestions, bug fixes, and enhancements are welcome. + +### +### Typical usage for the Python documentation: +### + +# (Actually, most of this is in a Makefile instead.) +# The resulting Info format Python documentation is currently available at +# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz + +# Fix up HTML problems, eg <DL COMPACT><DD> + +# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html +# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html +# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html +# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html +# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html +# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html + +# Edit the generated .texi files: +# * change @setfilename to prefix "python-" +# * fix up any sectioning, such as for Abstract +# * make Texinfo menus +# * perhaps remove the @detailmenu ... @end detailmenu +# In Emacs: +# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer)) + +# makeinfo api.texi +# makeinfo ext.texi +# makeinfo lib.texi +# makeinfo mac.texi +# makeinfo ref.texi +# makeinfo tut.texi + + +### +### Structure of the code +### + +# To be written... + + +### +### Design decisions +### + +# Source and destination languages +# -------------------------------- +# +# The goal is Info files; I create Texinfo, so I don't have to worry about +# the finer details of Info file creation. (I'm not even sure of its exact +# format.) +# +# Why not start from LaTeX rather than HTML? +# I could hack latex2html itself to produce Texinfo instead, or fix up +# partparse.py (which already translates LaTeX to Teinfo). +# Pros: +# * has high-level information such as index entries, original formatting +# Cons: +# * those programs are complicated to read and understand +# * those programs try to handle arbitrary LaTeX input, track catcodes, +# and more: I don't want to go to that effort. HTML isn't as powerful +# as LaTeX, so there are fewer subtleties. +# * the result wouldn't work for arbitrary HTML documents; it would be +# nice to eventually extend this program to HTML produced from Docbook, +# Frame, and more. + +# Parsing +# ------- +# +# I don't want to view the text as a linear stream; I'd rather parse the +# whole thing and then do pattern matching over the parsed representation (to +# find idioms such as indices, lists of child nodes, etc.). +# * Perl provides HTML::TreeBuilder, which does just what I want. +# * libwww-perl: http://www.linpro.no/lwp/ +# * TreeBuilder: HTML-Tree-0.51.tar.gz +# * Python Parsers, Formatters, and Writers don't really provide the right +# interface (and the version in Grail doesn't correspond to another +# distributed version, so I'm confused about which to be using). I could +# write something in Python that creates a parse tree, but why bother? + +# Other implementation language issues: +# * Python lacks variable declarations, reasonable scoping, and static +# checking tools. I've written some of the latter for myself that make +# my Perl programming a lot safer than my Python programming will be until +# I have a similar suite for that language. + + +########################################################################### +### To do +### + +# Section names: +# Fix the problem with multiple sections in a single file (eg, Abstract in +# Front Matter section). +# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310 +# Index: +# Perhaps double-check that every tag mentioned in the index is found +# in the text. +# Python: email to python-docs@python.org, to get their feedback. +# Compare to existing lib/ Info manual +# Write the hooks into info-look; replace pyliblookup1-1.tar.gz. +# Postpass to remove extra quotation marks around typography already in +# a different font (to avoid double delimiters as in "`code'"); or +# perhaps consider using only font-based markup so that we don't get +# the extra *bold* and `code' markup in Info. + +## Perhaps don't rely on automatic means for adding up, next, prev; I have +## all that info available to me already, so it's not so much trouble to +## add it. (Right?) But it is *so* easy to use Emacs instead... + + +########################################################################### +### Strictures +### + +# man HTML::TreeBuilder +# man HTML::Parser +# man HTML::Element + +# require HTML::ParserWComment; +require HTML::Parser; +require HTML::TreeBuilder; +require HTML::Element; + +use File::Basename; +use Cwd; + +use strict; +# use Carp; + + +use checkargs; + + +########################################################################### +### Variables +### + +my @section_stack = (); # elements are chapter/section/subsec nodetitles (I think) +my $current_ref_tdf; # for the file currently being processed; + # used in error messages +my $html_directory; +my %footnotes; + +# First element should not be used. +my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection"); + +my %inline_markup = ("b" => "strong", + "code" => "code", + "i" => "emph", + "kbd" => "kbd", + "samp" => "samp", + "strong" => "strong", + "tt" => "code", + "var" => "var"); + +my @deferred_index_entries = (); + +my @index_titles = (); # list of (filename, type) lists +my %index_info = ("Index" => ["\@blindex", "bl"], + "Concept Index" => ["\@cindex", "cp"], + "Module Index" => ["\@mdindex", "md"]); + + +########################################################################### +### Main/contents page +### + +# Process first-level page on its own, or just a contents page? Well, I do +# want the title, author, etc., and the front matter... For now, just add +# that by hand at the end. + + +# data structure possibilities: +# * tree-like (need some kind of stack when processing (or parent pointers)) +# * list of name and depth; remember old and new depths. + +# Each element is a reference to a list of (nodetitle, depth, filename). +my @contents_list = (); + +# The problem with doing fixups on the fly is that some sections may have +# already been processed (and no longer available) by the time we notice +# others with the same name. It's probably better to fully construct the +# contents list (reading in all files of interest) upfront; that will also +# let me do a better job with cross-references, because again, all files +# will already be read in. +my %contents_hash = (); +my %contents_fixups = (); + +my @current_contents_list = (); + +# Merge @current_contents_list into @contents_list, +# and set @current_contents_list to be empty. +sub merge_contents_lists ( ) +{ check_args(0, @_); + + # Three possibilities: + # * @contents_list is empty: replace it by @current_contents_list. + # * prefixes of the two lists are identical: do nothing + # * @current_contents_list is all at lower level than $contents_list[0]; + # prefix @contents_list by @current_contents_list + + if (scalar(@current_contents_list) == 0) + { die "empty current_contents_list"; } + + # if (scalar(@contents_list) == 0) + # { @contents_list = @current_contents_list; + # @current_contents_list = (); + # return; } + + # if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1])) + # { unshift @contents_list, @current_contents_list; + # @current_contents_list = (); + # return; } + + for (my $i=0; $i<scalar(@current_contents_list); $i++) + { my $ref_c_tdf = $current_contents_list[$i]; + if ($i >= scalar(@contents_list)) + { push @contents_list, $ref_c_tdf; + my $title = $ {$ref_c_tdf}[0]; + if (defined $contents_hash{$title}) + { $contents_fixups{$title} = 1; } + else + { $contents_hash{$title} = 1; } + next; } + my $ref_tdf = $contents_list[$i]; + my ($title, $depth, $file) = @{$ref_tdf}; + my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf}; + + if (($title ne $c_title) + && ($depth < $c_depth) + && ($file ne $c_file)) + { splice @contents_list, $i, 0, $ref_c_tdf; + if (defined $contents_hash{$c_title}) + { $contents_fixups{$c_title} = 1; } + else + { $contents_hash{$c_title} = 1; } + next; } + + if (($title ne $c_title) + || ($depth != $c_depth) + || ($file ne $c_file)) + { die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:", + "\n main: <<<$title>>> $depth $file", + "\n curr: <<<$c_title>>> $c_depth $c_file"); } + } + @current_contents_list = (); +} + + + +# Set @current_contents_list to a list of (title, href, sectionlevel); +# then merge that list into @contents_list. +# Maybe this function should also produce a map +# from title (or href) to sectionlevel (eg "chapter"?). +sub process_child_links ( $ ) +{ my ($he) = check_args(1, @_); + + # $he->dump; + if (scalar(@current_contents_list) != 0) + { die "current_contents_list nonempty: @current_contents_list"; } + $he->traverse(\&increment_current_contents_list, 'ignore text'); + + # Normalize the depths; for instance, convert 1,3,5 into 0,1,2. + my %depths = (); + for my $ref_tdf (@current_contents_list) + { $depths{$ {$ref_tdf}[1]} = 1; } + my @sorted_depths = sort keys %depths; + my $current_depth = scalar(@section_stack)-1; + my $current_depth_2 = $ {$current_ref_tdf}[1]; + if ($current_depth != $current_depth_2) + { die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); } + for (my $i=0; $i<scalar(@sorted_depths); $i++) + { $depths{$sorted_depths[$i]} = $i + $current_depth+1; } + for my $ref_tdf (@current_contents_list) + { $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; } + + # Eliminate uninteresting sections. Hard-coded hack for now. + if ($ {$current_contents_list[-1]}[0] eq "About this document ...") + { pop @current_contents_list; } + if ((scalar(@current_contents_list) > 1) + && ($ {$current_contents_list[1]}[0] eq "Contents")) + { my $ref_first_tdf = shift @current_contents_list; + $current_contents_list[0] = $ref_first_tdf; } + + for (my $i=0; $i<scalar(@current_contents_list); $i++) + { my $ref_tdf = $current_contents_list[$i]; + my $title = $ {$ref_tdf}[0]; + if (exists $index_info{$title}) + { my $index_file = $ {$ref_tdf}[2]; + my ($indexing_command, $suffix) = @{$index_info{$title}}; + process_index_file($index_file, $indexing_command); + print TEXI "\n\@defindex $suffix\n"; + push @index_titles, $title; + splice @current_contents_list, $i, 1; + $i--; } + elsif ($title =~ /\bIndex$/) + { print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } } + + merge_contents_lists(); + + # print_contents_list(); + # print_index_info(); +} + + +sub increment_current_contents_list ( $$$ ) +{ my ($he, $startflag, $depth) = check_args(3, @_); + if (!$startflag) + { return; } + + if ($he->tag eq "li") + { my @li_content = @{$he->content}; + if ($li_content[0]->tag ne "a") + { die "first element of <LI> should be <A>"; } + my ($name, $href, @content) = anchor_info($li_content[0]); + # unused $name + my $title = join("", collect_texts($li_content[0])); + $title = texi_remove_punctuation($title); + # The problem with these is that they are formatted differently in + # @menu and @node! + $title =~ s/``/\"/g; + $title =~ s/''/\"/g; + $title =~ s/ -- / /g; + push @current_contents_list, [ $title, $depth, $href ]; } + return 1; +} + +# Simple version for section titles +sub html_to_texi ( $ ) +{ my ($he) = check_args(1, @_); + if (!ref $he) + { return $he; } + + my $tag = $he->tag; + if (exists $inline_markup{$tag}) + { my $result = "\@$inline_markup{$tag}\{"; + for my $elt (@{$he->content}) + { $result .= html_to_texi($elt); } + $result .= "\}"; + return $result; } + else + { $he->dump; + die "html_to_texi confused by <$tag>"; } +} + + + +sub print_contents_list () +{ check_args(0, @_); + print STDERR "Contents list:\n"; + for my $ref_tdf (@contents_list) + { my ($title, $depth, $file) = @{$ref_tdf}; + print STDERR "$title $depth $file\n"; } +} + + + +########################################################################### +### Index +### + +my $l2h_broken_link_name = "l2h-"; + + +# map from file to (map from anchor name to (list of index texts)) +# (The list is needed when a single LaTeX command like \envvar +# expands to multiple \index commands.) +my %file_index_entries = (); +my %this_index_entries; # map from anchor name to (list of index texts) + +my %file_index_entries_broken = (); # map from file to (list of index texts) +my @this_index_entries_broken; + +my $index_prefix = ""; +my @index_prefixes = (); + +my $this_indexing_command; + +sub print_index_info () +{ check_args(0, @_); + my ($key, $val); + for my $file (sort keys %file_index_entries) + { my %index_entries = %{$file_index_entries{$file}}; + print STDERR "file: $file\n"; + for my $aname (sort keys %index_entries) + { my @entries = @{$index_entries{$aname}}; + if (scalar(@entries) == 1) + { print STDERR " $aname : $entries[0]\n"; } + else + { print STDERR " $aname : ", join("\n " . (" " x length($aname)), @entries), "\n"; } } } + for my $file (sort keys %file_index_entries_broken) + { my @entries = @{$file_index_entries_broken{$file}}; + print STDERR "file: $file\n"; + for my $entry (@entries) + { print STDERR " $entry\n"; } + } +} + + +sub process_index_file ( $$ ) +{ my ($file, $indexing_command) = check_args(2, @_); + # print "process_index_file $file $indexing_command\n"; + + my $he = file_to_tree($html_directory . $file); + # $he->dump(); + + $this_indexing_command = $indexing_command; + $he->traverse(\&process_if_index_dl_compact, 'ignore text'); + undef $this_indexing_command; + # print "process_index_file done\n"; +} + + +sub process_if_index_dl_compact ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + if (!$startflag) + { return; } + + if (($he->tag() eq "dl") && (defined $he->attr('compact'))) + { process_index_dl_compact($he); + return 0; } + else + { return 1; } +} + + +# The elements of a <DL COMPACT> list from a LaTeX2HTML index: +# * a single space: text to be ignored +# * <DT> elements with an optional <DD> element following each one +# Two types of <DT> elements: +# * Followed by a <DD> element: the <DT> contains a single +# string, and the <DD> contains a whitespace string to be ignored, a +# <DL COMPACT> to be recursively processed (with the <DT> string as a +# prefix), and a whitespace string to be ignored. +# * Not followed by a <DD> element: contains a list of anchors +# and texts (ignore the texts, which are only whitespace and commas). +# Optionally contains a <DL COMPACT> to be recursively processed (with +# the <DT> string as a prefix) +sub process_index_dl_compact ( $ ) +{ my ($h) = check_args(1, @_); + my @content = @{$h->content()}; + for (my $i = 0; $i < scalar(@content); $i++) + { my $this_he = $content[$i]; + if ($this_he->tag ne "dt") + { $this_he->dump; + die "Expected <DT> tag: " . $this_he->tag; } + if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd")) + { process_index_dt_and_dd($this_he, $content[$i+1]); + $i++; } + else + { process_index_lone_dt($this_he); } } } + + + +# Argument is a <DT> element. If it contains more than one anchor, then +# the texts of all subsequent ones are "[Link]". Example: +# <DT> +# <A HREF="embedding.html#l2h-201"> +# "$PATH" +# ", " +# <A HREF="embedding.html#l2h-205"> +# "[Link]" +# Optionally contains a <DL COMPACT> as well. Example: +# <DT> +# <A HREF="types.html#l2h-616"> +# "attribute" +# <DL COMPACT> +# <DT> +# <A HREF="assignment.html#l2h-3074"> +# "assignment" +# ", " +# <A HREF="assignment.html#l2h-3099"> +# "[Link]" +# <DT> +# <A HREF="types.html#l2h-"> +# "assignment, class" + +sub process_index_lone_dt ( $ ) +{ my ($dt) = check_args(1, @_); + my @dtcontent = @{$dt->content()}; + my $acontent; + my $acontent_suffix; + for my $a (@dtcontent) + { if ($a eq ", ") + { next; } + if (!ref $a) + { $dt->dump; + die "Unexpected <DT> string element: $a"; } + + if ($a->tag eq "dl") + { push @index_prefixes, $index_prefix; + if (!defined $acontent_suffix) + { die "acontent_suffix not yet defined"; } + $index_prefix .= $acontent_suffix . ", "; + process_index_dl_compact($a); + $index_prefix = pop(@index_prefixes); + return; } + + if ($a->tag ne "a") + { $dt->dump; + $a->dump; + die "Expected anchor in lone <DT>"; } + + my ($aname, $ahref, @acontent) = anchor_info($a); + # unused $aname + if (scalar(@acontent) != 1) + { die "Expected just one content of <A> in <DT>: @acontent"; } + if (ref $acontent[0]) + { $acontent[0]->dump; + die "Expected string content of <A> in <DT>: $acontent[0]"; } + if (!defined($acontent)) + { $acontent = $index_prefix . $acontent[0]; + $acontent_suffix = $acontent[0]; } + elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0]))) + { die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; } + + if (!defined $ahref) + { $dt->dump; + die "no HREF in nachor in <DT>"; } + my ($ahref_file, $ahref_name) = split(/\#/, $ahref); + if (!defined $ahref_name) + { # Reference to entire file + $ahref_name = ""; } + + if ($ahref_name eq $l2h_broken_link_name) + { if (!exists $file_index_entries_broken{$ahref_file}) + { $file_index_entries_broken{$ahref_file} = []; } + push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent"; + next; } + + if (!exists $file_index_entries{$ahref_file}) + { $file_index_entries{$ahref_file} = {}; } + # Don't do this! It appears to make a copy, which is not desired. + # my %index_entries = %{$file_index_entries{$ahref_file}}; + if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name}) + { $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; } + # { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name}; + # if ($acontent eq $oldcontent) + # { die "Multiple identical index entries?"; } + # die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; } + + push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent"; + # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n"; + } +} + +sub process_index_dt_and_dd ( $$ ) +{ my ($dt, $dd) = check_args(2, @_); + my $dtcontent; + { my @dtcontent = @{$dt->content()}; + if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0])) + { $dd->dump; + $dt->dump; + die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent"; } + $dtcontent = $dtcontent[0]; + $dtcontent =~ s/ +$//; } + my $ddcontent; + { my @ddcontent = @{$dd->content()}; + if (scalar(@ddcontent) != 1) + { die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; } + $ddcontent = $ddcontent[0]; } + if ($ddcontent->tag ne "dl") + { die "Expected <DL> as content of <DD>, but saw: $ddcontent"; } + + push @index_prefixes, $index_prefix; + $index_prefix .= $dtcontent . ", "; + process_index_dl_compact($ddcontent); + $index_prefix = pop(@index_prefixes); +} + + +########################################################################### +### Ordinary sections +### + +sub process_section_file ( $$$ ) +{ my ($file, $depth, $nodetitle) = check_args(3, @_); + my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file); + + # print STDERR "process_section_file: $file $depth $nodetitle\n"; + + # Equivalently: + # while ($depth >= scalar(@section_stack)) { pop(@section_stack); } + @section_stack = @section_stack[0..$depth-1]; + + # Not a great nodename fixup scheme; need a more global view + if ((defined $contents_fixups{$nodetitle}) + && (scalar(@section_stack) > 0)) + { my $up_title = $section_stack[$#section_stack]; + # hack for Python Standard Library + $up_title =~ s/^(Built-in|Standard) Module //g; + my ($up_first_word) = split(/ /, $up_title); + $nodetitle = "$up_first_word $nodetitle"; + } + + push @section_stack, $nodetitle; + # print STDERR "new section_stack: ", join(", ", @section_stack), "\n"; + + $he->traverse(\&process_if_child_links, 'ignore text'); + %footnotes = (); + # $he->dump; + $he->traverse(\&process_if_footnotes, 'ignore text'); + + # $he->dump; + + if (exists $file_index_entries{$file}) + { %this_index_entries = %{$file_index_entries{$file}}; + # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n"; + } + else + { # print STDERR "Warning: no index entries for file $file\n"; + %this_index_entries = (); } + + if (exists $file_index_entries_broken{$file}) + { @this_index_entries_broken = @{$file_index_entries_broken{$file}}; } + else + { # print STDERR "Warning: no index entries for file $file\n"; + @this_index_entries_broken = (); } + + + if ($he->tag() ne "html") + { die "Expected <HTML> at top level"; } + my @content = @{$he->content()}; + if ((!ref $content[0]) or ($content[0]->tag ne "head")) + { $he->dump; + die "<HEAD> not first element of <HTML>"; } + if ((!ref $content[1]) or ($content[1]->tag ne "body")) + { $he->dump; + die "<BODY> not second element of <HTML>"; } + + $content[1]->traverse(\&output_body); +} + +# stack of things we're inside that are preventing indexing from occurring now. +# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?) +my @index_deferrers = (); + +sub push_or_pop_index_deferrers ( $$ ) +{ my ($tag, $startflag) = check_args(2, @_); + if ($startflag) + { push @index_deferrers, $tag; } + else + { my $old_deferrer = pop @index_deferrers; + if ($tag ne $old_deferrer) + { die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers); } + do_deferred_index_entries(); } +} + + +sub label_add_index_entries ( $;$ ) +{ my ($label, $he) = check_args_range(1, 2, @_); + # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n"; + # $he is the anchor element + if (exists $this_index_entries{$label}) + { push @deferred_index_entries, @{$this_index_entries{$label}}; + return; } + + if ($label eq $l2h_broken_link_name) + { # Try to find some text to use in guessing which links should point here + # I should probably only look at the previous element, or if that is + # all punctuation, the one before it; collecting all the previous texts + # is a bit of overkill. + my @anchor_texts = collect_texts($he); + my @previous_texts = collect_texts($he->parent, $he); + # 4 elements is arbitrary; ought to filter out punctuation and small words + # first, then perhaps keep fewer. Perhaps also filter out formatting so + # that we can see a larger chunk of text? (Probably not.) + # Also perhaps should do further chunking into words, in case the + # index term isn't a chunk of its own (eg, was in <tt>...</tt>. + my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]); + + my $guessed = 0; + for my $text (@candidate_texts) + { # my $orig_text = $text; + if ($text =~ /^[\"\`\'().?! ]*$/) + { next; } + if (length($text) <= 2) + { next; } + # hack for Python manual; maybe defer until failure first time around? + $text =~ s/^sys\.//g; + for my $iterm (@this_index_entries_broken) + { # I could test for zero: LaTeX2HTML's failures in the Python + # documentation are only for items of the form "... (built-in...)" + if (index($iterm, $text) != -1) + { push @deferred_index_entries, $iterm; + # print STDERR "Guessing index term `$iterm' for text `$orig_text'\n"; + $guessed = 1; + } } } + if (!$guessed) + { # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n"; + } + } +} + + +# Need to add calls to this at various places. +# Perhaps add HTML::Element argument and do the check for appropriateness +# here (ie, no action if inside <H1>, etc.). +sub do_deferred_index_entries () +{ check_args(0, @_); + if ((scalar(@deferred_index_entries) > 0) + && (scalar(@index_deferrers) == 0)) + { print TEXI "\n", join("\n", @deferred_index_entries), "\n"; + @deferred_index_entries = (); } +} + +my $table_columns; # undefined if not in a table +my $table_first_column; # boolean + +sub output_body ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + + if (!ref $he) + { my $space_index = index($he, " "); + if ($space_index != -1) + { # Why does + # print TEXI texi_quote(substr($he, 0, $space_index+1)); + # give: Can't locate object method "TEXI" via package "texi_quote" + # (Because the definition texi_quote hasn't been seen yet.) + print TEXI &texi_quote(substr($he, 0, $space_index+1)); + do_deferred_index_entries(); + print TEXI &texi_quote(substr($he, $space_index+1)); } + else + { print TEXI &texi_quote($he); } + return; } + + my $tag = $he->tag(); + + # Ordinary text markup first + if (exists $inline_markup{$tag}) + { if ($startflag) + { print TEXI "\@$inline_markup{$tag}\{"; } + else + { print TEXI "\}"; } } + elsif ($tag eq "a") + { my ($name, $href, @content) = anchor_info($he); + if (!$href) + { # This anchor is only here for indexing/cross referencing purposes. + if ($startflag) + { label_add_index_entries($name, $he); } + } + elsif ($href =~ "^(ftp|http|news):") + { if ($startflag) + { # Should avoid second argument if it's identical to the URL. + print TEXI "\@uref\{$href, "; } + else + { print TEXI "\}"; } + } + elsif ($href =~ /^\#(foot[0-9]+)$/) + { # Footnote + if ($startflag) + { # Could double-check name and content, but I'm not + # currently storing that information. + print TEXI "\@footnote\{"; + $footnotes{$1}->traverse(\&output_body); + print TEXI "\}"; + return 0; } } + else + { if ($startflag) + { $he->dump; + warn "Can't deal with internal HREF anchors yet"; } + } + } + elsif ($tag eq "br") + { print TEXI "\@\n"; } + elsif ($tag eq "body") + { } + elsif ($tag eq "center") + { if (has_single_content_string($he) + && ($ {$he->content}[0] =~ /^ *$/)) + { return 0; } + if ($startflag) + { print TEXI "\n\@center\n"; } + else + { print TEXI "\n\@end center\n"; } + } + elsif ($tag eq "div") + { my $align = $he->attr('align'); + if (defined($align) && ($align eq "center")) + { if (has_single_content_string($he) + && ($ {$he->content}[0] =~ /^ *$/)) + { return 0; } + if ($startflag) + { print TEXI "\n\@center\n"; } + else + { print TEXI "\n\@end center\n"; } } + } + elsif ($tag eq "dl") + { # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example" + if (has_single_content_with_tag($he, "dd")) + { my $he_dd = $ {$he->content}[0]; + if (has_single_content_with_tag($he_dd, "pre")) + { my $he_pre = $ {$he_dd->content}[0]; + print_pre($he_pre); + return 0; } } + if ($startflag) + { # Could examine the elements, to be cleverer about formatting. + # (Also to use ftable, vtable...) + print TEXI "\n\@table \@asis\n"; } + else + { print TEXI "\n\@end table\n"; } + } + elsif ($tag eq "dt") + { push_or_pop_index_deferrers($tag, $startflag); + if ($startflag) + { print TEXI "\n\@item "; } + else + { } } + elsif ($tag eq "dd") + { if ($startflag) + { print TEXI "\n"; } + else + { } + if (scalar(@index_deferrers) != 0) + { $he->dump; + die "index deferrers: ", join(" ", @index_deferrers); } + do_deferred_index_entries(); + } + elsif ($tag =~ /^(font|big|small)$/) + { # Do nothing for now. + } + elsif ($tag =~ /^h[1-6]$/) + { # We don't need this because we never recursively enter the heading content. + # push_or_pop_index_deferrers($tag, $startflag); + my $secname = ""; + my @seclabels = (); + for my $elt (@{$he->content}) + { if (!ref $elt) + { $secname .= $elt; } + elsif ($elt->tag eq "br") + { } + elsif ($elt->tag eq "a") + { my ($name, $href, @acontent) = anchor_info($elt); + if ($href) + { $he->dump; + $elt->dump; + die "Nonsimple anchor in <$tag>"; } + if (!defined $name) + { die "No NAME for anchor in $tag"; } + push @seclabels, $name; + for my $subelt (@acontent) + { $secname .= html_to_texi($subelt); } } + else + { $secname .= html_to_texi($elt); } } + if ($secname eq "") + { die "No section name in <$tag>"; } + if (scalar(@section_stack) == 1) + { if ($section_stack[-1] ne "Top") + { die "Not top? $section_stack[-1]"; } + print TEXI "\@settitle $secname\n"; + print TEXI "\@c %**end of header\n"; + print TEXI "\n"; + print TEXI "\@node Top\n"; + print TEXI "\n"; } + else + { print TEXI "\n\@node $section_stack[-1]\n"; + print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; } + for my $seclabel (@seclabels) + { label_add_index_entries($seclabel); } + # This should only happen once per file. + label_add_index_entries(""); + if (scalar(@index_deferrers) != 0) + { die "index deferrers: ", join(" ", @index_deferrers); } + do_deferred_index_entries(); + return 0; + } + elsif ($tag eq "hr") + { } + elsif ($tag eq "ignore") + { # Hack for ignored elements + return 0; + } + elsif ($tag eq "li") + { if ($startflag) + { print TEXI "\n\n\@item\n"; + do_deferred_index_entries(); } } + elsif ($tag eq "ol") + { if ($startflag) + { print TEXI "\n\@enumerate \@bullet\n"; } + else + { print TEXI "\n\@end enumerate\n"; } } + elsif ($tag eq "p") + { if ($startflag) + { print TEXI "\n\n"; } + if (scalar(@index_deferrers) != 0) + { die "index deferrers: ", join(" ", @index_deferrers); } + do_deferred_index_entries(); } + elsif ($tag eq "pre") + { print_pre($he); + return 0; } + elsif ($tag eq "table") + { # Could also indicate common formatting for first column, or + # determine relative widths for columns (or determine a prototype row) + if ($startflag) + { if (defined $table_columns) + { $he->dump; + die "Can't deal with table nested inside $table_columns-column table"; } + $table_columns = table_columns($he); + if ($table_columns < 2) + { $he->dump; + die "Column with $table_columns columns?"; } + elsif ($table_columns == 2) + { print TEXI "\n\@table \@asis\n"; } + else + { print TEXI "\n\@multitable \@columnfractions"; + for (my $i=0; $i<$table_columns; $i++) + { print TEXI " ", 1.0/$table_columns; } + print TEXI "\n"; } } + else + { if ($table_columns == 2) + { print TEXI "\n\@end table\n"; } + else + { print TEXI "\n\@end multitable\n"; } + undef $table_columns; } } + elsif (($tag eq "td") || ($tag eq "th")) + { if ($startflag) + { if ($table_first_column) + { print TEXI "\n\@item "; + $table_first_column = 0; } + elsif ($table_columns > 2) + { print TEXI "\n\@tab "; } } + else + { print TEXI "\n"; } } + elsif ($tag eq "tr") + { if ($startflag) + { $table_first_column = 1; } } + elsif ($tag eq "ul") + { if ($startflag) + { print TEXI "\n\@itemize \@bullet\n"; } + else + { print TEXI "\n\@end itemize\n"; } } + else + { print STDERR "\nBailing out\n"; + $he->dump; + return 0; } + + return 1; +} + +sub print_pre ( $ ) +{ my ($he_pre) = check_args(1, @_); + if (!has_single_content_string($he_pre)) + { die "Multiple or non-string content for <PRE>: ", @{$he_pre->content}; } + my $pre_content = $ {$he_pre->content}[0]; + print TEXI "\n\@example"; + print TEXI &texi_quote($pre_content); + print TEXI "\@end example\n"; +} + +sub table_columns ( $ ) +{ my ($table) = check_args(1, @_); + my $result = 0; + for my $row (@{$table->content}) + { if ($row->tag ne "tr") + { $table->dump; + $row->dump; + die "Expected <TR> as table row."; } + $result = max($result, scalar(@{$row->content})); } + return $result; +} + + +########################################################################### +### Utilities +### + +sub min ( $$ ) +{ my ($x, $y) = check_args(2, @_); + return ($x < $y) ? $x : $y; +} + +sub max ( $$ ) +{ my ($x, $y) = check_args(2, @_); + return ($x > $y) ? $x : $y; +} + +sub file_to_tree ( $ ) +{ my ($file) = check_args(1, @_); + + my $tree = new HTML::TreeBuilder; + $tree->ignore_unknown(1); + # $tree->warn(1); + $tree->parse_file($file); + cleanup_parse_tree($tree); + return $tree +} + + +sub has_single_content ( $ ) +{ my ($he) = check_args(1, @_); + if (!ref $he) + { # return 0; + die "Non-reference argument: $he"; } + my $ref_content = $he->content; + if (!defined $ref_content) + { return 0; } + my @content = @{$ref_content}; + if (scalar(@content) != 1) + { return 0; } + return 1; +} + + +# Return true if the content of the element contains only one element itself, +# and that inner element has the specified tag. +sub has_single_content_with_tag ( $$ ) +{ my ($he, $tag) = check_args(2, @_); + if (!has_single_content($he)) + { return 0; } + my $content = $ {$he->content}[0]; + if (!ref $content) + { return 0; } + my $content_tag = $content->tag; + if (!defined $content_tag) + { return 0; } + return $content_tag eq $tag; +} + +sub has_single_content_string ( $ ) +{ my ($he) = check_args(1, @_); + if (!has_single_content($he)) + { return 0; } + my $content = $ {$he->content}[0]; + if (ref $content) + { return 0; } + return 1; +} + + +# Return name, href, content. First two may be undefined; third is an array. +# I don't see how to determine if there are more attributes. +sub anchor_info ( $ ) +{ my ($he) = check_args(1, @_); + if ($he->tag ne "a") + { $he->dump; + die "passed non-anchor to anchor_info"; } + my $name = $he->attr('name'); + my $href = $he->attr('href'); + my @content = (); + { my $ref_content = $he->content; + if (defined $ref_content) + { @content = @{$ref_content}; } } + return ($name, $href, @content); +} + + +sub texi_quote ( $ ) +{ my ($text) = check_args(1, @_); + $text =~ s/([\@\{\}])/\@$1/g; + $text =~ s/ -- / --- /g; + return $text; +} + +# Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles. +sub texi_remove_punctuation ( $ ) +{ my ($text) = check_args(1, @_); + + $text =~ s/^ +//g; + $text =~ s/[ :]+$//g; + $text =~ s/^[1-9][0-9.]* +//g; + $text =~ s/,//g; + # Both embedded colons and " -- " confuse makeinfo. (Perhaps " -- " + # gets converted into " - ", just as "---" would be converted into " -- ", + # so the names end up differing.) + # $text =~ s/:/ -- /g; + $text =~ s/://g; + return $text; +} + + +## Do not use this inside `traverse': it throws off the traversal. Use +## html_replace_by_ignore or html_replace_by_meta instead. +# Returns 1 if success, 0 if failure. +sub html_remove ( $;$ ) +{ my ($he, $parent) = check_args_range(1, 2, @_); + if (!defined $parent) + { $parent = $he->parent; } + my $ref_pcontent = $parent->content; + my @pcontent = @{$ref_pcontent}; + for (my $i=0; $i<scalar(@pcontent); $i++) + { if ($pcontent[$i] eq $he) + { splice @{$ref_pcontent}, $i, 1; + $he->parent(undef); + return 1; } } + die "Didn't find $he in $parent"; +} + + +sub html_replace ( $$;$ ) +{ my ($orig, $new, $parent) = check_args_range(2, 3, @_); + if (!defined $parent) + { $parent = $orig->parent; } + my $ref_pcontent = $parent->content; + my @pcontent = @{$ref_pcontent}; + for (my $i=0; $i<scalar(@pcontent); $i++) + { if ($pcontent[$i] eq $orig) + { $ {$ref_pcontent}[$i] = $new; + $new->parent($parent); + $orig->parent(undef); + return 1; } } + die "Didn't find $orig in $parent"; +} + +sub html_replace_by_meta ( $;$ ) +{ my ($orig, $parent) = check_args_range(1, 2, @_); + my $meta = new HTML::Element "meta"; + if (!defined $parent) + { $parent = $orig->parent; } + return html_replace($orig, $meta, $parent); +} + +sub html_replace_by_ignore ( $;$ ) +{ my ($orig, $parent) = check_args_range(1, 2, @_); + my $ignore = new HTML::Element "ignore"; + if (!defined $parent) + { $parent = $orig->parent; } + return html_replace($orig, $ignore, $parent); +} + + + +### +### Collect text elements +### + +my @collected_texts; +my $collect_texts_stoppoint; +my $done_collecting; + +sub collect_texts ( $;$ ) +{ my ($root, $stop) = check_args_range(1, 2, @_); + # print STDERR "collect_texts: $root $stop\n"; + $collect_texts_stoppoint = $stop; + $done_collecting = 0; + @collected_texts = (); + $root->traverse(\&collect_if_text); # process texts + # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n"; + return @collected_texts; +} + +sub collect_if_text ( $$$ ) +{ my $he = (check_args(3, @_))[0]; # ignore depth and startflag arguments + if ($done_collecting) + { return 0; } + if (!defined $he) + { return 0; } + if (!ref $he) + { push @collected_texts, $he; + return 0; } + if ((defined $collect_texts_stoppoint) && ($he eq $collect_texts_stoppoint)) + { $done_collecting = 1; + return 0; } + return 1; +} + + +########################################################################### +### Clean up parse tree +### + +sub cleanup_parse_tree ( $ ) +{ my ($he) = check_args(1, @_); + $he->traverse(\&delete_if_navigation, 'ignore text'); + $he->traverse(\&delete_extra_spaces, 'ignore text'); + $he->traverse(\&merge_dl, 'ignore text'); + return $he; +} + + +## Simpler version that deletes contents but not the element itself. +# sub delete_if_navigation ( $$$ ) +# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth +# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation')) +# { $he->delete(); +# return 0; } +# else +# { return 1; } +# } + +sub delete_if_navigation ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + if (!$startflag) + { return; } + + if (($he->tag() eq "div") && (defined $he->attr('class')) && ($he->attr('class') eq 'navigation')) + { my $ref_pcontent = $he->parent()->content(); + # Don't try to modify @pcontent, which appears to be a COPY. + # my @pcontent = @{$ref_pcontent}; + for (my $i = 0; $i<scalar(@{$ref_pcontent}); $i++) + { if (${$ref_pcontent}[$i] eq $he) + { splice(@{$ref_pcontent}, $i, 1); + last; } } + $he->delete(); + return 0; } + else + { return 1; } +} + +sub delete_extra_spaces ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + if (!$startflag) + { return; } + + my $tag = $he->tag; + if ($tag =~ /^(head|html|table|tr|ul)$/) + { delete_child_spaces($he); } + delete_trailing_spaces($he); + return 1; +} + + +sub delete_child_spaces ( $ ) +{ my ($he) = check_args(1, @_); + my $ref_content = $he->content(); + for (my $i = 0; $i<scalar(@{$ref_content}); $i++) + { if ($ {$ref_content}[$i] =~ /^ *$/) + { splice(@{$ref_content}, $i, 1); + $i--; } } +} + +sub delete_trailing_spaces ( $ ) +{ my ($he) = check_args(1, @_); + my $ref_content = $he->content(); + if (! defined $ref_content) + { return; } + # Could also check for previous element = /^h[1-6]$/. + for (my $i = 0; $i<scalar(@{$ref_content})-1; $i++) + { if ($ {$ref_content}[$i] =~ /^ *$/) + { my $next_elt = $ {$ref_content}[$i+1]; + if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) + { splice(@{$ref_content}, $i, 1); + $i--; } } } + if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/) + { my $last_elt = $ {$ref_content}[$#{$ref_content}]; + if ((defined $last_elt) && ($last_elt =~ /^ *$/)) + { pop @{$ref_content}; } } +} + + +# If we find a paragraph that looks like +# <P> +# <HR> +# <UL> +# then accumulate its links into a contents_list and delete the paragraph. +sub process_if_child_links ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + if (!$startflag) + { return; } + + if ($he->tag() eq "p") + { my $ref_content = $he->content(); + if (defined $ref_content) + { my @content = @{$ref_content}; + if ((scalar(@content) == 2) + && (ref $content[0]) && $content[0]->tag() eq "hr" + && (ref $content[1]) && $content[1]->tag() eq "ul") + { process_child_links($he); + $he->delete(); + return 0; } } } + return 1; +} + + +# If we find +# <H4> +# "Footnotes" +# <DL> +# <DT> +# <A NAME="foot560"> +# "...borrow" +# <A HREF="refcountsInPython.html#tex2html2" NAME="foot560"> +# "1.2" +# <DD> +# "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. " +# ... +# then record the footnote information and delete the section and list. + +my $process_if_footnotes_expect_dl_next = 0; + +sub process_if_footnotes ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + if (!$startflag) + { return; } + + if (($he->tag() eq "h4") + && has_single_content_string($he) + && ($ {$he->content}[0] eq "Footnotes")) + { html_replace_by_ignore($he); + $process_if_footnotes_expect_dl_next = 1; + return 0; } + + if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl")) + { my $ref_content = $he->content(); + if (defined $ref_content) + { $process_if_footnotes_expect_dl_next = 0; + my @content = @{$ref_content}; + for (my $i=0; $i<$#content; $i+=2) + { my $he_dt = $content[$i]; + my $he_dd = $content[$i+1]; + if (($he_dt->tag ne "dt") || ($he_dd->tag ne "dd")) + { $he->dump; + die "expected <DT> and <DD> at positions $i and ", $i+1; } + my @dt_content = @{$he_dt->content()}; + if ((scalar(@dt_content) != 2) + || ($dt_content[0]->tag ne "a") + || ($dt_content[1]->tag ne "a")) + { $he_dt->dump; + die "Expected 2 anchors as content of <DT>"; } + my ($dt1_name, $dt1_href, $dt1_content) = anchor_info($dt_content[0]); + my ($dt2_name, $dt2_href, $dt2_content) = anchor_info($dt_content[0]); + # unused: $dt1_href, $dt1_content, $dt2_href, $dt2_content + if ($dt1_name ne $dt2_name) + { $he_dt->dump; + die "Expected identical names for anchors"; } + html_replace_by_ignore($he_dd); + $he_dd->tag("div"); # has no effect + $footnotes{$dt1_name} = $he_dd; } + html_replace_by_ignore($he); + return 0; } } + + if ($process_if_footnotes_expect_dl_next) + { $he->dump; + die "Expected <DL> for footnotes next"; } + + return 1; +} + + + +## Merge two adjacent paragraphs containing <DL> items, such as: +# <P> +# <DL> +# <DT> +# ... +# <DD> +# ... +# <P> +# <DL> +# <DT> +# ... +# <DD> +# ... + +sub merge_dl ( $$$ ) +{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument + if (!$startflag) + { return; } + + my $ref_content = $he->content; + if (!defined $ref_content) + { return; } + my $i = 0; + while ($i < scalar(@{$ref_content})-1) + { my $p1 = $ {$ref_content}[$i]; + if ((ref $p1) && ($p1->tag eq "p") + && has_single_content_with_tag($p1, "dl")) + { my $dl1 = $ {$p1->content}[0]; + # In this loop, rhs, not lhs, of < comparison changes, + # because we are removing elements from the content of $he. + while ($i < scalar(@{$ref_content})-1) + { my $p2 = $ {$ref_content}[$i+1]; + if (!((ref $p2) && ($p2->tag eq "p") + && has_single_content_with_tag($p2, "dl"))) + { last; } + # Merge these two elements. + splice(@{$ref_content}, $i+1, 1); # remove $p2 + my $dl2 = $ {$p2->content}[0]; + $dl1->push_content(@{$dl2->content}); # put $dl2's content in $dl1 + } + # extra increment because next element isn't a candidate for $p1 + $i++; } + $i++; } + return 1; +} + + + +########################################################################### +### Testing +### + +sub test ( $$ ) +{ my ($action, $file) = check_args(2, @_); + + # General testing + if (($action eq "view") || ($action eq "")) + { # # $file = "/homes/gws/mernst/www/links.html"; + # # $file = "/homes/gws/mernst/www/index.html"; + # # $file = "/homes/fish/mernst/java/gud/doc/manual.html"; + # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html"; + # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html"; + # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html"; + my $tree = file_to_tree($file); + + ## Testing + # print STDERR $tree->as_HTML; + $tree->dump(); + + # print STDERR $tree->tag(), "\n"; + # print STDERR @{$tree->content()}, "\n"; + # + # for (@{ $tree->extract_links(qw(a img)) }) { + # my ($link, $linkelem) = @$_; + # print STDERR "$link ", $linkelem->as_HTML; + # } + # + # print STDERR @{$tree->extract_links()}, "\n"; + + # my @top_level_elts = @{$tree->content()}; + + # if scalar(@{$tree->content()}) + return; + } + + elsif ($action eq "raw") + { my $tree = new HTML::TreeBuilder; + $tree->ignore_unknown(1); + # $tree->warn(1); + $tree->parse_file($file); + + $tree->dump(); + + # cleanup_parse_tree($tree); + # $tree->dump(); + return; + } + + # Test dealing with a section. + elsif ($action eq "section") + { # my $file; + # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html"; + # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html"; + # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html"; + process_section_file($file, 0, "Title"); + } + + # Test dealing with many sections + elsif (0) + { my @files = ("/homes/fish/mernst/tmp/python-doc/html/api/about.html", + "/homes/fish/mernst/tmp/python-doc/html/api/abstract.html", + "/homes/fish/mernst/tmp/python-doc/html/api/api.html", + "/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/concrete.html", + # "/homes/fish/mernst/tmp/python-doc/html/api/contents.html", + "/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html", + "/homes/fish/mernst/tmp/python-doc/html/api/debugging.html", + "/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/embedding.html", + "/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html", + "/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html", + "/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/front.html", + "/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html", + # "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html", + "/homes/fish/mernst/tmp/python-doc/html/api/importing.html", + "/homes/fish/mernst/tmp/python-doc/html/api/includes.html", + "/homes/fish/mernst/tmp/python-doc/html/api/index.html", + "/homes/fish/mernst/tmp/python-doc/html/api/initialization.html", + "/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/intro.html", + "/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/mapping.html", + "/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html", + "/homes/fish/mernst/tmp/python-doc/html/api/node24.html", + "/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html", + "/homes/fish/mernst/tmp/python-doc/html/api/number.html", + "/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/object.html", + "/homes/fish/mernst/tmp/python-doc/html/api/objects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/os.html", + "/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/processControl.html", + "/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html", + "/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html", + "/homes/fish/mernst/tmp/python-doc/html/api/sequence.html", + "/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html", + "/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/threads.html", + "/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html", + "/homes/fish/mernst/tmp/python-doc/html/api/types.html", + "/homes/fish/mernst/tmp/python-doc/html/api/utilities.html", + "/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html"); + for my $file (@files) + { print STDERR "\n", "=" x 75, "\n", "$file:\n"; + process_section_file($file, 0, "Title"); + } + } + + # Test dealing with index. + elsif ($action eq "index") + { # my $file; + # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html"; + + process_index_file($file, "\@cindex"); + print_index_info(); + } + + else + { die "Unrecognized action `$action'"; } +} + + +########################################################################### +### Main loop +### + +sub process_contents_file ( $ ) +{ my ($file) = check_args(1, @_); + + # could also use File::Basename + my $info_file = $file; + $info_file =~ s/(\/?index)?\.html$//; + if ($info_file eq "") + { chomp($info_file = `pwd`); } + $info_file =~ s/^.*\///; # not the most efficient way to remove dirs + + $html_directory = $file; + $html_directory =~ s/(\/|^)[^\/]+$/$1/; + + my $texi_file = "$info_file.texi"; + open(TEXI, ">$texi_file"); + + print TEXI "\\input texinfo \@c -*-texinfo-*-\n"; + print TEXI "\@c %**start of header\n"; + print TEXI "\@setfilename $info_file\n"; + + # 2. Summary Description and Copyright + # The "Summary Description and Copyright" segment describes the + # document and contains the copyright notice and copying permissions + # for the Info file. The segment must be enclosed between `@ifinfo' + # and `@end ifinfo' commands so that the formatters place it only in + # the Info file. + # + # The summary description and copyright segment does not appear in the + # printed document. + # + # @ifinfo + # This is a short example of a complete Texinfo file. + # + # Copyright @copyright{} 1990 Free Software Foundation, Inc. + # @end ifinfo + + + # 3. Title and Copyright + # The "Title and Copyright" segment contains the title and copyright + # pages and copying permissions for the printed manual. The segment + # must be enclosed between `@titlepage' and `@end titlepage' + # commands. The title and copyright page appear only in the printed + # manual. + # + # The titlepage segment does not appear in the Info file. + # + # @titlepage + # @sp 10 + # @comment The title is printed in a large font. + # @center @titlefont{Sample Title} + # + # @c The following two commands start the copyright page. + # @page + # @vskip 0pt plus 1filll + # Copyright @copyright{} 1990 Free Software Foundation, Inc. + # @end titlepage + + + # 4. `Top' Node and Master Menu + # The "Master Menu" contains a complete menu of all the nodes in the + # whole Info file. It appears only in the Info file, in the `Top' + # node. + # + # The `Top' node contains the master menu for the Info file. Since a + # printed manual uses a table of contents rather than a menu, the master + # menu appears only in the Info file. + # + # @node Top, First Chapter, , (dir) + # @comment node-name, next, previous, up + # + # @menu + # * First Chapter:: The first chapter is the + # only chapter in this sample. + # * Concept Index:: This index has two entries. + # @end menu + + + + $current_ref_tdf = [ "Top", 0, $ARGV[0] ]; + process_section_file($file, 0, "Top"); + while (scalar(@contents_list)) + { $current_ref_tdf = shift @contents_list; + process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]); + } + + print TEXI "\n"; + for my $indextitle (@index_titles) + { print TEXI "\@node $indextitle\n"; + print TEXI "\@unnumbered $indextitle\n"; + print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n"; + print TEXI "\n"; } + + print TEXI "\@contents\n"; + print TEXI "\@bye\n"; + close(TEXI); +} + +# This needs to be last so global variable initializations are reached. + +if (scalar(@ARGV) == 0) +{ die "No arguments supplied to html2texi.pl"; } + +if ($ARGV[0] eq "-test") +{ my @test_args = @ARGV[1..$#ARGV]; + if (scalar(@test_args) == 0) + { test("", "index.html"); } + elsif (scalar(@test_args) == 1) + { test("", $test_args[0]); } + elsif (scalar(@test_args) == 2) + { test($test_args[0], $test_args[1]); } + else + { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); } + exit(); +} + +if (scalar(@ARGV) != 1) +{ die "Pass one argument, the main/contents page"; } + +process_contents_file($ARGV[0]); |