New conversion tools for HTML->info from Michael Ernst

<mernst@cs.washington.edu>. Thanks!
author: Fred Drake <fdrake@acm.org> 1999-01-08 15:25:29 (GMT)
committer: Fred Drake <fdrake@acm.org> 1999-01-08 15:25:29 (GMT)
commit: 3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8 (patch)
tree: 0074eae07130a12e34af18ec6c354367f653c19e
parent: 41498435ba1f2901e2e3498d7f7f5c773a602568 (diff)
download: cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.zip
cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.tar.gz
cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.tar.bz2
2 files changed, 1778 insertions, 0 deletions
diff --git a/Doc/tools/checkargs.pm b/Doc/tools/checkargs.pm
new file mode 100644
index 0000000..de52f69
--- /dev/null
+++ b/Doc/tools/checkargs.pm
@@ -0,0 +1,112 @@
+#!/uns/bin/perl
+
+package checkargs;
+require 5.004;			# uses "for my $var"
+require Exporter;
+@ISA = qw(Exporter);
+@EXPORT = qw(check_args check_args_range check_args_at_least);
+use strict;
+use Carp;
+
+=head1 NAME
+
+checkargs -- Provide rudimentary argument checking for perl5 functions
+
+=head1 SYNOPSIS
+
+  check_args(cArgsExpected, @_)
+  check_args_range(cArgsMin, cArgsMax, @_)
+  check_args_at_least(cArgsMin, @_)
+where "@_" should be supplied literally.
+
+=head1 DESCRIPTION
+
+As the first line of user-written subroutine foo, do one of the following:
+
+  my ($arg1, $arg2) = check_args(2, @_);
+  my ($arg1, @rest) = check_args_range(1, 4, @_);
+  my ($arg1, @rest) = check_args_at_least(1, @_);
+  my @args = check_args_at_least(0, @_);
+
+These functions may also be called for side effect (put a call to one
+of the functions near the beginning of the subroutine), but using the
+argument checkers to set the argument list is the recommended usage.
+
+The number of arguments and their definedness are checked; if the wrong
+number are received, the program exits with an error message.
+
+=head1 AUTHOR
+
+Michael D. Ernst <F<mernst@cs.washington.edu>>
+
+=cut
+
+## Need to check that use of caller(1) really gives desired results.
+## Need to give input chunk information.
+## Is this obviated by Perl 5.003's declarations?  Not entirely, I think.
+
+# check_args(NUM_FORMALS, @_)
+# Verify that exactly NUM_FORMALS arguments follow NUM_FORMALS and that each
+# of them is defined, then return those arguments unchanged.
+# A wrong count or an undefined argument is fatal (die); the message is
+# prefixed with the file, line, and subroutine name reported by caller(1),
+# i.e. the frame of the subroutine that invoked check_args.
+# Misuse of check_args itself (no arguments at all, or a scalar-context call
+# when arguments are expected) is reported with croak.
+sub check_args ( $@ )
+{
+  my ($num_formals, @args) = @_;
+  my ($pack, $file_arg, $line_arg, $subname, $hasargs, $wantarr) = caller(1);
+  # The expected count is the only argument check_args itself requires.
+  if (@_ < 1) { croak "check_args needs at least 1 arg, got ", scalar(@_), ": @_\n "; }
+  # wantarray is undef in void context.  A call made only for its side
+  # effect is a documented usage, so reject just a genuine scalar context,
+  # where the returned list would collapse to a count.
+  if (defined(wantarray) && (!wantarray) && ($num_formals != 0))
+    { croak "check_args called in scalar context"; }
+  # Can't use croak below here: it would only go out to caller, not its caller
+  my $num_actuals = @args;
+  if ($num_actuals != $num_formals)
+    { die "$file_arg:$line_arg: function $subname expected $num_formals argument",
+      (($num_formals == 1) ? "" : "s"),
+      ", got $num_actuals",
+      (($num_actuals == 0) ? "" : ": @args"),
+      "\n"; }
+  # Report the first undefined argument, listing the arguments before it.
+  for my $index (0..$#args)
+    { if (!defined($args[$index]))
+        { die "$file_arg:$line_arg: function $subname undefined argument ", $index+1, ": @args[0..$index-1]\n"; } }
+  return @args;
+}
+
+# check_args_range(MIN_FORMALS, MAX_FORMALS, @_)
+# Verify that between MIN_FORMALS and MAX_FORMALS arguments (inclusive)
+# follow the two bounds and that each of them is defined, then return those
+# arguments unchanged.
+# A count outside the range or an undefined argument is fatal (die); the
+# message is prefixed with the file, line, and subroutine name reported by
+# caller(1).  Misuse of check_args_range itself is reported with croak.
+sub check_args_range ( $$@ )
+{
+  my ($min_formals, $max_formals, @args) = @_;
+  my ($pack, $file_arg, $line_arg, $subname, $hasargs, $wantarr) = caller(1);
+  # Both bounds are required.
+  if (@_ < 2) { croak "check_args_range needs at least 2 args, got ", scalar(@_), ": @_"; }
+  # wantarray is undef in void context.  A call made only for its side
+  # effect is a documented usage, so reject just a genuine scalar context
+  # (and, as before, only when both bounds are nonzero).
+  if (defined(wantarray) && (!wantarray) && ($max_formals != 0) && ($min_formals !=0) )
+    { croak "check_args_range called in scalar context"; }
+  # Can't use croak below here: it would only go out to caller, not its caller
+  my $num_actuals = @args;
+  if (($num_actuals < $min_formals) || ($num_actuals > $max_formals))
+    { die "$file_arg:$line_arg: function $subname expected $min_formals-$max_formals arguments, got $num_actuals",
+      ($num_actuals == 0) ? "" : ": @args", "\n"; }
+  # Report the first undefined argument, listing the arguments before it.
+  for my $index (0..$#args)
+    { if (!defined($args[$index]))
+        { die "$file_arg:$line_arg: function $subname undefined argument ", $index+1, ": @args[0..$index-1]\n"; } }
+  return @args;
+}
+
+# check_args_at_least(MIN_FORMALS, @_)
+# Verify that at least MIN_FORMALS arguments follow MIN_FORMALS, then return
+# those arguments unchanged.
+# Too few arguments is fatal (die); the message is prefixed with the file,
+# line, and subroutine name reported by caller(1).  Unlike check_args and
+# check_args_range, an undefined argument only produces a warning (for the
+# first undefined argument found) and the arguments are still returned.
+# Misuse of check_args_at_least itself is reported with croak.
+sub check_args_at_least ( $@ )
+{
+  my ($min_formals, @args) = @_;
+  my ($pack, $file_arg, $line_arg, $subname, $hasargs, $wantarr) = caller(1);
+  # Don't do this, because we want every sub to start with a call to check_args*
+  # if ($min_formals == 0)
+  #   { die "Isn't it pointless to check for at least zero args to $subname?\n"; }
+  if (scalar(@_) < 1)
+    { croak "check_args_at_least needs at least 1 arg, got ", scalar(@_), ": @_"; }
+  # wantarray is undef in void context.  A call made only for its side
+  # effect is a documented usage, so reject just a genuine scalar context,
+  # where the returned list would collapse to a count.
+  if (defined(wantarray) && (!wantarray) && ($min_formals != 0))
+    { croak "check_args_at_least called in scalar context"; }
+  # Can't use croak below here: it would only go out to caller, not its caller
+  my $num_actuals = @args;
+  if ($num_actuals < $min_formals)
+    { die "$file_arg:$line_arg: function $subname expected at least $min_formals argument",
+      ($min_formals == 1) ? "" : "s",
+      ", got $num_actuals",
+      ($num_actuals == 0) ? "" : ": @args", "\n"; }
+  # Warn about the first undefined argument only, then stop looking.
+  for my $index (0..$#args)
+    { if (!defined($args[$index]))
+        { warn "$file_arg:$line_arg: function $subname undefined argument ", $index+1, ": @args[0..$index-1]\n"; last; } }
+  return @args;
+}
+
+1;				# successful import
+__END__
diff --git a/Doc/tools/html2texi.pl b/Doc/tools/html2texi.pl
new file mode 100755
index 0000000..cf3595f
--- /dev/null
+++ b/Doc/tools/html2texi.pl
@@ -0,0 +1,1666 @@
+#! /usr/bin/env perl -w
+# html2texi.pl -- Convert HTML documentation to Texinfo format
+# Michael Ernst <mernst@cs.washington.edu>
+# Time-stamp: <1998-09-10 12:52:38 mernst>
+
+# This program converts HTML documentation trees into Texinfo format.
+# Given the name of a main (or contents) HTML file, it processes that file,
+# and other files (transitively) referenced by it, into a Texinfo file
+# (whose name is chosen from the file or directory name of the argument).
+# For instance:
+#   html2texi.pl api/index.html
+# produces file "api.texi".
+
+# Texinfo format can be easily converted to Info format (for browsing in
+# Emacs or the standalone Info browser), to a printed manual, or to HTML.
+# Thus, html2texi.pl permits conversion of HTML files to Info format, and
+# secondarily enables producing printed versions of Web page hierarchies.
+
+# Unlike HTML, Info format is searchable.  Since Info is integrated into
+# Emacs, one can read documentation without starting a separate Web
+# browser.  Additionally, Info browsers (including Emacs) contain
+# convenient features missing from Web browsers, such as easy index lookup
+# and mouse-free browsing.
+
+# Limitations:
+# html2texi.pl is currently tuned to latex2html output, but should be
+# extensible to arbitrary HTML documents.  It will be most useful for HTML
+# with a hierarchical structure and an index.  The HTML tree to be
+# traversed must be on local disk, rather than being accessed via HTTP.
+# This script requires the use of "checkargs.pm".  To eliminate that
+# dependence, replace calls to check_args* by @_ (which is always the last
+# argument to those functions).
+# Also see the "to do" section, below.
+# Comments, suggestions, bug fixes, and enhancements are welcome.
+
+###
+### Typical usage for the Python documentation:
+###
+
+# (Actually, most of this is in a Makefile instead.)
+# The resulting Info format Python documentation is currently available at
+# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
+
+# Fix up HTML problems, eg <DL COMPACT><DD>
+
+# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
+# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
+# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
+# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
+# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
+# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
+
+# Edit the generated .texi files:
+#   * change @setfilename to prefix "python-"
+#   * fix up any sectioning, such as for Abstract
+#   * make Texinfo menus
+#   * perhaps remove the @detailmenu ... @end detailmenu
+# In Emacs:
+#   (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
+
+# makeinfo api.texi
+# makeinfo ext.texi
+# makeinfo lib.texi
+# makeinfo mac.texi
+# makeinfo ref.texi
+# makeinfo tut.texi
+
+
+###
+### Structure of the code
+###
+
+# To be written...
+
+
+###
+### Design decisions
+###
+
+# Source and destination languages
+# --------------------------------
+# 
+# The goal is Info files; I create Texinfo, so I don't have to worry about
+# the finer details of Info file creation.  (I'm not even sure of its exact
+# format.)
+# 
+# Why not start from LaTeX rather than HTML?
+# I could hack latex2html itself to produce Texinfo instead, or fix up
+# partparse.py (which already translates LaTeX to Texinfo).
+#  Pros:
+#   * has high-level information such as index entries, original formatting
+#  Cons:
+#   * those programs are complicated to read and understand
+#   * those programs try to handle arbitrary LaTeX input, track catcodes,
+#     and more:  I don't want to go to that effort.  HTML isn't as powerful
+#     as LaTeX, so there are fewer subtleties.
+#   * the result wouldn't work for arbitrary HTML documents; it would be
+#     nice to eventually extend this program to HTML produced from Docbook,
+#     Frame, and more.
+
+# Parsing
+# -------
+# 
+# I don't want to view the text as a linear stream; I'd rather parse the
+# whole thing and then do pattern matching over the parsed representation (to
+# find idioms such as indices, lists of child nodes, etc.).
+#  * Perl provides HTML::TreeBuilder, which does just what I want.
+#     * libwww-perl: http://www.linpro.no/lwp/
+#     * TreeBuilder: HTML-Tree-0.51.tar.gz
+#  * Python Parsers, Formatters, and Writers don't really provide the right
+#    interface (and the version in Grail doesn't correspond to another
+#    distributed version, so I'm confused about which to be using).  I could
+#    write something in Python that creates a parse tree, but why bother?
+
+# Other implementation language issues:
+#  * Python lacks variable declarations, reasonable scoping, and static
+#    checking tools.  I've written some of the latter for myself that make
+#    my Perl programming a lot safer than my Python programming will be until
+#    I have a similar suite for that language.
+
+
+###########################################################################
+### To do
+###
+
+# Section names:
+#   Fix the problem with multiple sections in a single file (eg, Abstract in
+#     Front Matter section).
+#   Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
+# Index:
+#   Perhaps double-check that every tag mentioned in the index is found
+#     in the text.
+# Python:  email to python-docs@python.org, to get their feedback.
+#   Compare to existing lib/ Info manual
+#   Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
+#   Postpass to remove extra quotation marks around typography already in
+#     a different font (to avoid double delimiters as in "`code'"); or
+#     perhaps consider using only font-based markup so that we don't get
+#     the extra *bold* and `code' markup in Info.
+
+## Perhaps don't rely on automatic means for adding up, next, prev; I have
+## all that info available to me already, so it's not so much trouble to
+## add it.  (Right?)  But it is *so* easy to use Emacs instead...
+
+
+###########################################################################
+### Strictures
+###
+
+# man HTML::TreeBuilder
+# man HTML::Parser
+# man HTML::Element
+
+# require HTML::ParserWComment;
+require HTML::Parser;
+require HTML::TreeBuilder;
+require HTML::Element;
+
+use File::Basename;
+use Cwd;
+
+use strict;
+# use Carp;
+
+
+use checkargs;
+
+
+###########################################################################
+### Variables
+###
+
+my @section_stack = ();		# elements are chapter/section/subsec nodetitles (I think)
+my $current_ref_tdf;		# for the file currently being processed;
+				#  used in error messages
+my $html_directory;
+my %footnotes;
+
+# First element should not be used.
+my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection");
+
+my %inline_markup = ("b" => "strong",
+		     "code" => "code",
+		     "i" => "emph",
+		     "kbd" => "kbd",
+		     "samp" => "samp",
+		     "strong" => "strong",
+		     "tt" => "code",
+		     "var" => "var");
+
+my @deferred_index_entries = ();
+
+my @index_titles = ();		# list of (filename, type) lists
+my %index_info = ("Index" => ["\@blindex", "bl"],
+		  "Concept Index" => ["\@cindex", "cp"],
+		  "Module Index" => ["\@mdindex", "md"]);
+
+
+###########################################################################
+### Main/contents page
+###
+
+# Process first-level page on its own, or just a contents page?  Well, I do
+# want the title, author, etc., and the front matter...  For now, just add
+# that by hand at the end.
+
+
+# data structure possibilities:
+#  * tree-like (need some kind of stack when processing (or parent pointers))
+#  * list of name and depth; remember old and new depths.
+
+# Each element is a reference to a list of (nodetitle, depth, filename).
+my @contents_list = ();
+
+# The problem with doing fixups on the fly is that some sections may have
+# already been processed (and no longer available) by the time we notice
+# others with the same name.  It's probably better to fully construct the
+# contents list (reading in all files of interest) upfront; that will also
+# let me do a better job with cross-references, because again, all files
+# will already be read in.
+my %contents_hash = ();
+my %contents_fixups = ();
+
+my @current_contents_list = ();
+
+# Merge @current_contents_list into @contents_list,
+# and set @current_contents_list to be empty.
+# Entries of both lists are references to (title, depth, file) lists.
+# Every title added to @contents_list is recorded in %contents_hash; a title
+# that is added a second time is flagged in %contents_fixups instead.
+# Dies if @current_contents_list is empty, or if an entry conflicts with the
+# entry already at the same index of @contents_list.
+sub merge_contents_lists ( )
+{ check_args(0, @_);
+
+  # Three possibilities:
+  #  * @contents_list is empty: replace it by @current_contents_list.
+  #  * prefixes of the two lists are identical: do nothing
+  #  * @current_contents_list is all at lower level than $contents_list[0];
+  #    prefix @contents_list by @current_contents_list
+
+  if (scalar(@current_contents_list) == 0)
+    { die "empty current_contents_list"; }
+
+  #   if (scalar(@contents_list) == 0)
+  #     { @contents_list = @current_contents_list;
+  #       @current_contents_list = ();
+  #       return; }
+
+  #   if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1]))
+  #     { unshift @contents_list, @current_contents_list;
+  #       @current_contents_list = ();
+  #       return; }
+
+  # Walk the two lists in parallel, index by index.
+  for (my $i=0; $i<scalar(@current_contents_list); $i++)
+    { my $ref_c_tdf = $current_contents_list[$i];
+      # Past the end of @contents_list: append the new entry.
+      if ($i >= scalar(@contents_list))
+	{ push @contents_list, $ref_c_tdf;
+	  my $title = $ {$ref_c_tdf}[0];
+	  if (defined $contents_hash{$title})
+	    { $contents_fixups{$title} = 1; }
+	  else
+	    { $contents_hash{$title} = 1; }
+	  next; }
+      my $ref_tdf = $contents_list[$i];
+      my ($title, $depth, $file) = @{$ref_tdf};
+      my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
+
+      # The new entry differs in title and file and is deeper than the
+      # existing entry at this index: insert it in front of that entry.
+      if (($title ne $c_title)
+	  && ($depth < $c_depth)
+	  && ($file ne $c_file))
+	{ splice @contents_list, $i, 0, $ref_c_tdf;
+	  if (defined $contents_hash{$c_title})
+	    { $contents_fixups{$c_title} = 1; }
+	  else
+	    { $contents_hash{$c_title} = 1; }
+	  next; }
+
+      # Otherwise the two entries must agree in every field.
+      if (($title ne $c_title)
+	  || ($depth != $c_depth)
+	  || ($file ne $c_file))
+	{ die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
+	       "\n  main:  <<<$title>>> $depth $file",
+	       "\n  curr:  <<<$c_title>>> $c_depth $c_file"); }
+    }
+  @current_contents_list = ();
+}
+
+
+
+# Set @current_contents_list to a list of (title, depth, href) entries, one
+# per <LI> element found under $he; then merge that list into @contents_list.
+# Entries whose title is a key of %index_info are handed to
+# process_index_file and removed from the list before the merge.
+# Maybe this function should also produce a map
+#  from title (or href) to sectionlevel (eg "chapter"?).
+sub process_child_links ( $ )
+{ my ($he) = check_args(1, @_);
+
+  # $he->dump;
+  if (scalar(@current_contents_list) != 0)
+    { die "current_contents_list nonempty: @current_contents_list"; }
+  $he->traverse(\&increment_current_contents_list, 'ignore text');
+
+  # Normalize the depths: renumber the distinct depths consecutively,
+  # starting one below the current depth (for instance, at current depth 0,
+  # convert 1,3,5 into 1,2,3).
+  my %depths = ();
+  for my $ref_tdf (@current_contents_list)
+    { $depths{$ {$ref_tdf}[1]} = 1; }
+  # Depths are numbers, so sort numerically; the default string sort would
+  # place 10 before 2.
+  my @sorted_depths = sort { $a <=> $b } keys %depths;
+  my $current_depth = scalar(@section_stack)-1;
+  my $current_depth_2 = $ {$current_ref_tdf}[1];
+  if ($current_depth != $current_depth_2)
+    { die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
+  for (my $i=0; $i<scalar(@sorted_depths); $i++)
+    { $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
+  for my $ref_tdf (@current_contents_list)
+    { $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; }
+
+  # Eliminate uninteresting sections.  Hard-coded hack for now.
+  # (Guard against an empty list, so that merge_contents_lists gets to
+  # report "empty current_contents_list" itself.)
+  if ((scalar(@current_contents_list) > 0)
+      && ($ {$current_contents_list[-1]}[0] eq "About this document ..."))
+    { pop @current_contents_list; }
+  # Drop a "Contents" entry in second position by overwriting it with the
+  # first entry.
+  if ((scalar(@current_contents_list) > 1)
+      && ($ {$current_contents_list[1]}[0] eq "Contents"))
+    { my $ref_first_tdf = shift @current_contents_list;
+      $current_contents_list[0] = $ref_first_tdf; }
+
+  # Process index pages now and take them out of the contents list.
+  for (my $i=0; $i<scalar(@current_contents_list); $i++)
+    { my $ref_tdf = $current_contents_list[$i];
+      my $title = $ {$ref_tdf}[0];
+      if (exists $index_info{$title})
+        { my $index_file = $ {$ref_tdf}[2];
+          my ($indexing_command, $suffix) = @{$index_info{$title}};
+          process_index_file($index_file, $indexing_command);
+          print TEXI "\n\@defindex $suffix\n";
+          push @index_titles, $title;
+          splice @current_contents_list, $i, 1;
+          # Re-examine this index, which now holds the following entry.
+          $i--; }
+      elsif ($title =~ /\bIndex$/)
+        { print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }
+
+  merge_contents_lists();
+
+  # print_contents_list();
+  # print_index_info();
+}
+
+
+# Traversal callback (see process_child_links): for the start of each <LI>
+# element, append a [title, depth, href] entry to @current_contents_list.
+# The first child of the <LI> must be an <A> element; its text becomes the
+# title and its HREF the file.  End-of-element calls are ignored.
+# Returns 1 for start calls (presumably so that the traversal descends into
+# the element's children -- confirm against HTML::Element).
+sub increment_current_contents_list ( $$$ )
+{ my ($he, $startflag, $depth) = check_args(3, @_);
+  if (!$startflag)
+    { return; }
+
+  if ($he->tag eq "li")
+    { my @li_content = @{$he->content};
+      # The first child may be missing or a plain text string; test for a
+      # reference first so that the diagnostic below is what gets reported,
+      # rather than a failed method call on a non-object.
+      if ((!ref $li_content[0]) || ($li_content[0]->tag ne "a"))
+        { die "first element of <LI> should be <A>"; }
+      my ($name, $href, @content) = anchor_info($li_content[0]);
+      # unused $name
+      my $title = join("", collect_texts($li_content[0]));
+      $title = texi_remove_punctuation($title);
+      # The problem with these is that they are formatted differently in
+      # @menu and @node!
+      $title =~ s/``/\"/g;
+      $title =~ s/''/\"/g;
+      $title =~ s/ -- / /g;
+      push @current_contents_list, [ $title, $depth, $href ]; }
+  return 1;
+}
+
+# Simple version for section titles.
+# Convert $he into Texinfo text.  A plain string is returned unchanged; an
+# element whose tag is a key of %inline_markup becomes the corresponding
+# @command{...} wrapped around its recursively converted content.  Any other
+# element is dumped and is fatal.
+sub html_to_texi ( $ )
+{ my ($he) = check_args(1, @_);
+  return $he if (!ref $he);
+
+  my $tag = $he->tag;
+  if (!exists $inline_markup{$tag})
+    { $he->dump;
+      die "html_to_texi confused by <$tag>"; }
+
+  my $texi = "\@" . $inline_markup{$tag} . "\{";
+  for my $child (@{$he->content})
+    { $texi .= html_to_texi($child); }
+  return $texi . "\}";
+}
+
+
+
+# Debugging aid: write each @contents_list entry to STDERR as
+# "title depth file", one entry per line, under a "Contents list:" heading.
+sub print_contents_list ()
+{ check_args(0, @_);
+  print STDERR "Contents list:\n";
+  for my $entry (@contents_list)
+    { my ($entry_title, $entry_depth, $entry_file) = @{$entry};
+      print STDERR $entry_title, " ", $entry_depth, " ", $entry_file, "\n"; }
+}
+
+
+
+###########################################################################
+### Index
+###
+
+my $l2h_broken_link_name = "l2h-";
+
+
+# map from file to (map from anchor name to (list of index texts))
+# (The list is needed when a single LaTeX command like \envvar
+# expands to multiple \index commands.)
+my %file_index_entries = ();
+my %this_index_entries;		# map from anchor name to (list of index texts)
+
+my %file_index_entries_broken = (); # map from file to (list of index texts)
+my @this_index_entries_broken;
+
+my $index_prefix = "";
+my @index_prefixes = ();
+
+my $this_indexing_command;
+
+# Debugging aid: dump the collected index information to STDERR.
+# For each file in %file_index_entries, print every anchor name with its
+# index texts (additional texts for one anchor are aligned under the first);
+# then, for each file in %file_index_entries_broken, print its index texts.
+sub print_index_info ()
+{ check_args(0, @_);
+  for my $file (sort keys %file_index_entries)
+    { # Read through the reference; there is no need to copy the hash.
+      my $ref_index_entries = $file_index_entries{$file};
+      print STDERR "file: $file\n";
+      for my $aname (sort keys %{$ref_index_entries})
+        { my @entries = @{$ {$ref_index_entries}{$aname}};
+          if (scalar(@entries) == 1)
+            { print STDERR "  $aname : $entries[0]\n"; }
+          else
+            { # Continuation lines are indented to line up with the first text.
+              print STDERR "  $aname : ", join("\n     " . (" " x length($aname)), @entries), "\n"; } } }
+  for my $file (sort keys %file_index_entries_broken)
+    { my @entries = @{$file_index_entries_broken{$file}};
+      print STDERR "file: $file\n";
+      for my $entry (@entries)
+        { print STDERR "  $entry\n"; }
+    }
+}
+
+
+# Parse the index page $file (relative to $html_directory) and record its
+# entries by traversing the tree with process_if_index_dl_compact.
+# $indexing_command is published in $this_indexing_command for the duration
+# of the traversal and cleared again afterwards.
+sub process_index_file ( $$ )
+{ my ($file, $indexing_command) = check_args(2, @_);
+  # print "process_index_file $file $indexing_command\n";
+
+  my $index_tree = file_to_tree("$html_directory$file");
+  # $index_tree->dump();
+
+  $this_indexing_command = $indexing_command;
+  $index_tree->traverse(\&process_if_index_dl_compact, 'ignore text');
+  $this_indexing_command = undef;
+  # print "process_index_file done\n";
+}
+
+
+# Traversal callback: when $he is the start of a <DL> element that has a
+# COMPACT attribute, process it as an index list and return 0; for the start
+# of any other element return 1.  End-of-element calls return nothing.
+sub process_if_index_dl_compact ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+  return if (!$startflag);
+
+  # Anything but <DL COMPACT>: nothing to do at this element.
+  if (($he->tag() ne "dl") || (!defined $he->attr('compact')))
+    { return 1; }
+
+  process_index_dl_compact($he);
+  return 0;
+}
+
+
+# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
+#  * a single space: text to be ignored
+#  * <DT> elements with an optional <DD> element following each one
+#    Two types of <DT> elements:
+#     * Followed by a <DD> element:  the <DT> contains a single
+#       string, and the <DD> contains a whitespace string to be ignored, a
+#       <DL COMPACT> to be recursively processed (with the <DT> string as a
+#       prefix), and a whitespace string to be ignored.
+#     * Not followed by a <DD> element:  contains a list of anchors
+#       and texts (ignore the texts, which are only whitespace and commas).
+#       Optionally contains a <DL COMPACT> to be recursively processed (with
+#       the <DT> string as a prefix)
+# Walk the children of the <DL COMPACT> element $h.  Every child reached
+# must be a <DT>; a <DT> immediately followed by a <DD> is processed together
+# with that <DD>, any other <DT> is processed on its own.
+sub process_index_dl_compact ( $ )
+{ my ($h) = check_args(1, @_);
+  my @content = @{$h->content()};
+  my $i = 0;
+  while ($i < scalar(@content))
+    { my $dt = $content[$i];
+      if ($dt->tag ne "dt")
+        { $dt->dump;
+          die "Expected <DT> tag: " . $dt->tag; }
+      if (($i + 1 < scalar(@content)) && ($content[$i+1]->tag eq "dd"))
+        { process_index_dt_and_dd($dt, $content[$i+1]);
+          # Skip over the <DD> that was just consumed as well.
+          $i += 2; }
+      else
+        { process_index_lone_dt($dt);
+          $i++; } }
+}
+
+
+
+# Argument is a <DT> element.  If it contains more than one anchor, then
+# the texts of all subsequent ones are "[Link]".  Example:
+#       <DT>
+#         <A HREF="embedding.html#l2h-201">
+#           "$PATH"
+#         ", "
+#         <A HREF="embedding.html#l2h-205">
+#           "[Link]"
+# Optionally contains a <DL COMPACT> as well.  Example:
+# <DT>
+#   <A HREF="types.html#l2h-616">
+#     "attribute"
+#   <DL COMPACT>
+#     <DT>
+#       <A HREF="assignment.html#l2h-3074">
+#         "assignment"
+#       ", "
+#       <A HREF="assignment.html#l2h-3099">
+#         "[Link]"
+#     <DT>
+#       <A HREF="types.html#l2h-">
+#         "assignment, class"
+
+# Record the index entries of a <DT> element that has no following <DD>.
+# Each anchor in $dt contributes "$this_indexing_command <entry text>" to
+# %file_index_entries (keyed by the HREF's file, then by its anchor name), or
+# to %file_index_entries_broken when the anchor name is the broken
+# LaTeX2HTML name in $l2h_broken_link_name.  A nested <DL> is processed
+# recursively with the entry text appended to $index_prefix, and ends the
+# processing of this <DT>.  Unexpected content is fatal.
+sub process_index_lone_dt ( $ )
+{ my ($dt) = check_args(1, @_);
+  my @dtcontent = @{$dt->content()};
+  my $acontent;
+  my $acontent_suffix;
+  # (The loop variable is deliberately not named $a, which is special to sort.)
+  for my $elt (@dtcontent)
+    { if ($elt eq ", ")
+        { next; }
+      if (!ref $elt)
+        { $dt->dump;
+          die "Unexpected <DT> string element: $elt"; }
+
+      if ($elt->tag eq "dl")
+        { push @index_prefixes, $index_prefix;
+          if (!defined $acontent_suffix)
+            { die "acontent_suffix not yet defined"; }
+          $index_prefix .= $acontent_suffix . ", ";
+          process_index_dl_compact($elt);
+          $index_prefix = pop(@index_prefixes);
+          return; }
+
+      if ($elt->tag ne "a")
+        { $dt->dump;
+          $elt->dump;
+          die "Expected anchor in lone <DT>"; }
+
+      my ($aname, $ahref, @acontent) = anchor_info($elt);
+      # unused $aname
+      if (scalar(@acontent) != 1)
+        { die "Expected just one content of <A> in <DT>: @acontent"; }
+      if (ref $acontent[0])
+        { $acontent[0]->dump;
+          die "Expected string content of <A> in <DT>: $acontent[0]"; }
+      # The first anchor supplies the entry text; later anchors must either
+      # read "[Link]" or repeat that text.
+      if (!defined($acontent))
+        { $acontent = $index_prefix . $acontent[0];
+          $acontent_suffix = $acontent[0]; }
+      elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
+        { die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }
+
+      if (!defined $ahref)
+        { $dt->dump;
+          die "no HREF in anchor in <DT>"; }
+      my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
+      if (!defined $ahref_name)
+        { # Reference to entire file
+          $ahref_name = ""; }
+
+      if ($ahref_name eq $l2h_broken_link_name)
+        { if (!exists $file_index_entries_broken{$ahref_file})
+            { $file_index_entries_broken{$ahref_file} = []; }
+          push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
+          next; }
+
+      if (!exists $file_index_entries{$ahref_file})
+        { $file_index_entries{$ahref_file} = {}; }
+      # Don't do this!  It appears to make a copy, which is not desired.
+      # my %index_entries = %{$file_index_entries{$ahref_file}};
+      if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name})
+        { $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; }
+      #         { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
+      #           if ($acontent eq $oldcontent)
+      #             { die "Multiple identical index entries?"; }
+      #           die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
+
+      push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent";
+      # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
+    }
+}
+
+# Process a <DT> together with the <DD> that follows it.  The <DT> must hold
+# exactly one text string and the <DD> exactly one <DL> element; the <DL> is
+# processed recursively with the <DT> text (trailing spaces removed) and
+# ", " appended to $index_prefix, which is restored afterwards.
+sub process_index_dt_and_dd ( $$ )
+{ my ($dt, $dd) = check_args(2, @_);
+
+  my @dt_parts = @{$dt->content()};
+  if ((scalar(@dt_parts) != 1) || (ref $dt_parts[0]))
+    { $dd->dump;
+      $dt->dump;
+      die "Expected single string (actual size = " . scalar(@dt_parts) . ") in content of <DT>: @dt_parts"; }
+  my $prefix_text = $dt_parts[0];
+  $prefix_text =~ s/ +$//;
+
+  my @dd_parts = @{$dd->content()};
+  if (scalar(@dd_parts) != 1)
+    { die "Expected single <DD> content, got ", scalar(@dd_parts), " elements:\n", join("\n", @dd_parts), "\n "; }
+  my $nested_dl = $dd_parts[0];
+  if ($nested_dl->tag ne "dl")
+    { die "Expected <DL> as content of <DD>, but saw: $nested_dl"; }
+
+  # Extend the prefix only for the duration of the nested list.
+  push @index_prefixes, $index_prefix;
+  $index_prefix = $index_prefix . $prefix_text . ", ";
+  process_index_dl_compact($nested_dl);
+  $index_prefix = pop(@index_prefixes);
+}
+
+
+###########################################################################
+### Ordinary sections
+###
+
+# Convert one section's HTML file to Texinfo output.
+# $file is used as given when it starts with "/", otherwise it is taken
+# relative to $html_directory; $depth is the section's depth and $nodetitle
+# its node title.  The sub updates @section_stack, collects child links and
+# footnotes from the parsed tree, loads the index entries recorded for
+# $file, and finally traverses the <BODY> with output_body.
+# Dies unless the tree is <HTML> containing <HEAD> then <BODY>.
+sub process_section_file ( $$$ )
+{ my ($file, $depth, $nodetitle) = check_args(3, @_);
+  my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file);
+
+  # print STDERR "process_section_file: $file $depth $nodetitle\n";
+
+  # Keep only the first $depth entries of the stack.  When the stack already
+  # holds at least $depth entries this is equivalent to:
+  #   while (scalar(@section_stack) > $depth) { pop(@section_stack); }
+  @section_stack = @section_stack[0..$depth-1];
+
+  # Not a great nodename fixup scheme; need a more global view
+  # A title flagged in %contents_fixups is prefixed with the first word of
+  # the enclosing section's title.
+  if ((defined $contents_fixups{$nodetitle})
+      && (scalar(@section_stack) > 0))
+    { my $up_title = $section_stack[$#section_stack];
+      # hack for Python Standard Library
+      $up_title =~ s/^(Built-in|Standard) Module //g;
+      my ($up_first_word) = split(/ /, $up_title);
+      $nodetitle = "$up_first_word $nodetitle";
+    }
+
+  push @section_stack, $nodetitle;
+  # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";
+
+  $he->traverse(\&process_if_child_links, 'ignore text');
+  # Footnotes are collected afresh for each file.
+  %footnotes = ();
+  # $he->dump;
+  $he->traverse(\&process_if_footnotes, 'ignore text');
+
+  # $he->dump;
+
+  # Load the index entries recorded for this file (keyed by anchor name).
+  if (exists $file_index_entries{$file})
+    { %this_index_entries = %{$file_index_entries{$file}};
+      # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
+    }
+  else
+    { # print STDERR "Warning: no index entries for file $file\n";
+      %this_index_entries = (); }
+
+  # Likewise for the entries whose anchor name was broken.
+  if (exists $file_index_entries_broken{$file})
+    { @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
+  else
+    { # print STDERR "Warning: no index entries for file $file\n";
+      @this_index_entries_broken = (); }
+
+
+  if ($he->tag() ne "html")
+    { die "Expected <HTML> at top level"; }
+  my @content = @{$he->content()};
+  if ((!ref $content[0]) or ($content[0]->tag ne "head"))
+    { $he->dump;
+      die "<HEAD> not first element of <HTML>"; }
+  if ((!ref $content[1]) or ($content[1]->tag ne "body"))
+    { $he->dump;
+      die "<BODY> not second element of <HTML>"; }
+
+  # Text nodes are included in this traversal (no 'ignore text').
+  $content[1]->traverse(\&output_body);
+}
+
+# stack of things we're inside that are preventing indexing from occurring now.
+# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
+my @index_deferrers = ();
+
+# Maintain @index_deferrers as elements are entered and left.  At the start
+# of an element push $tag; at its end pop the stack, die unless the popped
+# tag is $tag, and then emit any deferred index entries that may now go out.
+sub push_or_pop_index_deferrers ( $$ )
+{ my ($tag, $startflag) = check_args(2, @_);
+  if (!$startflag)
+    { my $popped_tag = pop @index_deferrers;
+      if ($tag ne $popped_tag)
+        { die "Expected $tag at top of index_deferrers but saw $popped_tag; remainder = ", join(" ", @index_deferrers); }
+      do_deferred_index_entries(); }
+  else
+    { push @index_deferrers, $tag; }
+}
+
+
+# Queue in @deferred_index_entries the index entries that belong at the
+# anchor named $label ($he, optional, is the anchor element itself).
+# If %this_index_entries knows $label, its entries are queued directly.
+# Otherwise, if $label is the broken LaTeX2HTML name, guess: every entry of
+# @this_index_entries_broken that contains one of the nearby texts is
+# queued.  Any other label adds nothing.
+sub label_add_index_entries ( $;$ )
+{ my ($label, $he) = check_args_range(1, 2, @_);
+  # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
+  # $he is the anchor element
+  if (exists $this_index_entries{$label})
+    { push @deferred_index_entries, @{$this_index_entries{$label}};
+      return; }
+
+  if ($label eq $l2h_broken_link_name)
+    { # Try to find some text to use in guessing which links should point here
+      # I should probably only look at the previous element, or if that is
+      # all punctuation, the one before it; collecting all the previous texts
+      # is a bit of overkill.
+      my @anchor_texts = collect_texts($he);
+      my @previous_texts = collect_texts($he->parent, $he);
+      # 4 elements is arbitrary; ought to filter out punctuation and small words
+      # first, then perhaps keep fewer.  Perhaps also filter out formatting so
+      # that we can see a larger chunk of text?  (Probably not.)
+      # Also perhaps should do further chunking into words, in case the
+      # index term isn't a chunk of its own (eg, was in <tt>...</tt>).
+      # Candidates: the anchor's own texts, then up to four of the preceding
+      # texts, nearest first.
+      my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);
+
+      my $guessed = 0;
+      for my $text (@candidate_texts)
+	{ # my $orig_text = $text;
+	  # Skip texts that consist only of punctuation and spaces.
+	  if ($text =~ /^[\"\`\'().?! ]*$/)
+	    { next; }
+	  # Skip texts too short to be a useful search key.
+	  if (length($text) <= 2)
+	    { next; }
+	  # hack for Python manual; maybe defer until failure first time around?
+	  $text =~ s/^sys\.//g;
+	  for my $iterm (@this_index_entries_broken)
+	    { # I could test for zero:  LaTeX2HTML's failures in the Python
+	      # documentation are only for items of the form "... (built-in...)"
+	      if (index($iterm, $text) != -1)
+		{ push @deferred_index_entries, $iterm;
+		  # print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
+		  $guessed = 1;
+		} } }
+      if (!$guessed)
+	{ # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
+	}
+    }
+}
+
+
+# Need to add calls to this at various places.
+# Perhaps add HTML::Element argument and do the check for appropriateness
+# here (ie, no action if inside <H1>, etc.).
+sub do_deferred_index_entries ()
+{ check_args(0, @_);
+  # Flush the queued index lines to TEXI, but only when something is
+  # queued and nothing is currently registered in @index_deferrers.
+  my $pending = scalar(@deferred_index_entries);
+  my $deferring = scalar(@index_deferrers);
+  if (($pending > 0) && ($deferring == 0))
+    { my $lines = join("\n", @deferred_index_entries);
+      print TEXI "\n", $lines, "\n";
+      @deferred_index_entries = (); }
+}
+
+# State shared between the <table>, <tr>, <td> and <th> branches of
+# output_body below.
+my $table_columns;		# undefined if not in a table
+my $table_first_column;		# boolean
+
+# Tree-traversal callback that writes the Texinfo rendering of one node to
+# the TEXI filehandle.  It is passed to HTML::Element's traverse (see the
+# footnote branch below, which re-enters it for the stored footnote body).
+# Arguments: the node ($he, either a plain text string or an element), a
+# start/end flag ($startflag), and a depth argument that is ignored.
+# Returns 1 at the end of the normal path and 0 from the branches that have
+# already emitted (or deliberately dropped) the element's whole content;
+# presumably 0 tells traverse not to descend -- TODO confirm against the
+# HTML::Element documentation.  Text nodes return an empty value.
+sub output_body ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+
+  # Plain text: quote it for Texinfo.  Any deferred index entries are
+  # emitted right after the first space in the text, if there is one.
+  if (!ref $he)
+    { my $space_index = index($he, " ");
+      if ($space_index != -1)
+	{ # Why does
+	  #   print TEXI texi_quote(substr($he, 0, $space_index+1));
+	  # give:  Can't locate object method "TEXI" via package "texi_quote"
+	  # (Because the definition texi_quote hasn't been seen yet.)
+	  print TEXI &texi_quote(substr($he, 0, $space_index+1));
+	  do_deferred_index_entries();
+	  print TEXI &texi_quote(substr($he, $space_index+1)); }
+      else
+	{ print TEXI &texi_quote($he); }
+      return; }
+
+  my $tag = $he->tag();
+
+  # Ordinary text markup first
+  if (exists $inline_markup{$tag})
+    { if ($startflag)
+	{ print TEXI "\@$inline_markup{$tag}\{"; }
+      else
+	{ print TEXI "\}"; } }
+  elsif ($tag eq "a")
+    { my ($name, $href, @content) = anchor_info($he);
+      if (!$href)
+	{ # This anchor is only here for indexing/cross referencing purposes.
+	  if ($startflag)
+	    { label_add_index_entries($name, $he); }
+	}
+      elsif ($href =~ "^(ftp|http|news):")
+	{ if ($startflag)
+	    { # Should avoid second argument if it's identical to the URL.
+	      print TEXI "\@uref\{$href, "; }
+	  else
+	    { print TEXI "\}"; }
+	}
+      elsif ($href =~ /^\#(foot[0-9]+)$/)
+	{ # Footnote
+	  if ($startflag)
+	    { # Could double-check name and content, but I'm not
+	      # currently storing that information.
+	      print TEXI "\@footnote\{";
+	      $footnotes{$1}->traverse(\&output_body);
+	      print TEXI "\}";
+	      return 0; } }
+      else
+	{ if ($startflag)
+	    { $he->dump;
+	      warn "Can't deal with internal HREF anchors yet"; }
+	}
+    }
+  elsif ($tag eq "br")
+    { print TEXI "\@\n"; }
+  elsif ($tag eq "body")
+    { }
+  elsif ($tag eq "center")
+    { # A <center> holding only spaces produces no output at all.
+      if (has_single_content_string($he)
+	  && ($ {$he->content}[0] =~ /^ *$/))
+	{ return 0; }
+      if ($startflag)
+	{ print TEXI "\n\@center\n"; }
+      else
+	{ print TEXI "\n\@end center\n"; }
+    }
+  elsif ($tag eq "div")
+    { # Only <div align="center"> is rendered; it is treated like <center>.
+      my $align = $he->attr('align');
+      if (defined($align) && ($align eq "center"))
+	{ if (has_single_content_string($he)
+	      && ($ {$he->content}[0] =~ /^ *$/))
+	    { return 0; }
+	  if ($startflag)
+	    { print TEXI "\n\@center\n"; }
+	  else
+	    { print TEXI "\n\@end center\n"; } }
+    }
+  elsif ($tag eq "dl")
+    { # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
+      if (has_single_content_with_tag($he, "dd"))
+	{ my $he_dd = $ {$he->content}[0];
+	  if (has_single_content_with_tag($he_dd, "pre"))
+	    { my $he_pre = $ {$he_dd->content}[0];
+	      print_pre($he_pre);
+	      return 0; } }
+      if ($startflag)
+	{ # Could examine the elements, to be cleverer about formatting.
+	  # (Also to use ftable, vtable...)
+	  print TEXI "\n\@table \@asis\n"; }
+      else
+	{ print TEXI "\n\@end table\n"; }
+    }
+  elsif ($tag eq "dt")
+    { push_or_pop_index_deferrers($tag, $startflag);
+      if ($startflag)
+	{ print TEXI "\n\@item "; }
+      else
+	{ } }
+  elsif ($tag eq "dd")
+    { if ($startflag)
+	{ print TEXI "\n"; }
+      else
+	{ }
+      # No deferrer may still be active here; if one is, dump and abort.
+      if (scalar(@index_deferrers) != 0)
+	{ $he->dump;
+	  die "index deferrers: ", join(" ", @index_deferrers); }
+      do_deferred_index_entries();
+    }
+  elsif ($tag =~ /^(font|big|small)$/)
+    { # Do nothing for now.
+    }
+  elsif ($tag =~ /^h[1-6]$/)
+    { # We don't need this because we never recursively enter the heading content.
+      # push_or_pop_index_deferrers($tag, $startflag);
+      # Build the section name from the heading's children and collect the
+      # NAMEs of any anchors found directly inside the heading.
+      my $secname = "";
+      my @seclabels = ();
+      for my $elt (@{$he->content})
+	{ if (!ref $elt)
+	    { $secname .= $elt; }
+	  elsif ($elt->tag eq "br")
+	    { }
+	  elsif ($elt->tag eq "a")
+	    { my ($name, $href, @acontent) = anchor_info($elt);
+              if ($href)
+                { $he->dump;
+                  $elt->dump;
+                  die "Nonsimple anchor in <$tag>"; }
+	      if (!defined $name)
+		{ die "No NAME for anchor in $tag"; }
+	      push @seclabels, $name;
+	      for my $subelt (@acontent)
+		{ $secname .= html_to_texi($subelt); } }
+	  else
+	    { $secname .= html_to_texi($elt); } }
+      if ($secname eq "")
+	{ die "No section name in <$tag>"; }
+      # A section stack of depth one must be the "Top" node: finish the
+      # Texinfo header with @settitle.  Otherwise emit @node plus the
+      # sectioning command looked up in @sectionmarker by stack depth.
+      if (scalar(@section_stack) == 1)
+	{ if ($section_stack[-1] ne "Top")
+	    { die "Not top? $section_stack[-1]"; }
+	  print TEXI "\@settitle $secname\n";
+	  print TEXI "\@c %**end of header\n";
+	  print TEXI "\n";
+	  print TEXI "\@node Top\n";
+	  print TEXI "\n"; }
+      else
+	{ print TEXI "\n\@node $section_stack[-1]\n";
+	  print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
+      for my $seclabel (@seclabels)
+	{ label_add_index_entries($seclabel); }
+      # This should only happen once per file.
+      label_add_index_entries("");
+      if (scalar(@index_deferrers) != 0)
+	{ die "index deferrers: ", join(" ", @index_deferrers); }
+      do_deferred_index_entries();
+      return 0;
+    }
+  elsif ($tag eq "hr")
+    { }
+  elsif ($tag eq "ignore")
+    { # Hack for ignored elements
+      return 0;
+    }
+  elsif ($tag eq "li")
+    { if ($startflag)
+	{ print TEXI "\n\n\@item\n";
+	  do_deferred_index_entries(); } }
+  elsif ($tag eq "ol")
+    { # NOTE(review): "@enumerate @bullet" looks copied from the <ul> branch;
+      # confirm that makeinfo accepts @bullet as an @enumerate argument.
+      if ($startflag)
+	{ print TEXI "\n\@enumerate \@bullet\n"; }
+      else
+	{ print TEXI "\n\@end enumerate\n"; } }
+  elsif ($tag eq "p")
+    { if ($startflag)
+	{ print TEXI "\n\n"; }
+      if (scalar(@index_deferrers) != 0)
+	{ die "index deferrers: ", join(" ", @index_deferrers); }
+      do_deferred_index_entries(); }
+  elsif ($tag eq "pre")
+    { print_pre($he);
+      return 0; }
+  elsif ($tag eq "table")
+    { # Could also indicate common formatting for first column, or
+      # determine relative widths for columns (or determine a prototype row)
+      # Two-column tables become "@table @asis"; wider ones become a
+      # @multitable with equal column fractions.  Nested tables are fatal.
+      if ($startflag)
+	{ if (defined $table_columns)
+	    { $he->dump;
+	      die "Can't deal with table nested inside $table_columns-column table"; }
+	  $table_columns = table_columns($he);
+	  if ($table_columns < 2)
+	    { $he->dump;
+	      die "Column with $table_columns columns?"; }
+	  elsif ($table_columns == 2)
+	    { print TEXI "\n\@table \@asis\n"; }
+	  else
+	    { print TEXI "\n\@multitable \@columnfractions";
+	      for (my $i=0; $i<$table_columns; $i++)
+		{ print TEXI " ", 1.0/$table_columns; }
+	      print TEXI "\n"; } }
+      else
+	{ if ($table_columns == 2)
+	    { print TEXI "\n\@end table\n"; }
+	  else
+	    { print TEXI "\n\@end multitable\n"; }
+	  undef $table_columns; } }
+  elsif (($tag eq "td") || ($tag eq "th"))
+    { # The first cell of a row starts an @item; later cells get @tab only
+      # in the multitable (more than two columns) case.
+      if ($startflag)
+	{ if ($table_first_column)
+	    { print TEXI "\n\@item ";
+	      $table_first_column = 0; }
+	  elsif ($table_columns > 2)
+	    { print TEXI "\n\@tab "; } }
+      else
+	{ print TEXI "\n"; } }
+  elsif ($tag eq "tr")
+    { if ($startflag)
+	{ $table_first_column = 1; } }
+  elsif ($tag eq "ul")
+    { if ($startflag)
+	{ print TEXI "\n\@itemize \@bullet\n"; }
+      else
+	{ print TEXI "\n\@end itemize\n"; } }
+  else
+    { # Unknown tag: report it on STDERR, dump the element, and return 0.
+      print STDERR "\nBailing out\n";
+      $he->dump;
+      return 0; }
+
+  return 1;
+}
+
+# Emit a <PRE> element as a Texinfo @example block on TEXI.
+# The element must contain exactly one plain string; anything else is fatal.
+sub print_pre ( $ )
+{ my ($he_pre) = check_args(1, @_);
+  has_single_content_string($he_pre)
+    or die "Multiple or non-string content for <PRE>: ", @{$he_pre->content};
+  my $raw_text = $he_pre->content->[0];
+  print TEXI "\n\@example";
+  # Keep the explicit & form: texi_quote is defined later in the file.
+  print TEXI &texi_quote($raw_text), "\@end example\n";
+}
+
+# Return the number of columns of TABLE: the largest number of children
+# found in any of its rows.  Every child of TABLE must be a <TR> element;
+# otherwise the offending elements are dumped and the program dies.
+# A table or row with no content (undefined content reference) counts as
+# zero columns instead of crashing on an undefined array reference.
+sub table_columns ( $ )
+{ my ($table) = check_args(1, @_);
+  my $result = 0;
+  my $ref_rows = $table->content;
+  if (!defined $ref_rows)
+    { return $result; }
+  for my $row (@{$ref_rows})
+    { # A plain text child has no ->tag method; report it like any non-<TR>.
+      if (!(ref $row) || ($row->tag ne "tr"))
+	{ $table->dump;
+	  if (ref $row)
+	    { $row->dump; }
+	  die "Expected <TR> as table row."; }
+      my $ref_cells = $row->content;
+      my $num_cells = (defined $ref_cells) ? scalar(@{$ref_cells}) : 0;
+      $result = max($result, $num_cells); }
+  return $result;
+}
+
+
+###########################################################################
+### Utilities
+###
+
+# Return the numerically smaller of two values (the second one on a tie).
+sub min ( $$ )
+{ my ($x, $y) = check_args(2, @_);
+  if ($x < $y)
+    { return $x; }
+  return $y;
+}
+
+# Return the numerically larger of two values (the second one on a tie).
+sub max ( $$ )
+{ my ($x, $y) = check_args(2, @_);
+  if ($x > $y)
+    { return $x; }
+  return $y;
+}
+
+# Parse the HTML file FILE into an HTML::TreeBuilder tree, run
+# cleanup_parse_tree over it, and return the tree.
+# Dies if the file cannot be parsed (for example, because it cannot be
+# opened) rather than silently returning an empty tree.
+sub file_to_tree ( $ )
+{ my ($file) = check_args(1, @_);
+
+  my $tree = HTML::TreeBuilder->new;
+  $tree->ignore_unknown(1);
+  # $tree->warn(1);
+  # parse_file returns undef, with $! set, when the file cannot be read.
+  if (!defined $tree->parse_file($file))
+    { die "Can't parse HTML file $file: $!"; }
+  cleanup_parse_tree($tree);
+  return $tree;
+}
+
+
+# Return 1 if the element has exactly one content item, 0 otherwise
+# (including when it has no content at all).  Dies on a non-reference.
+sub has_single_content ( $ )
+{ my ($he) = check_args(1, @_);
+  (ref $he)
+    or die "Non-reference argument: $he";
+  my $kids = $he->content;
+  return ((defined $kids) && (scalar(@{$kids}) == 1)) ? 1 : 0;
+}
+
+
+# Return true if the content of the element contains only one element itself,
+# and that inner element has the specified tag.
+sub has_single_content_with_tag ( $$ )
+{ my ($he, $tag) = check_args(2, @_);
+  return 0 unless has_single_content($he);
+  my ($only_child) = @{$he->content};
+  # A plain string child has no tag to compare.
+  return 0 unless (ref $only_child);
+  my $child_tag = $only_child->tag;
+  return 0 unless (defined $child_tag);
+  return $child_tag eq $tag;
+}
+
+# Return 1 if the element's content is exactly one plain string
+# (not a reference), 0 otherwise.
+sub has_single_content_string ( $ )
+{ my ($he) = check_args(1, @_);
+  return 0 unless has_single_content($he);
+  my ($only_child) = @{$he->content};
+  return (ref $only_child) ? 0 : 1;
+}
+
+
+# Return name, href, content.  First two may be undefined; third is an array.
+# I don't see how to determine if there are more attributes.
+sub anchor_info ( $ )
+{ my ($he) = check_args(1, @_);
+  unless ($he->tag eq "a")
+    { $he->dump;
+      die "passed non-anchor to anchor_info"; }
+  # An anchor without content yields an empty content list.
+  my $kids = $he->content;
+  my @content = (defined $kids) ? @{$kids} : ();
+  my $name = $he->attr('name');
+  my $href = $he->attr('href');
+  return ($name, $href, @content);
+}
+
+
+# Return TEXT made safe for Texinfo: "@", "{" and "}" get an "@" prefix,
+# and " -- " becomes " --- ".
+sub texi_quote ( $ )
+{ my ($text) = check_args(1, @_);
+  for ($text)
+    { s/([\@\{\}])/\@$1/g;
+      s/ -- / --- /g; }
+  return $text;
+}
+
+# Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
+sub texi_remove_punctuation ( $ )
+{ my ($text) = check_args(1, @_);
+
+  # The substitutions run in order on the same string: leading spaces,
+  # trailing spaces/colons, a leading section number, then all commas.
+  for ($text)
+    { s/^ +//g;
+      s/[ :]+$//g;
+      s/^[1-9][0-9.]* +//g;
+      s/,//g;
+      # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
+      # gets converted into " - ", just as "---" would be converted into
+      # " -- ", so the names end up differing.)
+      # s/:/ -- /g;
+      s/://g; }
+  return $text;
+}
+
+
+## Do not use this inside `traverse':  it throws off the traversal.  Use
+## html_replace_by_ignore or html_replace_by_meta instead.
+# Returns 1 if success, 0 if failure.
+# Remove element HE from the content list of PARENT (default: HE's own
+# parent) and clear HE's parent pointer.  Returns 1 on success; dies if HE
+# is not found among PARENT's content.
+sub html_remove ( $;$ )
+{ my ($he, $parent) = check_args_range(1, 2, @_);
+  $parent = $he->parent unless (defined $parent);
+  my $ref_siblings = $parent->content;
+  my @siblings = @{$ref_siblings};
+  for my $pos (0 .. $#siblings)
+    { next unless ($siblings[$pos] eq $he);
+      splice @{$ref_siblings}, $pos, 1;
+      $he->parent(undef);
+      return 1; }
+  die "Didn't find $he in $parent";
+}
+
+
+# Replace element ORIG by NEW in the content list of PARENT (default:
+# ORIG's own parent), fixing up both parent pointers.  Returns 1 on
+# success; dies if ORIG is not found among PARENT's content.
+sub html_replace ( $$;$ )
+{ my ($orig, $new, $parent) = check_args_range(2, 3, @_);
+  $parent = $orig->parent unless (defined $parent);
+  my $ref_siblings = $parent->content;
+  my @siblings = @{$ref_siblings};
+  for my $pos (0 .. $#siblings)
+    { next unless ($siblings[$pos] eq $orig);
+      $ref_siblings->[$pos] = $new;
+      $new->parent($parent);
+      $orig->parent(undef);
+      return 1; }
+  die "Didn't find $orig in $parent";
+}
+
+# Swap ORIG for a fresh, empty <meta> element via html_replace.
+sub html_replace_by_meta ( $;$ )
+{ my ($orig, $parent) = check_args_range(1, 2, @_);
+  my $placeholder = HTML::Element->new("meta");
+  $parent = $orig->parent unless (defined $parent);
+  return html_replace($orig, $placeholder, $parent);
+}
+
+# Swap ORIG for a fresh, empty <ignore> element via html_replace.
+sub html_replace_by_ignore ( $;$ )
+{ my ($orig, $parent) = check_args_range(1, 2, @_);
+  my $placeholder = HTML::Element->new("ignore");
+  $parent = $orig->parent unless (defined $parent);
+  return html_replace($orig, $placeholder, $parent);
+}
+
+
+
+###
+### Collect text elements
+###
+
+# File-level state shared by collect_texts and its callback collect_if_text.
+my @collected_texts;
+my $collect_texts_stoppoint;
+my $done_collecting;
+
+# Return the text strings found while traversing the tree under ROOT.
+# If STOP is given, collection ends when that element is reached.
+# The result is accumulated in file-level variables, which are reset on
+# every call.
+sub collect_texts ( $;$ )
+{ my ($root, $stop) = check_args_range(1, 2, @_);
+  # print STDERR "collect_texts: $root $stop\n";
+  @collected_texts = ();
+  ($collect_texts_stoppoint, $done_collecting) = ($stop, 0);
+  $root->traverse(\&collect_if_text); # process texts
+  # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
+  return @collected_texts;
+}
+
+# Traversal callback for collect_texts: record plain strings, and set the
+# done flag once the stop element is seen.  Returns 1 only for elements
+# seen while still collecting (other than the stop element); 0 otherwise.
+sub collect_if_text ( $$$ )
+{ my $he = (check_args(3, @_))[0]; #  ignore depth and startflag arguments
+  return 0 if ($done_collecting || !defined $he);
+  unless (ref $he)
+    { push @collected_texts, $he;
+      return 0; }
+  my $at_stop = (defined $collect_texts_stoppoint)
+    && ($he eq $collect_texts_stoppoint);
+  return 1 unless $at_stop;
+  $done_collecting = 1;
+  return 0;
+}
+
+
+###########################################################################
+### Clean up parse tree
+###
+
+# Run the cleanup passes over the tree, in order, and return the tree.
+# Each pass is a traversal that skips text nodes.
+sub cleanup_parse_tree ( $ )
+{ my ($he) = check_args(1, @_);
+  for my $pass (\&delete_if_navigation, \&delete_extra_spaces, \&merge_dl)
+    { $he->traverse($pass, 'ignore text'); }
+  return $he;
+}
+
+
+## Simpler version that deletes contents but not the element itself.
+# sub delete_if_navigation ( $$$ )
+# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
+#   if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
+#     { $he->delete();
+#       return 0; }
+#   else
+#     { return 1; }
+# }
+
+# Traversal callback: unlink and delete every <div class="navigation">.
+# Returns 0 for a deleted element, 1 for any other start tag, and an
+# empty value for end tags.
+sub delete_if_navigation ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+  return unless $startflag;
+
+  my $class = $he->attr('class');
+  unless (($he->tag() eq "div") && (defined $class) && ($class eq 'navigation'))
+    { return 1; }
+
+  # Work on the parent's real content array, not a copy of it.
+  my $ref_siblings = $he->parent()->content();
+  for my $pos (0 .. $#{$ref_siblings})
+    { next unless ($ref_siblings->[$pos] eq $he);
+      splice(@{$ref_siblings}, $pos, 1);
+      last; }
+  $he->delete();
+  return 0;
+}
+
+# Traversal callback: on each start tag, drop all-space children of the
+# listed container elements and then trim spaces via
+# delete_trailing_spaces.  Returns 1 for start tags, nothing for end tags.
+sub delete_extra_spaces ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+  return unless $startflag;
+
+  delete_child_spaces($he) if ($he->tag =~ /^(head|html|table|tr|ul)$/);
+  delete_trailing_spaces($he);
+  return 1;
+}
+
+
+# Remove every content item of HE that consists only of spaces (or is the
+# empty string).  Element children are kept: a stringified reference never
+# matches the all-spaces pattern.
+# An element without content (undefined content reference) is left alone,
+# matching the guard in delete_trailing_spaces.
+sub delete_child_spaces ( $ )
+{ my ($he) = check_args(1, @_);
+  my $ref_content = $he->content();
+  if (! defined $ref_content)
+    { return; }
+  # Filter the real content array in place so the element sees the change.
+  @{$ref_content} = grep { $_ !~ /^ *$/ } @{$ref_content};
+  return;
+}
+
+# Trim insignificant all-space strings from the content of HE:
+#  * an all-space string directly before a br, dd, dl, dt, hr, p or ul
+#    child is removed;
+#  * if HE itself is a dd, dt, h1-h6, li or p, a final all-space string is
+#    removed as well.
+# Elements without content are left alone.
+sub delete_trailing_spaces ( $ )
+{ my ($he) = check_args(1, @_);
+  my $ref_content = $he->content();
+  if (! defined $ref_content)
+    { return; }
+  # Could also check for previous element = /^h[1-6]$/.
+  # The array shrinks while we scan it, so re-test the bound on every pass
+  # and step back after each removal.
+  for (my $i = 0; $i<scalar(@{$ref_content})-1; $i++)
+    { if ($ {$ref_content}[$i] =~ /^ *$/)
+	{ my $next_elt = $ {$ref_content}[$i+1];
+	  if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
+	    { splice(@{$ref_content}, $i, 1);
+	      $i--; } } }
+  # The pattern is anchored once around the whole alternation; the extra
+  # "^" that used to sit in front of "h[1-6]" was redundant.
+  if ($he->tag =~ /^(dd|dt|h[1-6]|li|p)$/)
+    { my $last_elt = $ {$ref_content}[$#{$ref_content}];
+      if ((defined $last_elt) && ($last_elt =~ /^ *$/))
+	{ pop @{$ref_content}; } }
+}
+
+
+# If we find a paragraph that looks like
+# <P>
+#   <HR>
+#   <UL>
+# then accumulate its links into a contents_list and delete the paragraph.
+# Traversal callback: a <P> whose content is exactly an <HR> followed by a
+# <UL> is handed to process_child_links and then deleted (returning 0).
+# Every other start tag returns 1; end tags return nothing.
+sub process_if_child_links ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+  return unless $startflag;
+
+  return 1 unless ($he->tag() eq "p");
+  my $ref_kids = $he->content();
+  return 1 unless (defined $ref_kids);
+
+  my @kids = @{$ref_kids};
+  my $is_link_list = (scalar(@kids) == 2)
+    && (ref $kids[0]) && ($kids[0]->tag() eq "hr")
+    && (ref $kids[1]) && ($kids[1]->tag() eq "ul");
+  return 1 unless $is_link_list;
+
+  process_child_links($he);
+  $he->delete();
+  return 0;
+}
+
+
+# If we find
+#     <H4>
+#       "Footnotes"
+#     <DL>
+#       <DT>
+#         <A NAME="foot560">
+#           "...borrow"
+#         <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
+#           "1.2"
+#       <DD>
+#         "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
+#       ...
+# then record the footnote information and delete the section and list.
+
+# Set after the "Footnotes" <H4> has been seen; the next element visited
+# must then be the <DL> holding the footnotes.
+my $process_if_footnotes_expect_dl_next = 0;
+
+# Traversal callback that harvests the footnote section.
+# The <H4>Footnotes</H4> heading is replaced by an <ignore> element.  The
+# <DL> that follows is expected to hold <DT>/<DD> pairs, each <DT>
+# containing two anchors with the same NAME; every <DD> is stored in
+# %footnotes under that name, and the <DD> and the <DL> are replaced by
+# <ignore> elements.
+# Returns 0 for the elements it consumes, 1 for other start tags, and
+# nothing for end tags.  Dies if the expected structure is not found.
+sub process_if_footnotes ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+  if (!$startflag)
+    { return; }
+
+  if (($he->tag() eq "h4")
+      && has_single_content_string($he)
+      && ($ {$he->content}[0] eq "Footnotes"))
+    { html_replace_by_ignore($he);
+      $process_if_footnotes_expect_dl_next = 1;
+      return 0; }
+
+  if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
+    { my $ref_content = $he->content();
+      if (defined $ref_content)
+	{ $process_if_footnotes_expect_dl_next = 0;
+	  my @content = @{$ref_content};
+	  # Walk the content two items at a time: <DT> then <DD>.
+	  for (my $i=0; $i<$#content; $i+=2)
+	    { my $he_dt = $content[$i];
+	      my $he_dd = $content[$i+1];
+	      if (($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
+		{ $he->dump;
+		  die "expected <DT> and <DD> at positions $i and ", $i+1; }
+	      my @dt_content = @{$he_dt->content()};
+	      if ((scalar(@dt_content) != 2)
+		  || ($dt_content[0]->tag ne "a")
+		  || ($dt_content[1]->tag ne "a"))
+		{ $he_dt->dump;
+		  die "Expected 2 anchors as content of <DT>"; }
+	      # Only the anchor names are needed.  The second name must come
+	      # from the SECOND anchor; reading the first anchor twice made
+	      # the consistency check below impossible to fail.
+	      my ($dt1_name) = anchor_info($dt_content[0]);
+	      my ($dt2_name) = anchor_info($dt_content[1]);
+	      if ((!defined $dt1_name) || (!defined $dt2_name)
+		  || ($dt1_name ne $dt2_name))
+		{ $he_dt->dump;
+		  die "Expected identical names for anchors"; }
+	      html_replace_by_ignore($he_dd);
+	      $he_dd->tag("div"); # has no effect
+	      $footnotes{$dt1_name} = $he_dd; }
+	  html_replace_by_ignore($he);
+	  return 0; } }
+
+  if ($process_if_footnotes_expect_dl_next)
+    { $he->dump;
+      die "Expected <DL> for footnotes next"; }
+
+  return 1;
+}
+
+
+
+## Merge two adjacent paragraphs containing <DL> items, such as:
+#     <P>
+#       <DL>
+#         <DT>
+#           ...
+#         <DD>
+#           ...
+#     <P>
+#       <DL>
+#         <DT>
+#           ...
+#         <DD>
+#           ...
+
+# Traversal callback.  For each start tag, scan the element's content for
+# runs of adjacent <P> children that each contain a single <DL>, and fold
+# the content of every later <DL> in a run into the first one, removing
+# the later <P> from the element's content.
+# Returns 1 after scanning; returns nothing for end tags and for elements
+# without content.
+sub merge_dl ( $$$ )
+{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
+  if (!$startflag)
+    { return; }
+
+  my $ref_content = $he->content;
+  if (!defined $ref_content)
+    { return; }
+  # $i indexes the candidate first paragraph ($p1) of a run.
+  my $i = 0;
+  while ($i < scalar(@{$ref_content})-1)
+    { my $p1 = $ {$ref_content}[$i];
+      if ((ref $p1) && ($p1->tag eq "p")
+	  && has_single_content_with_tag($p1, "dl"))
+	{ my $dl1 = $ {$p1->content}[0];
+	  # In this loop, rhs, not lhs, of < comparison changes,
+	  # because we are removing elements from the content of $he.
+	  while ($i < scalar(@{$ref_content})-1)
+	    { my $p2 = $ {$ref_content}[$i+1];
+	      if (!((ref $p2) && ($p2->tag eq "p")
+		    && has_single_content_with_tag($p2, "dl")))
+		{ last; }
+	      # Merge these two elements.
+	      splice(@{$ref_content}, $i+1, 1); # remove $p2
+	      my $dl2 = $ {$p2->content}[0];
+	      $dl1->push_content(@{$dl2->content}); # put $dl2's content in $dl1
+	    }
+	  # extra increment because next element isn't a candidate for $p1
+	  $i++; }
+      $i++; }
+  return 1;
+}
+
+
+
+###########################################################################
+### Testing
+###
+
+# Debugging entry point, reached through the "-test" command-line flag.
+# ACTION selects what to do with FILE:
+#   "view" or ""  parse and clean up FILE (file_to_tree), then dump the tree
+#   "raw"         parse FILE without the cleanup passes, then dump the tree
+#   "section"     run process_section_file on FILE
+#   "index"       run process_index_file on FILE, then print_index_info
+# Any other ACTION is fatal.  The "elsif (0)" branch is disabled code that
+# would run process_section_file over a fixed list of files.
+sub test ( $$ )
+{ my ($action, $file) = check_args(2, @_);
+
+  # General testing
+  if (($action eq "view") || ($action eq ""))
+    { # # $file = "/homes/gws/mernst/www/links.html";
+      # # $file = "/homes/gws/mernst/www/index.html";
+      # # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
+      # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
+      # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
+      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
+      my $tree = file_to_tree($file);
+
+      ## Testing
+      # print STDERR $tree->as_HTML;
+      $tree->dump();
+
+      # print STDERR $tree->tag(), "\n";
+      # print STDERR @{$tree->content()}, "\n";
+      # 
+      # for (@{ $tree->extract_links(qw(a img)) }) {
+      #   my ($link, $linkelem) = @$_;
+      #   print STDERR "$link ", $linkelem->as_HTML;
+      #   }
+      # 
+      # print STDERR @{$tree->extract_links()}, "\n";
+
+      # my @top_level_elts = @{$tree->content()};
+
+      # if scalar(@{$tree->content()})
+      return;
+    }
+
+  elsif ($action eq "raw")
+    { my $tree = new HTML::TreeBuilder;
+      $tree->ignore_unknown(1);
+      # $tree->warn(1);
+      $tree->parse_file($file);
+
+      $tree->dump();
+
+      # cleanup_parse_tree($tree);
+      # $tree->dump();
+      return;
+    }
+
+  # Test dealing with a section.
+  elsif ($action eq "section")
+    { # my $file;
+      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
+      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
+      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
+      process_section_file($file, 0, "Title");
+    }
+
+  # Test dealing with many sections
+  elsif (0)
+    { my @files = ("/homes/fish/mernst/tmp/python-doc/html/api/about.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/abstract.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/api.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/concrete.html",
+		   # "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/debugging.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/embedding.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/front.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html",
+		   # "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/importing.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/includes.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/index.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/initialization.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/intro.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/mapping.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/node24.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/number.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/object.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/objects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/os.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/processControl.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/sequence.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/threads.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/types.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/utilities.html",
+		   "/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html");
+      for my $file (@files)
+	{ print STDERR "\n", "=" x 75, "\n", "$file:\n";
+	  process_section_file($file, 0, "Title");
+	}
+    }
+
+  # Test dealing with index.
+  elsif ($action eq "index")
+    { # my $file;
+      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";
+
+      process_index_file($file, "\@cindex");
+      print_index_info();
+    }
+
+  else
+    { die "Unrecognized action `$action'"; }
+}
+
+
+###########################################################################
+### Main loop
+###
+
+# Convert a whole HTML document, starting from its contents page FILE,
+# into a single Texinfo file.
+# The output name is FILE without its directory and without a trailing
+# ".html" or "/index.html", plus ".texi"; if nothing is left of the name,
+# the current directory's name is used.  $html_directory is set to FILE's
+# directory part.  FILE is processed as the "Top" section, then every entry
+# queued in @contents_list is processed in turn, and finally one
+# @printindex node is written per entry of @index_titles.
+# Dies if the output file cannot be opened or closed.
+sub process_contents_file ( $ )
+{ my ($file) = check_args(1, @_);
+
+  # could also use File::Basename
+  my $info_file = $file;
+  $info_file =~ s/(\/?index)?\.html$//;
+  if ($info_file eq "")
+    { chomp($info_file = `pwd`); }
+  $info_file =~ s/^.*\///;	# not the most efficient way to remove dirs
+
+  $html_directory = $file;
+  $html_directory =~ s/(\/|^)[^\/]+$/$1/;
+
+  my $texi_file = "$info_file.texi";
+  # TEXI is the file-wide output handle used by the other subs, so it has
+  # to stay a bareword handle; but do not carry on if it cannot be opened.
+  open(TEXI, ">$texi_file")
+    or die "Can't open $texi_file for writing: $!";
+
+  print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
+  print TEXI "\@c %**start of header\n";
+  print TEXI "\@setfilename $info_file\n";
+
+  # 2. Summary Description and Copyright
+  #      The "Summary Description and Copyright" segment describes the
+  #      document and contains the copyright notice and copying permissions
+  #      for the Info file.  The segment must be enclosed between `@ifinfo'
+  #      and `@end ifinfo' commands so that the formatters place it only in
+  #      the Info file.
+  # 
+  # The summary description and copyright segment does not appear in the
+  # printed document.
+  # 
+  #      @ifinfo
+  #      This is a short example of a complete Texinfo file.
+  #      
+  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
+  #      @end ifinfo
+
+
+  # 3. Title and Copyright
+  #      The "Title and Copyright" segment contains the title and copyright
+  #      pages and copying permissions for the printed manual.  The segment
+  #      must be enclosed between `@titlepage' and `@end titlepage'
+  #      commands.  The title and copyright page appear only in the printed
+  #      manual.
+  # 
+  # The titlepage segment does not appear in the Info file.
+  # 
+  #      @titlepage
+  #      @sp 10
+  #      @comment The title is printed in a large font.
+  #      @center @titlefont{Sample Title}
+  #      
+  #      @c The following two commands start the copyright page.
+  #      @page
+  #      @vskip 0pt plus 1filll
+  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
+  #      @end titlepage
+
+
+  # 4. `Top' Node and Master Menu
+  #      The "Master Menu" contains a complete menu of all the nodes in the
+  #      whole Info file.  It appears only in the Info file, in the `Top'
+  #      node.
+  # 
+  # The `Top' node contains the master menu for the Info file.  Since a
+  # printed manual uses a table of contents rather than a menu, the master
+  # menu appears only in the Info file.
+  # 
+  #      @node    Top,       First Chapter, ,         (dir)
+  #      @comment node-name, next,          previous, up
+  # 
+  #      @menu
+  #      * First Chapter::    The first chapter is the
+  #                           only chapter in this sample.
+  #      * Concept Index::    This index has two entries.
+  #      @end menu
+
+
+
+  # Each entry is [ title, depth, file ].  Use our own argument rather
+  # than reaching back into @ARGV for the file name.
+  $current_ref_tdf = [ "Top", 0, $file ];
+  process_section_file($file, 0, "Top");
+  while (scalar(@contents_list))
+  { $current_ref_tdf = shift @contents_list;
+    process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]);
+  }
+
+  print TEXI "\n";
+  for my $indextitle (@index_titles)
+    { print TEXI "\@node $indextitle\n";
+      print TEXI "\@unnumbered $indextitle\n";
+      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
+      print TEXI "\n"; }
+
+  print TEXI "\@contents\n";
+  print TEXI "\@bye\n";
+  # Buffered write errors (e.g. a full disk) only show up at close time.
+  close(TEXI)
+    or die "Error closing $texi_file: $!";
+}
+
+# This needs to be last so global variable initializations are reached.
+
+if (!@ARGV)
+{ die "No arguments supplied to html2texi.pl"; }
+
+if ($ARGV[0] eq "-test")
+{ # Whatever follows "-test" is [ACTION] FILE; both are optional.
+  # The action defaults to "" and the file to "index.html".
+  my $num_test_args = $#ARGV;
+  if ($num_test_args > 2)
+    { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
+  my $action = ($num_test_args == 2) ? $ARGV[1] : "";
+  my $file = ($num_test_args == 0) ? "index.html" : $ARGV[$num_test_args];
+  test($action, $file);
+  exit();
+}
+
+if (@ARGV != 1)
+{ die "Pass one argument, the main/contents page"; }
+
+process_contents_file($ARGV[0]);
author	Fred Drake <fdrake@acm.org>	1999-01-08 15:25:29 (GMT)
committer	Fred Drake <fdrake@acm.org>	1999-01-08 15:25:29 (GMT)
commit	3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8 (patch)
tree	0074eae07130a12e34af18ec6c354367f653c19e
parent	41498435ba1f2901e2e3498d7f7f5c773a602568 (diff)
download	cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.zip cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.tar.gz cpython-3fe1d326932432b1c27b75e9d3ce79f91dcfe1f8.tar.bz2