#!ruby

######################################################################
##
##  Programming Ruby, HTML workshop version
##  index building (uses std SGML parsing module)
##
##  $Id: anchors.rb,v 1.5 2001/11/27 15:08:49 dave Exp $
##
######################################################################

require 'sgml-parser'
require 'cgi'           # for escapeHTML method

# Collects the targets of <A name=...> and <A href=...> tags found in the
# HTML files it is fed, keyed by the (cleaned-up) link text, and prints them
# to stdout as an alphabetical index: either an HTML Help (HHK) sitemap or a
# plain HTML page.
class LinkCollector < SGMLParser

# in the following pairs, the 1st format is used for HHK (HTML help)
# style index and the 2nd one for writing to a plain HTML index file

# document prologue
FMT0 = [
'<HTML><HEAD>
	<!-- Sitemap 1.0 -->
</HEAD><BODY>
<UL>' ,
'<HTML><HEAD>
	<base target="resultwnd">
        <style rel=stylesheet type=text/css>
           A { text-decoration: none; font-size: 80%; font-family: sans-serif}
        </style>
</HEAD><BODY bgcolor="white">'
].freeze

# one index entry; printf-style, arguments are (url, escaped text)
FMT1 = [
'    <LI> <OBJECT type="text/sitemap">
	<param name="Local" value="%s">
	<param name="Name" value="%s">
	</OBJECT>' ,
'<A href="%s">%s</A><br>'
].freeze

# document epilogue
FMT2 = [
"</UL>\n</BODY></HTML>" ,
'<!--
        <p>Book indexed by <a href="mailto:bdelmee@advalvas.be">
        Bernard Delm&eacute;e</a></p>
-->
</BODY></HTML>'
].freeze

    # link texts that are navigation aids rather than index material
    SKIPPED_ENTRIES = [ 'Previous <', 'Contents ^', 'Next >', 'Index:' ].freeze

    # verbose is handed to the SGMLParser constructor unchanged
    def initialize( verbose=nil )
        super( verbose )
        @capturing = false      # true while inside an <A> whose text we keep
        @anchor = nil           # target of the <A> currently being captured
        @cbuf = ''.dup          # text captured so far (mutable buffer)
        @namelist = {}          # link text => target, from <A name=...>
        @hreflist = {}          # link text => target, from <A href=...>
        @list_ptr = nil         # whichever of the two hashes the current <A> goes to
    end

    # Read the file called filename and run it through the parser.
    # The name is remembered in @fname because it prefixes in-page anchors.
    def do_parse( filename )
        @fname = filename
        # File.read closes the file itself (open(...).read left the handle
        # to the garbage collector) and does not go through Kernel#open
        feed( File.read( @fname ) )
        close()
    end

    # capture text between <tag> and </tag>
    def handle_data( data )
        # appending in place is safe: toggle_capture hands out the old
        # buffer and installs a fresh one, so nobody else holds @cbuf
        @cbuf << data if @capturing
    end

    # Flip the capturing state, returning whatever text had been captured
    # and starting over with an empty buffer.
    def toggle_capture
        str = @cbuf
        @cbuf = ''.dup
        @capturing = !@capturing
        str
    end

    # decide whether a link is worth keeping, and format it
    #
    # entry is the raw link text (left unmodified), url the link target.
    # Returns the index text, or '' when the link should be dropped.
    def _cleanup_entry( entry, url )
        entry = entry.strip
        # an anchor without any text has nothing to be indexed under; this
        # used to be a nil test that could never fire, so such anchors
        # ended up in the index as a bare " (section)"
        return '' if entry.empty?
        # skip some nodes
        return '' if SKIPPED_ENTRIES.include?( entry )
        # get rid of class in "class(#|::)method" (debatable?!?)
        entry = entry.sub( /^\S+(#|::|\.)/, '' )
        # merge "[ ]" and "[]" entries
        entry = entry.sub( /\[ \]/, '[]' )
        # make text as unique as possible by suffixing it with a section name;
        # the first pattern that matches the url provides it
        found = /\#(\S+)\./.match( url ) ||
                /\S+_(\S+)\.htm/.match( url ) ||
                /(\S+)\.htm/.match( url )
        section = found && found[1]

        if section && !section.empty? && section.downcase != entry.downcase
            entry += " (#{section})"
        end
        entry
    end

    # Store anchor in hash under the cleaned-up form of text, unless the
    # clean-up rejected the link (signalled by an empty string).
    def _insert_entry( text, anchor, hash )
        text = _cleanup_entry( text, anchor )
        hash[ text ] = anchor unless text.empty?
    end

    # deal with <A> tag
    #
    # attrs is walked as (attribute name, value) pairs.  v[1..-2] drops the
    # first and last character of the value -- presumably the quotes the
    # parser leaves around it (TODO confirm against sgml-parser).
    def start_a( attrs )
        # forget the previous tag entirely, so that an <A> carrying neither
        # name nor href can never capture text for a stale target
        @anchor = nil
        @list_ptr = nil
        attrs.each do |a, v|
            if a == 'name'
                @anchor = @fname + '#' + v[1..-2]
                @list_ptr = @namelist
            elsif a == 'href'
                @anchor = v[1..-2]
                # a link within the page: qualify it with the file name
                @anchor = @fname + v[1..-2] if @anchor[0,1] == '#'
                @list_ptr = @hreflist
            end
        end
        # exclude external links
        toggle_capture if @anchor && !/^\w+:/.match( @anchor )
    end

    # deal with </A> tag
    def end_a
        return unless @capturing
        text = toggle_capture()
        _insert_entry( text, @anchor, @list_ptr )
        @anchor = nil
    end

    # Name of the bookmark for the alphabetical group starting with initial
    # (a one-character string): "BM" plus the character code in hex.
    # String#[] returns a String, not a number, since Ruby 1.9, so the code
    # has to be asked for explicitly before it can go through %X.
    def _bookmark_id( initial )
        format( 'BM%X', initial.ord )
    end

    # dump links list to stdout
    #
    # html selects the output style: true for a plain HTML page (with a
    # table of jump targets and one heading per initial), false for HHK.
    # A short summary, or a complaint if nothing was collected, goes to stderr.
    URL_PREFIX = 'html/'
    # URL_PREFIX = 'http://www.rubycentral.com/book/'
    def do_output( html )
        fmt_idx = html ? 1 : 0
        # merge our two hashes of collected links
        links = @namelist.clone.update( @hreflist )
        if links.empty?
            $stderr.puts( "NO hyperlink found in input!" )
            return
        end

        puts FMT0[fmt_idx]
        # case-insensitive order; the raw text breaks ties so that the
        # result does not depend on the order links were collected in
        legends = links.keys.sort_by { |text| [ text.upcase, text ] }

        output_idx_toc( legends ) if html
        group = nil
        legends.each do |entry|
            if html # create a link per alphabetical group of topics
                initial = entry[0,1].upcase
                if initial != group
                    group = initial
                    printf '<ul><li><a name="%s">%s</a>',
                           _bookmark_id( initial ), CGI.escapeHTML( initial )
                    print '&nbsp;<a href="#idx_top" target="_self">^</a>'
                    puts  '</li></ul>'
                end
            end
            printf FMT1[fmt_idx], URL_PREFIX + links[ entry ], CGI.escapeHTML( entry )
            puts
        end
        puts FMT2[fmt_idx]
        $stderr.puts( "Collected #{@namelist.length} names" )
        $stderr.puts( "Collected #{@hreflist.length} hrefs" )
        $stderr.puts( "Wrote #{links.length} entries from merged lists" )
    end

    # create alphabetical shortcut list to topic groups
    #
    # topics is the list of index texts; their distinct upper-cased initials
    # are laid out in a table, eight to a row, each linking to the bookmark
    # do_output writes in front of the matching group.
    def output_idx_toc( topics )
        # create list of initials
        initials = topics.map { |entry| entry[0,1].upcase }.uniq.sort
        puts '<p id="idx_top"><center>'
        puts '<table border="1" cols="8">'
        puts '<caption><strong>jump targets</strong></caption>'
        initials.each_slice( 8 ) do |row|
            puts '<tr>'
            row.each do |initial|
                printf '%s<td><a href="#%s" target="_self">%s</a></td>%s',
                       "\t", _bookmark_id( initial ), CGI.escapeHTML( initial ), "\n"
            end
            puts '</tr>'
        end
        puts '</table></center></p>'
    end

end # of class

######################################################################

# Command-line driver: every file named on the command line is parsed by
# one LinkCollector, then the merged index is printed once.
if ARGV.empty?
    puts "usage: anchors.rb [-hhk] filename.html..."
    exit(1)
end

# a leading -hhk switch asks for HTML Help (HHK) output instead of plain
# HTML; it is removed so that only file names remain in ARGV
plain_html = ARGV[0] != '-hhk'
ARGV.shift unless plain_html

collector = LinkCollector.new
ARGV.each { |path| collector.do_parse( path ) }
collector.do_output( plain_html )
