#!/usr/bin/perl -w
# Exgen 1.0 -- A script to generate an XML parser from an XML document instance



###############################################################################
# Module Name     : exgen.pl
# Version 	  : 1.0
# Description     : This application contains all of the subroutines necessary
#		  : generates a perl EXPAT parser from an XML file
# Author          : Jeremy Ellman
#
#	Copyright (c) 2004, Jeremy Ellman. All Rights Reserved.
#	This module is free software. It may be used, redistributed
#	and/or modified under the terms of the Perl Artistic License
#     (see http://www.perl.com/perl/misc/Artistic.html)
# 

################ Change History #################
#
# 10/12/03: Re-written parser to use EXPAT rather than twig parser
#
################End Change History####################################

#
use strict;
use utf8;
use Getopt::Long; 
#use Data::Dumper;
#print Dumper($reference);

my %tags = ();		#

my $debug = 0;

#
sub main {
	my ($line, $first, $rest);
	my $count = 0;

	my $infile; 
	my $outfile;
	
	GetOptions(
		'xmldoc=s' => \$infile, 
		'parser=s' => \$outfile, 
		'debug' => sub { $debug = 1 });    
		

	die "exgen --infile <file> --outfile <perl file to generate> [--debug]\n\n Stopping "
		unless defined $infile && defined $outfile && $infile && $outfile;	
	
	print "IN: $infile, OUT: $outfile, DEBUG: $debug\n";
	my $debug_mode = 1;
	open (IN, $infile ) || die "Can't read $infile";
	open (OUT, ">" . $outfile) || die "Can't write $outfile";
	&XML_read();
	&XML_write();
	$infile = quotemeta $infile;
	print OUT "\n\n&run_generated_parser( \"$infile\") if \$debug;";
}

# Read an XML file instance noting tags and their attributes
#
sub XML_read {
	my $line;
	my $first;
	my $count = 0;	
	
	while ($line = <IN>) 	{
		$line =~ s/[\n\r]/ /g;
		print "$count: >>$line<<\n" if $debug;
		$count++;

		while ($line =~ /^.*?<(.+?)>(.*)/) {
			print "Found: $1 (rest: >>$2<<)\n" if $debug;
			$first = $1;
			&process_tag( $first );
						
			if (defined $2) {
				$line = $2;
			}
			else {
				last;
			}
		}
	}
	print "$count lines processed\n";
}

# Process a tag. We can automatically generate close </...> tags, so we're only 
# interested in open tags. For these, we need to check for attributes...
#
sub  process_tag {
	my $tag_line = shift;
	my $tag;
	my $rest;
	my $tag_value = {} ;

	$tag_line =~ /^([\w_]+)\W+(.*)/;

		$tag = $1;
		$rest  = $2;

		if ($tag =~ /^\w/) { # it's an open tag
			print "$tag STORED\n" if $debug;
			my $tv = $tags{ $tag };
			$tag_value = $tv  if defined $tv;	# get old tags value
			
			while ($rest =~ /^\s*(\w+).*?=\s*\"(.+?)\"(.*)/) {
				my $attribute = $1;
				print "Attribute: $attribute\n" if $debug;
				$tag_value->{ $attribute } = 1; 
				$rest = $3;
			}
			$tags{ $tag } = $tag_value;
#			print Dumper(\%tags);
		}
}

# Write out the Subs, an open and a close for each XML tag.
#
sub XML_write {
	#	my $file = shift;
	&print_header ();
	for my $tag (sort keys %tags) {
		print "Processing: $tag\n" if $debug;
		&print_open_callback( $tag );
		&print_close_callback( $tag );
	}
	
}

# Write out the parser headings.

sub print_header {
    print OUT "# Generated (in part) by Jeremy Ellman's EXGen\n";
    print OUT "# -- for efficiency, delete the callbacks you don't need\n\n";
    print OUT "use XML::Parser;\nuse strict;\n";
    print OUT "\n\nmy \$debug	= 1;\nmy \$text = \"\";\n";

    print OUT 'sub run_generated_parser {
	my ($infile)	= @_; 	# select parse call backs
	my $parser 		= new XML::Parser(\'Style\' => \'Subs\');
	$parser->setHandlers(Char    => \\&text_handler);

	if ($parser->parsefile( $infile )){
		print "Parse Completed\n";
	}
	else {
		if ($@ =~ /line\\s*(\\d+),/) {
			print "Bad_Data: >>$1<< \n";
		}
		else {
			print "Illegal XML file: $infile \n";
		}
	}
}


sub text_handler {
	my ($Expat, $String) = @_;
	$text .= $String if $String =~ /\\w/;
}';
}

# Write the OPEN sub for the XML tag
#
sub print_open_callback {
	my $tag = shift;
	print "OPEN: $tag\n" if $debug;

	print OUT "\n\nsub $tag {
	my( \$expat, \$element, \%", "$tag", "_hh) = \@_;
	my \$$tag = \\\%", "$tag", "_hh;\n
	print \"\\nopen: $tag\\n\" if \$debug;
	";
	my $attributes = $tags{ $tag };
	if (defined $attributes) {
		for my $attribute (sort (keys %{ $attributes })) {
			print "ATTR:  $attribute\n" if $debug;
			print OUT "\n\tmy \$$attribute = \$$tag->\{ $attribute \};"; 
		}
		for my $attribute (sort (keys %{ $attributes })) {
			print OUT "\n\tprint \"ATTRIBUTE ($tag) $attribute: \$$attribute\\n\" if \$debug && defined \$$attribute;";
		}
	}

	print OUT "\n}\n";
}

sub print_close_callback {
	my $tag = shift;
	print "CLOSE: $tag\n" if $debug;
	
	print OUT "\n\nsub $tag", "_ {
	my( \$expat, \$element) = \@_;
	print \"close: $tag\", ((\$text) ? \", Text: \\\"\$text\\\"\\n\" : \"\\n\") if \$debug;
	\$text = \"\";

}"
}

&main();

1;

=head1 NAME

Exgen 1.0 -- A script to generate an XML parser from an XML document instance

=head1 DESCRIPTION

Exgen takes an XML document instance and creates a 'stubs' style XML parser from 
it. This parser may then be adapted as necessary. This greatly reduces the work 
needed in to create an XML parser. Exgen is also useful where there is no access 
to the relevant DTD or schema.

The design objective of Exgen is that it is less tedious to delete callbacks 
that are not needed than to generate them by hand. 

Exgen may also be a help for novice XML parser writers.

=head1 Synopsis

perl -w exgen_1_0.pl --xmldoc myxml.xml --parser myparser.pl

perl -w myparser.pl myxml.xml

=head1 PREREQUISITES

This script requires
Getopt::Long

=head1 COREQUISITES

The generated XML parser requires
XML::Parser

=head1 Limitations

Namespaces are not supported yet.

=head1 AUTHOR

Jeremy Ellman (jeremy at ellman dot freeserve dot co dot uk)

=pod OSNAMES
any
=cut
=pod SCRIPT CATEGORIES
CPAN/Administrative
Fun/Educational
=cut
