from xmlparser import *
from MatchingInput import *
from patterns_nb import *
from StringIO import StringIO
import string
import sys
import time
  
# ========================================================================

# This program converts documents marked up according to the Extreme
# Markup Conference XML DTD to HTML.

# It does its processing by recursively examining a stream of
# SAX-like markup tokens generated by an XML parser.

def processDocument (parser, out):
   """
      Process the top-level elements, produce the outer HTML wrapping,
      and dispatch processing for the lower-level elements.

      parser -- callable returning an iterable of markup tokens; it is
                also handed on to paraContent and outputSections.
      out    -- object with a write method that receives the HTML text.

      Raises AssertionError when the token returned by next () on a
      <front> token is not a <title> start element.
   """
   out.write ("<HTML>\n")
   # <BODY> must be opened exactly once, and only after any <HEAD>,
   # so remember whether it has been written yet.
   bodyOpen = False
   for token in parser ():
      if token.isStartElement ("paper"):
         for child in token.children ():
            if child.isStartElement ("front"):
               # next () is called outside of an assert statement on
               # purpose: asserts are removed under "python -O", and
               # the call must still happen there (presumably it is
               # what advances the parser onto the title -- confirm
               # against the parser module).
               if not child.next ().isStartElement ("title"):
                  raise AssertionError ("missing <PAPER/FRONT/TITLE>")
               title = paraContent (parser)
               if not bodyOpen:
                  out.write ("<HEAD><TITLE>" + title + \
                             "</TITLE></HEAD>\n<BODY>\n")
                  bodyOpen = True
               out.write ("<H1 ALIGN=\"CENTER\">" + title + "</H1>\n")
               outputSections (parser, out)
            elif child.isStartElement ("body"):
               if not bodyOpen:
                  out.write ("<BODY>\n")
                  bodyOpen = True
               outputSections (parser, out)
   if not bodyOpen:
      # No <front> or <body> was seen; still emit a balanced document.
      out.write ("<BODY>\n")
   # Footer: time stamp of the conversion run.
   out.write ("<P><I>" + time.asctime () + "</I></P>\n</BODY></HTML>\n")

def outputAuthor (parser, out):
   """
      Process the elements found in the "<author>" part of the document.

      The name parts (fname, surname) are written in italics inside a
      centred paragraph; the parts of an address follow, separated by
      line breaks; a bio is written as a paragraph of its own.

      parser -- callable returning an iterable of markup tokens; it is
                also handed on to paraContent and outputSections.
      out    -- object with a write method that receives the HTML text.
   """
   out.write ("<P ALIGN=\"CENTER\"><I>")
   # Track the italic run so that </I> is written exactly once, even
   # when there is no <address> or when there are several of them.
   italicOpen = True
   for token in parser ():
      if token.isStartElement ("fname"):
         out.write (paraContent (parser))
      elif token.isStartElement ("surname"):
         out.write (" " + paraContent (parser))
      elif token.isStartElement ("address"):
         if italicOpen:
            out.write ("</I>")
            italicOpen = False
         out.write ("<BR>\n")
         for part in token.children ():
            if part.isStartElement ("web"):
               url = paraContent (parser)
               out.write ("<BR>\n<A HREF=\"http://" + url + \
                          "\" TARGET=\"_new\">" + url + "</A>")
            elif part.isStartElement ("email"):
               url = paraContent (parser)
               out.write ("<BR>\n<A HREF=\"mailto:" + url + "\"" + \
                          ">" + url + "</A>")
            elif part.isStartElement ("phone"):
               out.write ("<BR>\n" + paraContent (parser))
            elif part.isStartElement ("affil"):
               # BR is an empty element with no closing form, so the
               # line break is written as <BR> rather than </BR>.
               out.write (paraContent (parser) + "<BR>\n")
            elif part.isStartElement ():
               # Any other address part: its text followed by a space.
               out.write (paraContent (parser) + " ")
      elif token.isStartElement ("bio"):
         if italicOpen:
            out.write ("</I>")
            italicOpen = False
         out.write ("</P>\n<P>")
         outputSections (parser, out)
   if italicOpen:
      out.write ("</I>")
   out.write ("</P>\n")

def outputSections (parser, out, secNum = ""):
   """
      Do the processing for what's found inside the top-level
      elements, within the various levels of sections and
      within the various kinds of lists.

      parser -- callable returning an iterable of markup tokens; it is
                also handed on to the other output functions.
      out    -- object with a write method that receives the HTML text.
      secNum -- number of the enclosing section as a string; used as
                the prefix when numbering <subsec1> headings.

      Raises AssertionError when the token returned by next () on a
      <section> or <subsec1> token is not a <title> start element.
   """
   # Sections and subsections at this level share one running counter.
   secCount = 0
   for token in parser ():
      if token.isStartElement ("para"):
         out.write ("<P>" + paraContent (parser) + "</P>\n")
      elif token.isStartElement ("randlist"):
         outputRandlist (parser, out)
      elif token.isStartElement ("deflist"):
         out.write ("<DL>")
         for item in token.children ():
            if item.isStartElement ("def.item"):
               for part in item.children ():
                  if part.isStartElement ("def.term"):
                     out.write ("<DT>" + paraContent (parser) + "</DT>")
                  elif part.isStartElement ():
                     # Every other child of a def.item is treated as
                     # the definition body.
                     out.write ("<DD>")
                     outputSections (parser, out)
                     out.write ("</DD>\n")
         out.write ("</DL>\n")
      elif token.isStartElement ("section"):
         # next () is called outside of an assert statement on purpose:
         # asserts are removed under "python -O", and the call must
         # still happen there (presumably it is what advances the
         # parser onto the title -- confirm against the parser module).
         if not token.next ().isStartElement ("title"):
            raise AssertionError ("no <SECTION/TITLE>")
         secCount += 1
         out.write ("<H2>" + str (secCount) + ". " + \
                    paraContent (parser) + "</H2>\n")
         outputSections (parser, out, str (secCount))
      elif token.isStartElement ("subsec1"):
         if not token.next ().isStartElement ("title"):
            raise AssertionError ("no <SUBSEC1/TITLE>")
         secCount += 1
         out.write ("<H3>" + secNum + "." + str (secCount) + " " + \
                    paraContent (parser) + "</H3>\n")
         outputSections (parser, out, secNum + "." + str (secCount))
      elif token.isStartElement ("abstract"):
         out.write ("<H3>Abstract</H3>\n")
         outputSections (parser, out)
      elif token.isStartElement ("author"):
         outputAuthor (parser, out)
      elif token.isStartElement ("keywords"):
         out.write ("<H3>Keywords</H3>\n")
         out.write ("<P><UL>")
         for keyword in token.children ():
            if keyword.isStartElement ("keyword"):
               out.write ("<LI>" + paraContent (parser) + "</LI>\n")
         out.write ("</UL></P>\n")

def outputRandlist (parser, out):
   """
      Write an unordered list as an HTML <UL>.  Kept as its own
      function because lists are needed from more than one caller.

      Each "li" start element found in the parser's token stream
      becomes one <LI>, whose content is produced by outputSections;
      all other tokens are ignored.
   """
   emit = out.write
   emit ("<UL>")
   for entry in parser ():
      if not entry.isStartElement ("li"):
         continue
      emit ("<LI>")
      outputSections (parser, out)
      emit ("</LI>\n")
   emit ("</UL>\n")

def paraContent (parser):
   """
      Gather paragraph-level content and return it as one HTML string.

      Character data is escaped: "<", ">" and "&" become entities,
      runs of other printable characters are copied unchanged, and any
      remaining character is written as a numeric character reference.
      Embedded verbatim, web and randlist elements are converted to
      <PRE>, a link, and an unordered list respectively.
   """
   html = StringIO ()
   # NOTE(review): the meaning of the "W" argument is defined by the
   # parser module, which is not visible here -- confirm.
   for piece in parser ("W"):
      if piece.isCharacters ():
         text = MatchingInput (piece.characters)
         # Try the alternatives in a fixed order until none of them
         # matches; the order of the attempts is significant.
         while True:
            if text ^ (AnyOfP (string.printable) - AnyOfP ("<>&")) [1:]:
               html.write (text.AllMatched)
               continue
            if text ^ IsP ("<"):
               html.write ("&lt;")
               continue
            if text ^ IsP (">"):
               html.write ("&gt;")
               continue
            if text ^ IsP ("&"):
               html.write ("&amp;")
               continue
            if not (text ^ MoveP (1)):
               break
            # A single character outside string.printable.
            html.write ("&#%s;" % str (ord (text.AllMatched)))
      elif piece.isStartElement ("verbatim"):
         html.write ("<PRE>%s</PRE>" % paraContent (parser))
      elif piece.isStartElement ("web"):
         link = paraContent (parser)
         html.write ("<A HREF=\"http://%s\" TARGET=\"_new\">%s</A>" %
                     (link, link))
      elif piece.isStartElement ("randlist"):
         # The list is written straight into the same buffer.
         outputRandlist (parser, html)
   return html.getvalue ()

# Last but not least, start the whole thing rolling, by starting up
# an XML parser, and feeding that and an output destination to the
# top-level element processor.

# First reading of the clock; the figure reported at the end is a
# later reading of the same clock.
time.clock()

# Open the document named by the first command-line argument and make
# sure the file is closed again even if the conversion fails part-way.
inputFile = open (sys.argv [1])
try:
   processDocument (anXMLParser (documentEntity = inputFile),
                    sys.stdout)
finally:
   inputFile.close ()

# Report the clock reading, in whole milliseconds, on standard error
# so that it does not mix with the HTML written to standard output.
print >>sys.stderr, "Times:", int (time.clock () * 1000), "milliseconds"
