# Module xmlparser (An XML Parser Interface)

# The core of a truly streaming XML parser, based on an underlying
# SAX parser.

from xml.sax import saxexts, saxlib
import stackless
from  StringIO import StringIO
import string
from patterns_nb import *
from textpatterns_nb import *
from MatchingInput import *

# ========================================================================

# All the kinds of things returned from the XML parser.
# Each of these corresponds to a handler method defined by the Python
# SAX API.

class markupToken (object):
   """
      The base type of all the "markup token" classes that follow.
      Every "is..." predicate answers False here; each concrete token
      class overrides the one predicate that identifies it.  The class
      also implements three methods that forward to the token
      generator that handed out the token: "next", "peek" and
      "children".
   """
   def __init__ (self):
      # The generator that returned this token to the client, or None
      # if the token has not been handed out by a generator.
      self.generator = None
   def isCharacters (self):
      return False
   def isEndDocument (self, *names):
      return False
   def isEndElement (self, *names):
      return False
   def isIgnorableWhitespace (self):
      return False
   def isProcessingInstruction (self, *targets):
      return False
   def isSetDocumentLocator (self):
      return False
   def isSkippedEntity (self, *names):
      return False
   def isStartDocument (self, *names):
      return False
   def isStartElement (self, *names):
      return False
   def isException (self, *severities):
      return False
   def isEntity (self, *names):
      return False
   def isNotationDecl (self, *names):
      return False
   def isUnparsedEntityDecl (self, *names):
      return False
   def isTheBegining (self):
      return False
   def isTheEnd (self):
      return False
   def next (self):
      """
         Get the next markup token from the stream of markup tokens
         this token came from.  This method is useful when the user
         wants to grab the next token without going back to the
         generation mechanism.
         If the token has no generator, a plain markupToken (for which
         every "is..." test is False) is returned.
      """
      if self.generator is None:
         # No stream to read from: answer with a neutral token.
         return markupToken ()
      return self.generator.next ()
   def peek (self, count = 0):
      """
         Get a following markup token from the stream of markup tokens
         that is being generated, without consuming it.  This method
         is useful when some "right-hand side" context is needed for
         the currently viewed token.  .peek takes an optional
         argument, which is the offset of the token to be returned
         (0, the default, is the token that .next would return).
         If the token has no generator, a plain markupToken (for which
         every "is..." test is False) is returned.
      """
      if self.generator is None:
         # No stream to look into: answer with a neutral token.
         return markupToken ()
      return self.generator.peek (count)
   def children (self, options = "", endElement = None, endAttribute = None):
      """
         Another way of invoking the originating XML Parser's
         XML token generator, using (in practice usually a
         start-tag) token returned from the parser.
         The arguments are passed unchanged to xmlParserTokenGenerator.
         The token must have been returned by a generator: the parser
         is reached through self.generator.
      """
      return xmlParserTokenGenerator \
        (self.generator.parser, options, endElement, endAttribute).__iter__ ()

# A useful function for the following class implementations:

def strData (text, quote = ""):
   """
      Return "text" escaped for inclusion in XML output.
      Each of "<", ">" and "&" is replaced by its predefined entity
      reference.  "quote" names the quote character(s) that must also
      be escaped, for text that will sit inside a quoted literal;
      only the double and single quote characters are supported.
      Anything else that the first pattern does not accept is written
      as a decimal character reference.
   """
   out = StringIO ()
   subject = MatchingInput (text)
   while True:
      # NOTE(review): presumably this matches a run of one or more
      # characters from " " to "~" other than the ones listed in the
      # third argument -- confirm against patterns_nb.RangeP.
      if subject ^ RangeP (" ", "~", "<>&" + quote) [1:]:
         out.write (subject.AllMatched)
      elif subject ^ AnyOfP ("<>&"):
         out.write ({"<": "&lt;", ">": "&gt;", "&": "&amp;"} \
                    [subject.AllMatched])
      elif subject ^ AnyOfP (quote):
         # "&quot;" is the predefined XML entity for the double quote;
         # "&quote;" is not defined and makes the output ill-formed.
         out.write ({"\"": "&quot;", "'": "&apos;"} [subject.AllMatched])
      elif subject ^ MoveP (1):
         # Any other single character: numeric character reference.
         out.write ("&#" + str (ord (subject.AllMatched)) + ";")
      else:
         # Nothing matched at all: stop.
         break
   return out.getvalue ()

# The following classes correspond one-to-one with the markup token
# callback functions defined for the SAX parser.  For each, its
# attributes are the arguments that would be passed to the markup
# parser.  There are a few extra classes, which are noted below.

# In each case, except for the start element attribute objects,
# which are returned as part of a start element token, there's a
# method that will return True or False depending on whether the
# token is of that type.  In addition, if one or more arguments are
# specified, the token's name or target must be one of them for the
# test to succeed.

class markupTokenCharacters (markupToken):
   """
      A run of character data, as reported through the SAX
      .characters call-back.
   """
   def __init__ (self, characters):
      self.generator = None
      self.characters = characters
   def isCharacters (self):
      return True
   def __str__ (self):
      # The text is written out escaped, as it could appear in a document.
      escaped = strData (self.characters)
      return escaped

class markupTokenEndDocument (markupToken):
   """
      Corresponds to the SAX .endDocument call-back.
   """
   def isEndDocument (self, *names):
      # Accepts (and ignores) names so that the call signature agrees
      # with the base class: the test can then be made the same way
      # on any token.
      return True
   def __str__ (self):
      return "<!-- END DOCUMENT -->"

class markupTokenEndElement (markupToken):
   """
      An end tag, as reported through the SAX .endElement call-back.
   """
   def __init__ (self, name, depth):
      self.generator = None
      self.name = name
      self.elementDepth = depth
      self.usedAsEnd = False
   def isEndElement (self, *names):
      # With no names given, any end tag will do.
      if not names:
         return True
      return self.name in names
   def __str__ (self):
      return "".join (["</", self.name, ">"])

class markupTokenIgnorableWhitespace (markupToken):
   """
      White space reported through the SAX .ignorableWhitespace
      call-back.
   """
   def __init__ (self, characters):
      self.generator = None
      self.characters = characters
   def isIgnorableWhitespace (self):
      return True
   def __str__ (self):
      # Written out escaped, in the same way as character data.
      escaped = strData (self.characters)
      return escaped

class markupTokenProcessingInstruction (markupToken):
   """
      A processing instruction, as reported through the SAX
      .processingInstruction call-back.
   """
   def __init__ (self, target, data):
      self.generator = None
      self.target = target
      self.data = data
   def isProcessingInstruction (self, *targets):
      # With no targets given, any processing instruction will do.
      if not targets:
         return True
      return self.target in targets
   def __str__ (self):
      buf = StringIO ()
      hasData = self.data is not None
      buf.write ("<?")
      if self.target is not None:
         buf.write (self.target)
         # A separating space is only needed when data follows the target.
         if hasData:
            buf.write (" ")
      if hasData:
         buf.write (self.data)
      buf.write ("?>")
      return buf.getvalue ()

class markupTokenSetDocumentLocator (markupToken):
   """
      The position information reported through the SAX
      .setDocumentLocator call-back.
   """
   def __init__ (self, columnNumber, lineNumber, publicId, systemId):
      self.generator = None
      self.columnNumber = columnNumber
      self.lineNumber = lineNumber
      self.publicId = publicId
      self.systemId = systemId
   def isSetDocumentLocator (self):
      return True
   def __str__ (self):
      buf = StringIO ()
      buf.write ("<!-- SET DOCUMENT LOCATOR: COL=" + \
                 str (self.columnNumber) + ", LINE=" + str (self.lineNumber))
      # The identifiers are optional; each is shown only when present.
      for label, identifier in ((", PUBLIC ID=\"", self.publicId),
                                (", SYSTEM ID=\"", self.systemId)):
         if identifier is not None:
            buf.write (label + strData (identifier, "\"") + "\"")
      buf.write (" -->")
      return buf.getvalue ()

class markupTokenSkippedEntity (markupToken):
   """
      An entity reference reported through the SAX .skippedEntity
      call-back.
   """
   def __init__ (self, name):
      self.generator = None
      self.name = name
   def isSkippedEntity (self, *names):
      # With no names given, any skipped entity will do.
      if not names:
         return True
      return self.name in names
   def __str__ (self):
      return "".join (["&", self.name, ";"])

class markupTokenStartDocument (markupToken):
   """
      Corresponds to the SAX .startDocument call-back.
   """
   def isStartDocument (self, *names):
      # Accepts (and ignores) names so that the call signature agrees
      # with the base class: the test can then be made the same way
      # on any token.
      return True
   def __str__ (self):
      return "<!-- START DOCUMENT -->"

class markupTokenStartElement (markupToken):
   """
      A start tag, as reported through the SAX .startElement
      call-back.  .attrs holds the tag's attributes.
   """
   def __init__ (self, name, attrs, depth):
      self.generator = None
      self.name = name
      self.attrs = attrs
      self.elementDepth = depth
      self.usedAsEnd = False
   def isStartElement (self, *names):
      # With no names given, any start tag will do.
      if not names:
         return True
      return self.name in names
   def __str__ (self):
      buf = StringIO ()
      buf.write ("<" + self.name)
      # Each attribute formats itself as name="value".
      for attribute in self.attrs.itervalues ():
         buf.write (" " + str (attribute))
      buf.write (">")
      return buf.getvalue ()

class markupAttribute (object):
   """
      One attribute of a start element: its name, its type and its
      value.
   """
   def __init__ (self, name, type, value):
      self.value = value
      self.type = type
      self.name = name
   def __str__ (self):
      # Formats as name="value", with the value escaped for use
      # inside a double-quoted literal.
      escaped = strData (self.value, "\"")
      return "".join ([self.name, "=\"", escaped, "\""])

class markupTokenException (markupToken):
   """
      The token used to report a markup exception.  Its .severity
      property gives the general class of the exception:
        0 = warning
        1 = "reportable" error
        2 = severe error
        3 = system error, usually a fault in the parser or
            in attempting to resolve an external entity.
   """
   def __init__ (self, severity, message,
                 columnNumber, lineNumber, publicId, systemId):
      self.generator = None
      self.severity = severity
      self.message = message
      # Where the problem was found.
      self.columnNumber = columnNumber
      self.lineNumber = lineNumber
      self.publicId = publicId
      self.systemId = systemId
   def isException (self, *severities):
      # With no severities given, any exception will do.
      if not severities:
         return True
      return self.severity in severities
   def __str__ (self):
      return "".join (["<!-- EXCEPTION: ", self.message, " -->"])

class markupTokenEntity (markupToken):
   """
      A request to resolve an entity (the SAX .resolveEntity
      call-back).
      This token won't be returned to the client if an
      entity resolver was specified when creating the
      anXMLParser object.
   """
   def __init__ (self, name, publicId, systemId, parser):
      self.generator = None
      self.parser = parser
      self.name = name
      self.publicId = publicId
      self.systemId = systemId
   def setEntity (self, inputFile):
      """
         Hand the resolution of the entity described by this token
         back to the XML parser.  What's passed should be a string,
         a Unicode string, or a file-like object.
      """
      self.parser.setEntity (inputFile)
   def isEntity (self, *names):
      # With no names given, any entity will do.
      if not names:
         return True
      return self.name in names
   def __str__ (self):
      buf = StringIO ()
      buf.write ("&" + self.name + ";<!-- ENTITY " + self.name)
      if self.publicId is None:
         buf.write (" SYSTEM")
      else:
         buf.write (" PUBLIC \"" + strData (self.publicId, "\"") + "\"")
      if self.systemId is not None:
         buf.write (" \"" + strData (self.systemId, "\"") + "\"")
      buf.write (" -->")
      return buf.getvalue ()

class markupTokenNotationDecl (markupToken):
   """
      A notation declaration, as reported through the SAX
      .notationDecl call-back.
   """
   def __init__ (self, name, publicId, systemId):
      self.generator = None
      self.name = name
      self.publicId = publicId
      self.systemId = systemId
   def isNotationDecl (self, *names):
      # With no names given, any notation declaration will do.
      if not names:
         return True
      return self.name in names
   def __str__ (self):
      buf = StringIO ()
      buf.write ("<!NOTATION " + self.name)
      if self.publicId is None:
         buf.write (" SYSTEM")
      else:
         buf.write (" PUBLIC \"" + strData (self.publicId, "\"") + "\"")
      if self.systemId is not None:
         buf.write (" \"" + strData (self.systemId, "\"") + "\"")
      buf.write (">")
      return buf.getvalue ()

class markupTokenUnparsedEntityDecl (markupToken):
   """
      An unparsed entity declaration, as reported through the SAX
      .unparsedEntityDecl call-back.
   """
   def __init__ (self, name, publicId, systemId, ndata):
      self.generator = None
      self.name = name
      self.publicId = publicId
      self.systemId = systemId
      self.ndata = ndata
   def isUnparsedEntityDecl (self, *names):
      # With no names given, any unparsed entity declaration will do.
      if not names:
         return True
      return self.name in names
   def __str__ (self):
      buf = StringIO ()
      buf.write ("<!ENTITY " + self.name)
      if self.publicId is None:
         buf.write (" SYSTEM")
      else:
         buf.write (" PUBLIC \"" + strData (self.publicId, "\"") + "\"")
      if self.systemId is not None:
         buf.write (" \"" + strData (self.systemId, "\"") + "\"")
      if self.ndata is not None:
         buf.write (" NDATA " + self.ndata)
      buf.write (">")
      return buf.getvalue ()

class markupTokenTheBegining (markupToken):
   """
      A special token that asks the client for the document's
      entity.  It is returned when no document entity was given
      when the anXMLParser object was created.
   """
   def __init__ (self, parser):
      self.generator = None
      self.parser = parser
   def isTheBegining (self):
      return True
   def setEntity (self, inputFile):
      """
         Called by the client to give the parser the document's
         entity.  What's passed should be a string, a Unicode
         string or a file-like object.
      """
      self.parser.setEntity (inputFile)
   def __str__ (self):
      return "<!-- THE BEGINNING -->"

class markupTokenTheEnd (markupToken):
   """
      A special token marking the end of the token stream.
   """
   def __str__ (self):
      return "<!-- THE END -->"
   def isTheEnd (self):
      return True

# ========================================================================

# An encapsulation of the XML parser itself.

class xmlParserTokenGenerator (object):
   """
      A generator of XML tokens from an XML Parser.
      Generators are typically created using methods of a parser,
      rather than being created directly by clients.

      The "options" string is case-insensitive.  The letters looked
      for are:
        W = also return character data that consists only of
            white space (it is suppressed otherwise)
        O = stop one element further out than the current one
        E = together with endElement/endAttribute, do not stop one
            element further out
        Z = run to the end of the token stream
      "endElement" is an element name or a list/tuple of names, and
      "endAttribute" a mapping from attribute names to values; an end
      tag that matches makes that end tag the last token returned.
   """
   def __iter__ (self):
      """
         Generate the tokens returned from an XML parser.
         This method can be invoked while another invocation of it is
         still generating tokens.  In that case it generates tokens
         up to but not including the end tag of the element, if any,
         that was the currently opened one when the generator method
         was invoked.  In that case the end tag is not returned at
         all.
         If there are no opened elements at that point, as there are
         when this method is first invoked, all tokens to the end of
         the stream are returned.
      """
      self.startingDepth = len (self.parser.openElements)
      if self.goToEnd:
         self.startingDepth = 0
      else:
         # If at an end element, the element being ended has not been
         # popped yet, so allow for it being just about to be popped.
         if self.parser.currentToken is not None and \
                  self.parser.currentToken.isEndElement ():
            self.startingDepth -= 1
         if self.endOneOut:
            self.startingDepth -= 1
      return self
   def next (self):
      """
         Return the next token from the xml token generator, or stop
         the iteration.  This method either returns a single token
         or raises an end-of-iteration exception.
      """
      if not self.exitNextTime:
         while True:
            # Tokens already fetched by .peek are used up first.
            if len (self.parser.pendingTokens) > 0:
               self.parser.currentToken = self.parser.pendingTokens.pop (0)
            elif self.parser.AllDone:
               break
            else:
               # The parser stores the token in parser.currentToken
               # before sending it, so the received value isn't needed.
               self.parser.channel.receive ()
            if self.parser.currentToken.isEndElement ():
               # The end tag of the element this generator was started
               # in ends the iteration and is not itself returned.
               if len (self.parser.openElements) <= self.startingDepth:
                  break
               self.exitNextTime = self.endElement is not None and \
                             self.parser.currentToken.name in self.endElement
               if self.endAttribute is not None:
                  # .attrs maps attribute names to attribute objects;
                  # it's the objects that have .name and .value.
                  # NOTE(review): when no attribute matches, this also
                  # cancels a match made on endElement -- confirm that
                  # this is intended.
                  for attr in self.parser.openElements [-1].attrs.values ():
                     if attr.name in self.endAttribute and \
                              attr.value == self.endAttribute [attr.name]:
                        self.exitNextTime = True
                        break
                  else:
                     self.exitNextTime = False
            # White-space-only character data is skipped unless the
            # "W" option was given.
            if not (self.suppressData and \
                    self.parser.currentToken.isCharacters () and \
                    len (self.parser.currentToken.characters.strip \
                                       (" \n\r\t")) == 0):
               self.parser.currentToken.generator = self
               return self.parser.currentToken
      raise StopIteration
   def peek (self, count = 0):
      """
         Take a look at one of the following markup tokens without
         actually consuming it, or any that precede it and haven't
         otherwise been consumed.  "count" is the offset of the token
         wanted; 0 is the token .next would return.  If the stream
         ends before that token, a plain markupToken is returned.
      """
      savedToken = self.parser.currentToken
      while not self.parser.AllDone and \
            len (self.parser.pendingTokens) <= count:
         self.parser.channel.receive ()
         self.parser.pendingTokens.append (self.parser.currentToken)
      # Fetching ahead overwrote the parser's current token; put back
      # the one the client is actually looking at.
      self.parser.currentToken = savedToken
      if count < len (self.parser.pendingTokens):
         return self.parser.pendingTokens [count]
      else:
         return markupToken ()
   def __init__ (self, parser, options = "",
                 endElement = None, endAttribute = None):
      self.parser = parser
      # Anything other than a string is treated as "no options".
      if type (options) != type ("") and type (options) != type (u""):
         options = ""
      else:
         options = options.upper ()
      self.suppressData = "W" not in options
      # Normalize endElement to None or a list of names.
      self.endElement = endElement
      if endElement is not None:
         if len (endElement) == 0:
            self.endElement = None
         elif isinstance (endElement, tuple):
            self.endElement = list (endElement)
         elif not isinstance (endElement, list):
            self.endElement = [endElement]
      self.endAttribute = endAttribute
      if endAttribute is not None and len (endAttribute) == 0:
         self.endAttribute = None
      # Both tests below look at the arguments as passed, before the
      # empty ones were normalized to None.
      self.endOneOut = "O" in options or \
                       "E" not in options and \
                          (endElement is not None or endAttribute is not None)
      self.goToEnd = "Z" in options or \
                     "O" not in options and "E" not in options and \
                        (endElement is not None or endAttribute is not None)
      self.exitNextTime = False

class anXMLParser (saxlib.DocumentHandler):
   """
      An encapsulation of an XML SAX reader as a generator of XML
      markup tokens.
   """
# Methods used by clients of this class.
   def __init__ (self, documentEntity = None, include = "",
                       entityResolver = None):
      """
         Creating anXMLParser instance starts a SAX parser in a
         separate tasklet.  The attributes of this class mostly
         have to do with managing communication with the
         parser tasklet, and with passing information to and from it.
      """
      self.currentToken = None
      self.currentOpenEntity = documentEntity
      self.openElements = []
      if type (include) == type (""):
         self.include = string.upper (include)
      else:
         self.include = ""
      self.entityCallBack = entityResolver
      self.AllDone = False
      self.pendingTokens = []
      self.saxParser = None
      self.channel = stackless.channel ()
      stackless.tasklet (self.__parserTaskletTopLevel) ()
   def __del__ (self):
      if self.saxParser is not None:
         self.AllDone = True
         self.channel.receive ()
         del self.saxParser
         self.saxParser = False
   def __call__ (self, options = "", endElement = None, endAttribute = None):
      """
         Invoke an XML token generator for this instance of the parser.
      """
      return xmlParserTokenGenerator \
                (self, options, endElement, endAttribute).__iter__ ()
   def setEntity (self, inputFile):
      """
         This method should be called whenever the XML parser returns
         a markupTokenTheBegining or markupTokenEntity token.
         It must be passed a string, Unicode string or file-like object.
         Alternatively, the .setEntity method of the returned token
         can be called.
      """
      if type (inputFile) == type ("") or type (inputFile) == type (u""):
         self.currentOpenEntity = StringIO (inputFile)
      else:
         self.currentOpenEntity = inputFile
# Methods only used internally to this class.
   def __parserTaskletTopLevel (self):
      """
         The top-level function within the parser tasklet.  It's
         mostly concerned with setting up and cleanly shutting
         down the SAX parser.
      """
      self.saxParser = saxexts.make_parser ()
      try:
         self.saxParser.setErrorHandler (self)
         self.saxParser.setDTDHandler (self)
         self.saxParser.setDocumentHandler (self)
         self.saxParser.setEntityResolver (self)
         if self.currentOpenEntity is None:
            self.__emit (markupTokenTheBegining (self))
            if self.currentOpenEntity is None:
               self.currentToken = markupTokenException (3,
                                             "Entity resolution failed.",
                                             0, 0, None, None)
               self.AllDone = True
               return self.currentToken
         self.saxParser.parseFile (self.currentOpenEntity)
      except StandardError, e:
         self.currentToken = markupTokenException (3,
                                                   str (e), 0, 0, None, None)
      except IOError, e:
         self.currentToken = markupTokenException (3, e.getMessage (),
                 e.getColumnNumber (), e.getLineNumber (),
                 e.getPublicId (), e.getSystemId ())
      except saxlib.SAXParseException, e:
         self.currentToken = markupTokenException (3, e.getMessage (),
                 e.getColumnNumber (), e.getLineNumber (),
                 e.getPublicId (), e.getSystemId ())
      except SystemExit:
         self.currentToken = markupTokenTheEnd ()
      except:
         self.currentToken = markupTokenException (3,
                          "Unknown exception in parser tasklet.",
                          0, 0, None, None)
      else:
         self.currentToken = markupTokenTheEnd ()
      self.AllDone = True
      del self.saxParser
      self.saxParser = None
      self.channel.send (self.currentToken)
      return
   def __emit (self, currentToken):
      """
         Return a parser token to the client.
      """
      self.currentToken = currentToken
      self.channel.send (currentToken)
      if self.AllDone:
         raise SystemExit
# Handler methods, called by the SAX parser
   def characters (self, ch, start, length):
      self.__emit (markupTokenCharacters (ch))
   def endDocument (self):
      if "S" in self.include:
         self.__emit (markupTokenEndDocument ())
   def endElement (self, name):
      self.__emit (markupTokenEndElement (name, len (self.openElements)))
      self.openElements.pop ()
   def ignorableWhitespace (self, ch, start, length):
      if "I" in self.include:
         self.__emit (markupTokenIgnorableWhitespace (ch))
   def processingInstruction (self, target, data):
      if "P" in self.include:
         self.__emit (markupTokenProcessingInstruction (target, data))
   def setDocumentLocator (self, locator):
      if "L" in self.include:
         self.__emit (markupTokenSetDocumentLocator ( \
               locator.getColumnNumber (), locator.getLineNumber (),
               locator.getPublicId (), locator.getSystemId ()))
   def startDocument (self):
      if "S" in self.include:
         self.__emit (markupTokenStartDocument ())
   def skippedEntity (self, name):
      self.__emit (markupTokenSkippedEntity (name))
   def startElement (self, name, attrs):
      attrList = {}
      for i in range (attrs.getLength ()):
         attrList [attrs.getName (i)] = \
               markupAttribute (attrs.getName (i),
                                attrs.getType (i), attrs.getValue (i))
      self.openElements.append (markupTokenStartElement (name, attrList,
                                                      len (self.openElements)))
      self.__emit (self.openElements [-1])
   def notationDecl (self, name, publicId, systemId):
      if "D" in self.include:
         self.__emit (markupTokenUnparsedEntityDecl (name, publicId, systemId))
   def unparsedEntityDecl (self, name, publicId, systemId, ndata):
      if "D" in self.include:
         self.__emit (markupTokenUnparsedEntityDecl (name, publicId,
                                                   systemId, ndata))
   def error (self, exception):
      self.__emit (markupTokenException (1,
            exception.getMessage (),
            exception.getColumnNumber (), exception.getLineNumber (),
            exception.getPublicId (), exception.getSystemId ()))
   def fatalError (self, exception):
      self.__emit (markupTokenException (2,
            exception.getMessage (),
            exception.getColumnNumber (), exception.getLineNumber (),
            exception.getPublicId (), exception.getSystemId ()))
   def warning (self, exception):
      if "W" in self.include:
         self.__emit (markupTokenException (0,
               exception.getMessage (),
               exception.getColumnNumber (), exception.getLineNumber (),
               exception.getPublicId (), exception.getSystemId ()))
   def resolveEntity (self, name, publicId, systemId):
      if self.entityCallBack is None:
         self.__emit (markupTokenEntity (name, publicid, systemid, self))
      else:
         self.currentOpenEntity = self.entityCallBack (name,
                                                       publicId, systemId)
      if self.currentOpenEntity is None:
         info = "unknown entity"
         if name is not None:
            info = "entity " + name
         if systemId is not None:
            info += " ("
            if publicId is not None:
               info += "public id =\"" + publicId + "\", "
            info += "system id =\"" + systemId + "\")"
         elif publicId is not None:
            info += " (public id =\"" + publicId + "\")"
         self.__emit (markupTokenException (3, "Resolution failed for " + info,
                                          0, 0, None, None), False)
         raise SystemExit ()
      if type (self.currentOpenEntity) == type (""):
         self.currentOpenEntity = StringIO (self.currentOpenEntity)
      return self.currentOpenEntity
