FooBar *
* should pop to 'p', not 'b'. +
Foo
* | * should pop to 'tr', not the first 'td'
+ """
+
+ nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+ isNestable = nestingResetTriggers != None
+ isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ popTo = None
+ inclusive = True
+ for i in range(len(self.tagStack)-1, 0, -1):
+ p = self.tagStack[i]
+ if (not p or p.name == name) and not isNestable:
+ #Non-nestable tags get popped to the top or to their
+ #last occurance.
+ popTo = name
+ break
+ if (nestingResetTriggers != None
+ and p.name in nestingResetTriggers) \
+ or (nestingResetTriggers == None and isResetNesting
+ and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+ #If we encounter one of the nesting reset triggers
+ #peculiar to this tag, or we encounter another tag
+ #that causes nesting to reset, pop up to but not
+ #including that tag.
+ popTo = p.name
+ inclusive = False
+ break
+ p = p.parent
+ if popTo:
+ self._popToTag(popTo, inclusive)
+
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Handle a start tag reported by the parser.

    While the quote stack is non-empty the tag is not treated as
    markup: it is rebuilt as text from `name` and `attrs` and passed
    to handle_data(). Otherwise any pending data is flushed, open
    tags are popped via _smartPop() (unless the tag is self-closing),
    and a new Tag is created, linked after self.previous and pushed
    onto the tag stack.

    name -- the tag name.
    attrs -- an iterable of (attribute name, value) pairs.
    selfClosing -- true if the caller already knows the tag closes
        itself; such tags are popped again right after being pushed.

    Returns the new Tag, or None when the tag was emitted as text or
    rejected by self.parseOnlyThese.
    """
    if self.quoteStack:
        # This is not a real tag; re-emit it verbatim as character data.
        attrs = ''.join([' %s="%s"' % (key, value)
                         for key, value in attrs])
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    # At the top level of the document, honour the parseOnlyThese
    # filter: skip tags it does not match (or all tags, if the filter
    # is text-only).
    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text
                or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        # Everything up to the matching end tag is literal text.
        self.quoteStack.append(name)
        self.literal = 1
    return tag
+
def unknown_endtag(self, name):
    """Handle an end tag reported by the parser.

    If a quote tag is open and `name` does not match the innermost
    one, the end tag is not real markup and is re-emitted as text.
    Otherwise pending data is flushed and the tag stack is popped to
    `name`; if `name` closes the innermost quote tag, that entry is
    removed and self.literal is updated to reflect whether any quote
    tag is still open.
    """
    if self.quoteStack and self.quoteStack[-1] != name:
        # This is not a real end tag; pass it through as data with
        # its full "</...>" form, mirroring unknown_starttag().
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        self.literal = (len(self.quoteStack) > 0)
+
def handle_data(self, data):
    """Queue a chunk of character data on self.currentData."""
    pending = self.currentData
    pending.append(data)
+
def _toStringSubclass(self, text, subclass):
    """Add `text` to the tree as its own string node.

    Pending data is flushed first, then `text` is queued and flushed
    again with `subclass` passed to endData().
    """
    flush = self.endData
    flush()
    self.handle_data(text)
    flush(subclass)
+
def handle_pi(self, text):
    """Record a processing instruction as a ProcessingInstruction.

    Text beginning with "xml" is replaced by a fixed XML declaration
    whose encoding is the %SOUP-ENCODING% placeholder, so that an
    encoding can be substituted later.
    """
    isXmlDeclaration = (text[:3] == "xml")
    if isXmlDeclaration:
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
+
def handle_comment(self, text):
    """Record the text of a comment as a Comment object."""
    nodeClass = Comment
    self._toStringSubclass(text, nodeClass)
+
def handle_charref(self, ref):
    """Handle a numeric character reference as data.

    ref -- the reference body without the surrounding "&#" and ";".

    When self.convertEntities is set, the reference is converted to
    the corresponding Unicode character (int(ref) is applied, so
    `ref` must be a decimal number). Otherwise the reference is
    passed through unchanged in its full "&#...;" form.
    """
    if self.convertEntities:
        data = unichr(int(ref))
    else:
        # Keep the "&#" prefix so the output is still a character
        # reference rather than a bare number.
        data = '&#%s;' % ref
    self.handle_data(data)
+
def handle_entityref(self, ref):
    """Handle a named entity reference as data.

    ref -- the entity name without the surrounding "&" and ";".

    Depending on self.convertHTMLEntities and self.convertXMLEntities
    the reference is converted to the corresponding character;
    references that cannot be converted are passed through as text.
    """
    data = None
    if self.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if (not data and self.convertHTMLEntities
            and not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)):
        # TODO: We've got a problem here. We're told this is an
        # entity reference, but it's neither an XML nor an HTML
        # entity reference. Nonetheless, the logical thing to do is
        # to pass it through as an unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function is
        # called with "carol", and when the input is "AT&T" it is
        # called with "T". We have no way of knowing whether a
        # semicolon was present originally, so we can't tell an
        # unknown entity from a misplaced ampersand.
        #
        # The more common case is a misplaced ampersand, so escape
        # the ampersand and omit the trailing semicolon.
        data = "&amp;%s" % ref
    if not data:
        # Unlike the case above, we have not gone through a
        # supposedly comprehensive mapping of entities to Unicode
        # characters (maybe no mapping at all), so chances are high
        # that this is a real entity and not a misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)
+
def handle_decl(self, data):
    """Record a DOCTYPE or similar declaration as a Declaration."""
    nodeClass = Declaration
    self._toStringSubclass(data, nodeClass)
+
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.

    i -- index into self.rawdata where the declaration starts.

    Returns the index at which parsing should resume.
    """
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        # A CDATA section runs to the next "]]>", or to the end of
        # the input if it is never closed.
        k = self.rawdata.find(']]>', i)
        if k == -1:
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k+3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # The base parser could not parse the declaration: pass
            # the rest of the input through as plain data.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+ """This parser knows the following facts about HTML:
+
+ * Some tags have no closing tag and should be interpreted as being
+ closed as soon as they are encountered.
+
+ * The text inside some tags (ie. 'script') may contain tags which
+ are not really part of the document and which should be parsed
+ as text, not tags. If you want to parse the text as tags, you can
+ always fetch it and parse it explicitly.
+
+ * Tag nesting rules:
+
+ Most tags can't be nested at all. For instance, the occurance of
+ a tag should implicitly close the previous tag. + + Para1 Para2 + should be transformed into: + Para1 Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a tag should _not_ implicitly close the previous +tag. + + Alice said:Bob said:Blah + should NOT be transformed into: + Alice said:Bob said:Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a |