Index: tests/test_tokenizer.py =================================================================== --- tests/test_tokenizer.py (revision 438) +++ tests/test_tokenizer.py (working copy) @@ -40,6 +40,10 @@ def processStartTag(self, token): self.outputTokens.append([u"StartTag", token["name"], token["data"]]) + def processEmptyTag(self, token): + # TODO: convert tests to reflect EmptyTags + self.outputTokens.append([u"StartTag", token["name"], token["data"]]) + def processEndTag(self, token): self.outputTokens.append([u"EndTag", token["name"]]) @@ -76,6 +80,14 @@ outputTokens.append(token) return outputTokens +def normalizeTokens(tokens): + """ convert array of attributes to a dictionary """ + # TODO: convert tests to reflect arrays + for token in tokens: + if token[0] == 'StartTag': + token[2] = dict(token[2][::-1]) + return tokens + def tokensMatch(expectedTokens, recievedTokens): """Test whether the test has passed or failed @@ -101,7 +113,7 @@ parser = TokenizerTestParser(test['contentModelFlag'], test['lastStartTag']) - tokens = parser.parse(test['input']) + tokens = normalizeTokens(parser.parse(test['input'])) tokens = concatenateCharacterTokens(tokens) errorMsg = "\n".join(["\n\nContent Model Flag:", test['contentModelFlag'] , Index: src/tokenizer.py =================================================================== --- src/tokenizer.py (revision 438) +++ src/tokenizer.py (working copy) @@ -104,6 +104,10 @@ self.tokenQueue.append({"type": "ParseError", "data": _("Solidus (/) incorrectly placed in tag.")}) + # XML/XHTML enablement hook + if self.currentToken["type"] == "StartTag" and data == u">": + self.currentToken["type"] = "EmptyTag" + # The character we just consumed need to be put back on the stack so it # doesn't get lost... self.stream.queue.append(data) @@ -259,17 +263,10 @@ # internal usage. 
token = self.currentToken - # For start tags convert attribute list into a distinct dictionary - if token["type"] == "StartTag": - # We need to remove the duplicate attributes and convert attributes - # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} - # AT When Python 2.4 is widespread we should use - # dict(reversed(token.data)) - token["data"] = dict(token["data"][::-1]) # If an end tag has attributes it's a parse error and they should # be removed - elif token["type"] == "EndTag" and token["data"]: + if token["type"] == "EndTag" and token["data"]: self.tokenQueue.append({"type": "ParseError", "data": _("End tag contains unexpected attributes.")}) token["data"] = {} @@ -349,7 +346,7 @@ self.state = self.states["closeTagOpen"] elif data in asciiLetters: self.currentToken =\ - {"type": "StartTag", "name": data.lower(), "data": []} + {"type": "StartTag", "name": data, "data": []} self.state = self.states["tagName"] elif data == u">": # XXX In theory it could be something besides a tag name. But @@ -405,7 +402,7 @@ # the stack. 
self.stream.queue.extend(charStack) - if self.currentToken["name"] == "".join(charStack[:-1]).lower() \ + if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \ and charStack[-1] in (spaceCharacters | frozenset((u">", u"/", u"<", EOF))): # Because the characters are correct we can safely switch to @@ -426,7 +423,7 @@ data = self.stream.char() if data in asciiLetters: self.currentToken =\ - {"type": "EndTag", "name": data.lower(), "data": []} + {"type": "EndTag", "name": data, "data": []} self.state = self.states["tagName"] elif data == u">": self.tokenQueue.append({"type": "ParseError", "data": @@ -449,12 +446,9 @@ data = self.stream.char() if data in spaceCharacters: self.state = self.states["beforeAttributeName"] - elif data in asciiLowercase: + elif data in asciiLetters: self.currentToken["name"] += data +\ - self.stream.charsUntil(asciiLowercase, True) - elif data in asciiUppercase: - self.currentToken["name"] += data.lower() +\ - self.stream.charsUntil(asciiLetters, True).lower() + self.stream.charsUntil(asciiLetters, True) elif data == u">": self.emitCurrentToken() elif data == u"<" or data == EOF: @@ -470,8 +464,8 @@ data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) - elif data in asciiUppercase: - self.currentToken["data"].append([data.lower(), ""]) + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) self.state = self.states["attributeName"] elif data == u">": self.emitCurrentToken() @@ -489,14 +483,10 @@ leavingThisState = True if data == u"=": self.state = self.states["beforeAttributeValue"] - elif data in asciiLowercase: + elif data in asciiLetters: self.currentToken["data"][-1][0] += data +\ - self.stream.charsUntil(asciiLowercase, True) + self.stream.charsUntil(asciiLetters, True) leavingThisState = False - elif data in asciiUppercase: - self.currentToken["data"][-1][0] += data.lower() +\ - self.stream.charsUntil(asciiLetters, True).lower() - leavingThisState = 
False elif data == u">": # XXX If we emit here the attributes are converted to a dict # without being checked and when the code below runs we error @@ -535,8 +525,8 @@ self.state = self.states["beforeAttributeValue"] elif data == u">": self.emitCurrentToken() - elif data in asciiUppercase: - self.currentToken["data"].append([data.lower(), ""]) + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) self.state = self.states["attributeName"] elif data == u"/": self.processSolidusInTag() Index: src/constants.py =================================================================== --- src/constants.py (revision 438) +++ src/constants.py (working copy) @@ -129,6 +129,8 @@ digits = frozenset(string.digits) hexDigits = frozenset(string.hexdigits) +asciiLower = dict([(ord(c),ord(c.lower())) for c in string.ascii_uppercase]) + # Heading elements need to be ordered headingElements = ( "h1", Index: src/html5parser.py =================================================================== --- src/html5parser.py (revision 438) +++ src/html5parser.py (working copy) @@ -27,7 +27,7 @@ from treebuilders import simpletree import utils -from constants import contentModelFlags, spaceCharacters +from constants import contentModelFlags, spaceCharacters, asciiLower from constants import scopingElements, formattingElements, specialElements from constants import headingElements, tableInsertModeElements @@ -96,6 +96,7 @@ # XXX This is temporary for the moment so there isn't any other # changes needed for the parser to work with the iterable tokenizer for token in self.tokenizer: + token = self.normalizeToken(token) type = token["type"] method = getattr(self.phase, "process%s" % type, None) if type in ("Characters", "SpaceCharacters", "Comment"): @@ -124,6 +125,30 @@ """This error is not an error""" pass + def normalizeToken(self, token): + """ HTML5 specific normalizations to the token stream """ + + if token["type"] == "EmptyTag": + token["type"] = "StartTag" + + if 
token["type"] == "StartTag": + token["name"] = token["name"].translate(asciiLower) + + # We need to remove the duplicate attributes and convert attributes + # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} + + # AT When Python 2.4 is widespread we should use + # dict(reversed(token.data)) + token["data"] = dict([(attr.translate(asciiLower), value) + for attr,value in token["data"][::-1]]) + + elif token["type"] == "EndTag": + # Use the same ASCII-only mapping as the StartTag branch so that start + # and end tag names are normalized identically + token["name"] = token["name"].translate(asciiLower) + + return token + #XXX - almost everthing after this point should be moved into a #seperate treebuilder object