Index: tests/test_tokenizer.py
===================================================================
--- tests/test_tokenizer.py	(revision 438)
+++ tests/test_tokenizer.py	(working copy)
@@ -40,6 +40,10 @@
     def processStartTag(self, token):
         self.outputTokens.append([u"StartTag", token["name"], token["data"]])
 
+    def processEmptyTag(self, token):
+        record = [u"StartTag", token["name"], token["data"]]
+        self.outputTokens.append(record)  # TODO: convert tests to reflect EmptyTags
+
     def processEndTag(self, token):
         self.outputTokens.append([u"EndTag", token["name"]])
 
@@ -76,6 +80,14 @@
             outputTokens.append(token)
     return outputTokens
 
+def normalizeTokens(tokens):
+    """Swap each StartTag's attribute pair list for a dict, in place."""
+    for entry in tokens:  # TODO: convert tests to reflect arrays
+        if entry[0] != 'StartTag':
+            continue
+        entry[2] = dict(entry[2][::-1])
+    return tokens
+
 def tokensMatch(expectedTokens, recievedTokens):
     """Test whether the test has passed or failed
 
@@ -101,7 +113,7 @@
         parser = TokenizerTestParser(test['contentModelFlag'], 
                                      test['lastStartTag'])
             
-        tokens = parser.parse(test['input'])
+        tokens = normalizeTokens(parser.parse(test['input']))
         tokens = concatenateCharacterTokens(tokens)
         errorMsg = "\n".join(["\n\nContent Model Flag:",
                               test['contentModelFlag'] ,
Index: src/tokenizer.py
===================================================================
--- src/tokenizer.py	(revision 438)
+++ src/tokenizer.py	(working copy)
@@ -104,6 +104,10 @@
             self.tokenQueue.append({"type": "ParseError", "data":
               _("Solidus (/) incorrectly placed in tag.")})
 
+        # XML/XHTML enablement hook
+        if self.currentToken["type"] == "StartTag" and data == u">":
+            self.currentToken["type"] = "EmptyTag"
+
         # The character we just consumed need to be put back on the stack so it
         # doesn't get lost...
         self.stream.queue.append(data)
@@ -259,17 +263,10 @@
         # internal usage.
 
         token = self.currentToken
-        # For start tags convert attribute list into a distinct dictionary
-        if token["type"] == "StartTag":
-            # We need to remove the duplicate attributes and convert attributes
-            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
 
-            # AT When Python 2.4 is widespread we should use
-            # dict(reversed(token.data))
-            token["data"] = dict(token["data"][::-1])
         # If an end tag has attributes it's a parse error and they should
         # be removed
-        elif token["type"] == "EndTag" and token["data"]:
+        if token["type"] == "EndTag" and token["data"]:
             self.tokenQueue.append({"type": "ParseError", "data":
               _("End tag contains unexpected attributes.")})
             token["data"] = {}
@@ -349,7 +346,7 @@
                 self.state = self.states["closeTagOpen"]
             elif data in asciiLetters:
                 self.currentToken =\
-                  {"type": "StartTag", "name": data.lower(), "data": []}
+                  {"type": "StartTag", "name": data, "data": []}
                 self.state = self.states["tagName"]
             elif data == u">":
                 # XXX In theory it could be something besides a tag name. But
@@ -405,7 +402,7 @@
             # the stack.
             self.stream.queue.extend(charStack)
 
-            if self.currentToken["name"] == "".join(charStack[:-1]).lower() \
+            if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
               and charStack[-1] in (spaceCharacters |
               frozenset((u">", u"/", u"<", EOF))):
                 # Because the characters are correct we can safely switch to
@@ -426,7 +423,7 @@
             data = self.stream.char()
             if data in asciiLetters:
                 self.currentToken =\
-                  {"type": "EndTag", "name": data.lower(), "data": []}
+                  {"type": "EndTag", "name": data, "data": []}
                 self.state = self.states["tagName"]
             elif data == u">":
                 self.tokenQueue.append({"type": "ParseError", "data":
@@ -449,12 +446,9 @@
         data = self.stream.char()
         if data in spaceCharacters:
             self.state = self.states["beforeAttributeName"]
-        elif data in asciiLowercase:
+        elif data in asciiLetters:
             self.currentToken["name"] += data +\
-              self.stream.charsUntil(asciiLowercase, True)
-        elif data in asciiUppercase:
-            self.currentToken["name"] += data.lower() +\
-              self.stream.charsUntil(asciiLetters, True).lower()
+              self.stream.charsUntil(asciiLetters, True)
         elif data == u">":
             self.emitCurrentToken()
         elif data == u"<" or data == EOF:
@@ -470,8 +464,8 @@
         data = self.stream.char()
         if data in spaceCharacters:
             self.stream.charsUntil(spaceCharacters, True)
-        elif data in asciiUppercase:
-            self.currentToken["data"].append([data.lower(), ""])
+        elif data in asciiLetters:
+            self.currentToken["data"].append([data, ""])
             self.state = self.states["attributeName"]
         elif data == u">":
             self.emitCurrentToken()
@@ -489,14 +483,10 @@
         leavingThisState = True
         if data == u"=":
             self.state = self.states["beforeAttributeValue"]
-        elif data in asciiLowercase:
+        elif data in asciiLetters:
             self.currentToken["data"][-1][0] += data +\
-              self.stream.charsUntil(asciiLowercase, True)
+              self.stream.charsUntil(asciiLetters, True)
             leavingThisState = False
-        elif data in asciiUppercase:
-            self.currentToken["data"][-1][0] += data.lower() +\
-              self.stream.charsUntil(asciiLetters, True).lower()
-            leavingThisState = False
         elif data == u">":
             # XXX If we emit here the attributes are converted to a dict
             # without being checked and when the code below runs we error
@@ -535,8 +525,8 @@
             self.state = self.states["beforeAttributeValue"]
         elif data == u">":
             self.emitCurrentToken()
-        elif data in asciiUppercase:
-            self.currentToken["data"].append([data.lower(), ""])
+        elif data in asciiLetters:
+            self.currentToken["data"].append([data, ""])
             self.state = self.states["attributeName"]
         elif data == u"/":
             self.processSolidusInTag()
Index: src/constants.py
===================================================================
--- src/constants.py	(revision 438)
+++ src/constants.py	(working copy)
@@ -129,6 +129,8 @@
 digits = frozenset(string.digits)
 hexDigits = frozenset(string.hexdigits)
 
+asciiLower = dict(zip(map(ord, string.ascii_uppercase), map(ord, string.ascii_lowercase)))  # A-Z -> a-z ordinals
+
 # Heading elements need to be ordered 
 headingElements = (
     "h1",
Index: src/html5parser.py
===================================================================
--- src/html5parser.py	(revision 438)
+++ src/html5parser.py	(working copy)
@@ -27,7 +27,7 @@
 from treebuilders import simpletree
 
 import utils
-from constants import contentModelFlags, spaceCharacters
+from constants import contentModelFlags, spaceCharacters, asciiLower
 from constants import scopingElements, formattingElements, specialElements
 from constants import headingElements, tableInsertModeElements
 
@@ -96,6 +96,7 @@
         # XXX This is temporary for the moment so there isn't any other
         # changes needed for the parser to work with the iterable tokenizer
         for token in self.tokenizer:
+            token = self.normalizeToken(token)
             type = token["type"]
             method = getattr(self.phase, "process%s" % type, None)
             if type in ("Characters", "SpaceCharacters", "Comment"):
@@ -124,6 +125,28 @@
         """This error is not an error"""
         pass
 
+    def normalizeToken(self, token):
+        """Apply HTML5-specific normalizations to one tokenizer token.
+
+        "EmptyTag" becomes "StartTag"; tag and attribute names have ASCII
+        uppercase letters folded to lowercase; start tag attributes become
+        a dict.  The token is modified in place and returned.
+        """
+        if token["type"] == "EmptyTag":
+            token["type"] = "StartTag"
+
+        if token["type"] == "StartTag":
+            token["name"] = token["name"].translate(asciiLower)
+            # Reversing first means that, for duplicate attributes, the first
+            # one wins: [["x", "y"], ["x", "z"]] becomes {"x": "y"}
+            # AT When Python 2.4 is widespread we should use reversed()
+            token["data"] = dict([(attr.translate(asciiLower), value)
+                for attr,value in token["data"][::-1]])
+        elif token["type"] == "EndTag":
+            # Fold ASCII only, exactly as for start tags, so names agree.
+            token["name"] = token["name"].translate(asciiLower)
+        return token
+
     #XXX - almost everthing after this point should be moved into a
     #seperate treebuilder object