[cpif] r208 - trunk/frontend-web

Sat Jun 30 22:30:04 CEST 2007

Author: alvaro
Date: Sat Jun 30 22:30:03 2007
New Revision: 208

Log:
Nuevo parser, que debería solucionar todos los problemas anteriores.



Modified:
   trunk/frontend-web/parser_bbcode.py
   trunk/frontend-web/parser_html.py
   trunk/frontend-web/parsers.py

Modified: trunk/frontend-web/parser_bbcode.py
==============================================================================

--- trunk/frontend-web/parser_bbcode.py	(original)
+++ trunk/frontend-web/parser_bbcode.py	Sat Jun 30 22:30:03 2007
@@ -29,7 +29,21 @@
     "img": ('<img src="%(arg)s" />', None, True)
 }
 
-from parser_html import escape, unescape
+def escape(text):
+  """Escape the conflicting characters into their entities. Please note that
+the "&" character must be translated into "&amp;" in URLs, as stated
+in http://www.htmlhelp.com/tools/validator/problems.html#amp"""
+
+  entities = {
+      '"': '&quot;', 
+      "'": "&apos;"
+      }
+# Si hay que hacer otras sustituciones este codigo puede ser util
+#  import re
+#  pat = "(%s)" % "|".join( map(re.escape, entities.keys())  )
+#  return re.sub( pat, lambda m:entities[m.group()], text )
+  import xml.sax.saxutils
+  return xml.sax.saxutils.escape(text, entities)
 
 
 class BBCodeParser:
@@ -40,14 +54,15 @@
   _regexp_start = "\%s.*?\%s" % (_open, _close)
   _regexp_end = "\%s(.*?)\%s" % (_open, _close)
 
-  def __init__(self, allow = True):
-    self._allow = allow
+  def __init__(self):
+    self._status = ""
     self._text = ""
     self._tokens = []
     self._tags = []
     self._parsed = []
   def _reset(self):
     """Resets the internal arrays."""
+    self._status = ""
     self._tokens = []
     self._tags = []
     self._parsed = []
@@ -55,6 +70,15 @@
   def feed(self, text):
     """Sets the text to parse"""
     self._text = text
+    self._reset()
+    self._tokenize()
+    self._status =  self._parse()
+
+  def get_text(self):
+    return self._status, "".join([text for text, dummy in self._parsed])
+
+  def get_tokens(self):
+    return self._status, self._parsed
 
   def _tokenize(self):
     """Tokenize the string and the tags in two separates lists
@@ -72,7 +96,9 @@
     aux = allowed_tags[tag[0]]
     if not tag[1]:
       aux = allowed_tags.get(tag[0] + "*", aux) # Permitimos tag* que no lleva argumentos y puede ser distinto
-    tag[1] = unescape(tag[1].strip('"')) if tag[1] else ""
+    if tag[1]:
+      tag[1] = tag[1].strip('"')
+      tag[1] = tag[1].strip("'")
     return aux[0] % {"arg": escape(tag[1]) if tag[1] else '' }
 
   def _tag_closes(self, tag):
@@ -91,25 +117,21 @@
     """Do the parsing.
     Should be called after _reset, and _tokenize."""
     stack = []
-    self._parsed.append(self._tokens.pop(0))
+    self._parsed.append((True, escape(self._tokens.pop(0))))
     while self._tags:
       tag = self._tags.pop(0)
-      if not self._allow:
-        self._parsed.append("%s%s%s"%(self._open, escape(tag), self._close))
-        self._parsed.append(self._tokens.pop(0))
-        continue
       tag = tag.split("=", 1)
       tag[0] = tag[0].strip()
       if tag[0].startswith("/") and tag[0][1:] in allowed_tags.keys() and stack:
         if tag[0][1:] == stack[-1]:
-          self._parsed.append(self._close_tag_to_html(stack.pop(-1)))
+          self._parsed.append((False, self._close_tag_to_html(stack.pop(-1))))
         elif not allow_errors: 
           return "Error, tratando de cerrar %s%s%s no abierto antes" % (self._open, tag[0], self._close)
         else: # Si alguien activa allow_errors mereceria la mas lenta y dolorosa muerte
               # Esto es una chapuza para usuarios estupidos y chapuceros
-          self._parsed.append("%s%s%s" % (self._open, tag[0], self._close))
+          self._parsed.append((False, "%s%s%s" % (self._open, tag[0], self._close)))
         if self._tokens and self._tag_closes(tag):
-          self._parsed.append(self._tokens.pop(0))
+          self._parsed.append((True, escape(self._tokens.pop(0))))
 
       elif tag[0] in allowed_tags.keys():
         tag.append(None)
@@ -118,39 +140,29 @@
           tag[1] = self._tokens[0]
           if not allow_errors and not tag[1]:
             return "Error, no se ha pasado el argumento a %s%s%s" % (self._open, tag[0], self._close)
-        self._parsed.append(self._tag_to_html(tag))
+        self._parsed.append((False, self._tag_to_html(tag)))
         if not self._tag_closes(tag) and self._tokens:
           self._tokens.pop(0)
         if self._tokens: 
-          self._parsed.append(self._tokens.pop(0))
+          self._parsed.append((True, escape(self._tokens.pop(0))))
 
       else: # Si no conozco el tag se lo paso tal cual
-        self._parsed.append("%s%s%s" % (self._open, tag[0], self._close))
+        self._parsed.append((True, "%s%s%s" % (self._open, tag[0], self._close)))
         if self._tokens:
-          self._parsed.append(self._tokens.pop(0))
+          self._parsed.append((True, escape(self._tokens.pop(0))))
     
     if stack and allow_errors:
       while stack: # No me responsabilizo de lo que salga aqui...
-        self._parsed.append(self._close_tag_to_html(stack.pop(-1)))
+        self._parsed.append((False, self._close_tag_to_html(stack.pop(-1))))
     elif stack:
       return "Error, falta cerrar el tag %s%s%s" % (self._open, stack[-1], self._close)
     return False
 
-  def parse(self):
-    """Parses the string"""
-    self._reset()
-    self._tokenize()
-    e =  self._parse()
-    if not e:
-      return e, "".join(self._parsed)
-    return e or "Error, HTML no valido", None
-
-def parse(text, allow = True):
-  """Parses the text w/ BBCode into HTML. The allow argument is optional and
-tells the parser if the BBCode should be translated or no."""
-  parser = BBCodeParser(allow)
+def parse(text):
+  """Parses the text w/ BBCode into HTML."""
+  parser = BBCodeParser()
   parser.feed(text)
-  return parser.parse()
+  return parser.get_tokens()
 
 import unittest
 class TestBBCode(unittest.TestCase):
@@ -235,62 +247,23 @@
     html_list = []
     global allow_errors
     allow_errors = False
-#    print
     for bbcode, html in self.correct_bbcode:
       bbcode_list.append(bbcode)
       html_list.append(html)
       retval, result = parse(bbcode)
-#      print "-->", bbcode
-#      print "<--", html
-#      print "<--", result
-#      print
-      assert not retval
-      self.assertEqual(html, result)
-      retval, result = parse(bbcode)
       assert not retval
-      self.assertEqual(html, result)
+      self.assertEqual(html, "".join(text for dummy, text in result))
     retval, result = parse("".join(bbcode_list))
     assert not retval
-    self.assertEqual("".join(html_list), result)
+    self.assertEqual("".join(html_list), "".join(text for dummy, text in result))
   
-  def test_NoBBCode(self):
-    """Test if the translation goes well in the case that we do not allow BBCode."""
-    list = []
-    global allow_errors
-    allow_errors = False
-    import globales
-    globales.allow_bbcode = False
-    globales.allow_html = True
-    for code, dummy in self.correct_bbcode:
-      list.append(code)
-      retval, result = parse(code, False)
-      assert not retval
-      self.assertEqual(escape(code), result)
-    retval, result = parse("".join(list), False)
-    assert not retval
-    self.assertEqual(escape("".join(list)), result)
-
   def testIncorrectBBCode(self):
     """Test if the translation fails using bad formed BBCode."""
     global allow_errors
     allow_errors = False
     for bbcode in self.incorrect_bbcode:
       retval, result = parse(bbcode)
-#      print "-->", bbcode
-#      print "<--", retval, result
       assert retval
-      retval, result = parse(bbcode)
-      assert retval
-
-  def testIncorrectXSS(self):
-    """Test if the code is XSS safe. Uncomment the print statents"""
-    global allow_errors
-    allow_errors = False
-    for xss in self.xss:
-      retval, result = parse(xss)
-#      print "-->", xss
-#      print "<--", retval, result
-#      assert retval
 
 if __name__ == "__main__":
   unittest.main()

Modified: trunk/frontend-web/parser_html.py
==============================================================================
--- trunk/frontend-web/parser_html.py	(original)
+++ trunk/frontend-web/parser_html.py	Sat Jun 30 22:30:03 2007
@@ -12,6 +12,9 @@
 }
 
 def escape(text):
+  """Escape the conflicting characters into their entities. Please note that
+the "&" character must be translated into "&amp;" in URLs, as stated
+in http://www.htmlhelp.com/tools/validator/problems.html#amp"""
   entities = {
 #      '<':'&lt;', 
 #      '>':'&gt;', 
@@ -26,32 +29,17 @@
   import xml.sax.saxutils
   return xml.sax.saxutils.escape(text, entities)
 
-def unescape(text):
-  entities = {
-#      '<':'&lt;', 
-#      '>':'&gt;', 
-      '"': '&quot;', 
-      "'": "&apos;"
-#      "&": "&amp;"
-      }
-  import xml.sax.saxutils
-  return xml.sax.saxutils.unescape(text, entities)
-
 from sgmllib import SGMLParser, SGMLParseError
 class HTMLParser(SGMLParser):
-  def __init__(self, allow=True):
-    self.allow = allow
+  def __init__(self):
     SGMLParser.__init__(self)
   def reset(self):                       
-    self.text = []
+    self.entity = False
+    self.tokens = []
     self._stack = []
     SGMLParser.reset(self)
 
   def do_img(self, attrs):
-    if not self.allow:
-      strattrs = "".join([' %s="%s"' % (a, v) for a, v in attrs])
-      self.text.append(escape("<img%(strattrs)s />" % locals()))
-      return
     if not attrs: raise SGMLParseError, "Error, argumentos para <img> no validos"
     attrs_list = []
     for a, v in attrs:
@@ -60,81 +48,109 @@
       else:
         raise SGMLParseError, "Error, argumentos para <img> no validos"
     strattrs = "".join(attrs_list)
-    self.text.append("<img%(strattrs)s />" % locals())
+    text = "<img%(strattrs)s />" % locals()
+    self.tokens.append((False, text))
 
   def end_img(self):
     pass
 
   def do_br(self):
-    if not self.allow:
-      self.text.append(escape("<br />" % locals()))
-      return
-    self.text.append("<br />")
+    self.tokens.append((False, "<br />"))
 
   def end_br(self):
     pass
 
   def unknown_starttag(self, tag, attrs):
-    if tag in allowed_html.keys() and self.allow:
+    if tag in allowed_html.keys():
       attrs_list = []
       for a, v in attrs:
-        if a in allowed_html[tag] and v and self.allow:
+        if a in allowed_html[tag] and v:
           attrs_list.append(' %s="%s"' % (a, escape(v)))
         else:
           raise SGMLParseError, "Error, argumentos para <%s> no validos" % tag
       strattrs = "".join(attrs_list)
-      self.text.append("<%(tag)s%(strattrs)s>" % locals())
+      text = "<%(tag)s%(strattrs)s>" % locals()
+      self.tokens.append((False, text))
       self._stack.append(tag)
     else:
       strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
-      self.text.append(escape("<%(tag)s%(strattrs)s>" % locals()))
+      text = escape("<%(tag)s%(strattrs)s>" % locals())
+      self.tokens.append((False, text)) # XXX
 
   def unknown_endtag(self, tag):         
-    if tag in allowed_html.keys() and self.allow:
-      self.text.append("</%(tag)s>" % locals())
+    if tag in allowed_html.keys():
+      text = "</%(tag)s>" % locals()
+      self.tokens.append((False, text))
       if self._stack and tag == self._stack[-1]:
         self._stack.pop(-1)
       else:
         raise SGMLParseError, "Error, tag <%s> sin abrir" % tag
     else:
-      self.text.append(escape("</%(tag)s>" % locals()))
+      text = escape("</%(tag)s>" % locals())
+      self.tokens.append((False, text)) # XXX
 
   def handle_charref(self, ref):         
-    self.text.append("&amp;#%(ref)s;" % locals())
-
-  def handle_entityref(self, ref):      
-    self.text.append("&amp;%(ref)s" % locals())
+    self.entity = True
+    if not self.tokens: self.tokens.append((True, ""))
+    text = "%s&#%s;" % (self.tokens[-1][1], ref)
+    old_status =  self.tokens[-1][0]
+    self.tokens[-1] = (old_status, text) # FIXME
+
+  def handle_entityref(self, ref): # FIXME
+    self.entity = True
+    if not self.tokens: self.tokens.append((True, ""))
+    text = "%s&%s" % (self.tokens[-1][1], ref)
     import htmlentitydefs
     if htmlentitydefs.entitydefs.has_key(ref):
-      self.text.append(";")
+      text+=";"
+    old_status =  self.tokens[-1][0]
+    self.tokens[-1] = (old_status, text) # FIXME
 
   def handle_data(self, text):           
     import xml.sax.saxutils
-    self.text.append(xml.sax.saxutils.escape(text))
+    if self.entity:
+      self.entity = False
+      if not self.tokens: self.tokens.append((True, ""))
+      old_status =  self.tokens[-1][0]
+      text = "%s%s" % (self.tokens[-1][1], text)
+      self.tokens[-1] = (old_status, text)
+    elif text.startswith("&"):
+      self.entity = True
+      if not self.tokens: self.tokens.append((True, ""))
+      old_status =  self.tokens[-1][0]
+      text = "%s%s" % (self.tokens[-1][1], text)
+      self.tokens[-1] = (old_status, text)
+    else:
+      self.tokens.append((True, text))
 
   def get_text(self):              
     """Return processed HTML as a single string"""
     if self._stack: 
       raise SGMLParseError, "Error, tag <%s> sin cerrar" % self._stack.pop(-1)
-    return "".join(self.text)
+    return "".join([text for dummy, text in self.tokens])
 
-def parse(text, allow = True):
-  parser = HTMLParser(allow)
+  def get_tokens(self):
+    if self._stack:
+      raise SGMLParseError, "Error, tag <%s> sin cerrar" % self._stack.pop(-1)
+    return self.tokens
+
+
+def parse(text):
+  parser = HTMLParser()
   parser.reset()
   try:
     parser.feed(text)
-    text = parser.get_text()
-    if not text: raise SGMLParseError, "HTML no valido"
+    tokens = parser.get_tokens()
+    if not tokens: raise SGMLParseError, "HTML no valido"
   except SGMLParseError, e:
     return (e or "HTML no valido", None)
-  return False, text
+  return False, tokens
 
 import unittest
 class TestHTML(unittest.TestCase):
   correct_html = (
-      ('&amp;', '&amp;amp;'), 
       ('<b>caca</b>', '<b>caca</b>'), 
-      ('b>', 'b&gt;'), 
+      ('b>', 'b>'), 
       ('<script>', '&lt;script&gt;'), 
       ('<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla</a>', 
        '<a href="http://blablabla.com/index.php?caca=1&amp;mierda=2">bla bla</a>'), 
@@ -145,21 +161,10 @@
       ('<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca">', 
        '<img src="http://blablabla.com/index.php?caca=1&amp;mierda=2" alt="caca" />'), 
       ('<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca"></img>', 
-       '<img src="http://blablabla.com/index.php?caca=1&amp;mierda=2" alt="caca" />')
+       '<img src="http://blablabla.com/index.php?caca=1&amp;mierda=2" alt="caca" />'),
+      ('&amp;', '&amp;'), 
   )
   
-  correct_html2 = (
-      '&amp;', 
-      '<b>caca</b>', 
-      'b>', 
-      '<script>', 
-      '<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla</a>', 
-      '<a href="http://blablabla.com/index.php?caca=1&mierda=2" title="caca">bla bla </a>', 
-      '<img src="http://blablabla.com/index.php?caca=1&mierda=2" />', 
-      '<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />', 
-      '<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />', 
-  )
-
   incorrect_html = (
       '<b>', 
       '<b', 
@@ -186,19 +191,6 @@
       '<IMG """><SCRIPT>alert("XSS")</SCRIPT>">', 
 )
 
-  def test_NoBBHTML(self):
-    """Checks if the HTML is escaped when it is not allowed."""
-    html_list = []
-    import globales
-    for code in self.correct_html2:
-      html_list.append(code)
-      retval, result = parse(code, False)
-      assert not retval
-      self.assertEqual(escape(code), result)
-    retval, result = parse("".join(html_list), False)
-    assert not retval
-    self.assertEqual(escape("".join(html_list)), result)
-
   def testCorrectHTML(self):
     """Checks if the conversion goes well."""
     html_list = []
@@ -208,13 +200,11 @@
       html_list.append(html)
       parsed_html_list.append(parsed_html)
       assert not retval
-      self.assertEqual(parsed_html, result)
-      retval, result = parse(html)
-      assert not retval
-      self.assertEqual(parsed_html, result)
+      self.assertEqual(parsed_html, "".join(text for dummy, text in result))
+
     retval, result = parse("".join(html_list))
     assert not retval
-    self.assertEqual("".join(parsed_html_list), result)
+    self.assertEqual("".join(parsed_html_list), "".join(text for dummy, text in result))
 
   def testIncorrectHTML(self):
     """Checks if the conversion fails w/ bad formed HTML."""
@@ -229,7 +219,7 @@
     for xss in self.xss:
       retval, result = parse(xss)
 #      print "-->", xss
-#      print "<--", retval, result
+#      print "<--", retval, "".join(text for dummy, text in result) if result else ""
 #      assert retval
 
 if __name__ == "__main__":

Modified: trunk/frontend-web/parsers.py
==============================================================================
--- trunk/frontend-web/parsers.py	(original)
+++ trunk/frontend-web/parsers.py	Sat Jun 30 22:30:03 2007
@@ -7,16 +7,34 @@
 def convert_to_html(text):
   """Converts the text into valid HTML"""
   from globales import allow_bbcode, allow_html
+  
+  tokens = [(True, text)]
 
-  retval, text = parser_html.parse(text, allow_html)
-  if retval:
-    return retval, None
+  if allow_html:
+    aux = []
+    for token in tokens:
+      if token[0]:
+        retval, token = parser_html.parse(token[1])
+        if retval:
+          return retval, None
+        aux += token
+      else:
+        aux.append(token)
+    tokens = aux[:]
 
-  retval,text=parser_bbcode.parse(text, allow_bbcode)
-  if retval :
-    return retval,None
+  if allow_bbcode:
+    aux = []
+    for token in tokens:
+      if token[0]:
+        retval, token=parser_bbcode.parse(token[1])
+        if retval:
+          return retval, None
+        aux += token
+      else:
+        aux.append(token)
+    tokens = aux[:]
 
-  return parser_eol.parse(text)
+  return parser_eol.parse("".join([text for dummy, text in tokens]))
 
 import unittest
 class TestBBCodeHTML(unittest.TestCase):
@@ -25,9 +43,8 @@
       ('[url="http://example.org/uno&dos"]el texto[/url]', '<a href="http://example.org/uno&amp;dos">el texto</a>'), 
       ('[url="http://example.org/uno&dos"]el & texto[/url]', '<a href="http://example.org/uno&amp;dos">el &amp; texto</a>'), 
       ('[url="http://example.org/uno&dos"]el &amp; texto[/url]', '<a href="http://example.org/uno&amp;dos">el &amp;amp; texto</a>'), 
-      ('[url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://example.org/uno&amp;dos">el "" texto</a>'), 
+      ('[url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://example.org/uno&amp;dos">el &quot;&quot; texto</a>'), 
 )
-
 
   def testCorrectBBCode(self):
     """Test if the translation goes well."""