[cpif] r193 - trunk/frontend-web

Fri Jun 29 00:26:06 CEST 2007

Author: alvaro
Date: Fri Jun 29 00:26:04 2007
New Revision: 193

Log:
Primera versión de un parser BBCode -> HTML

Para utilizarlo se debe instanciar y pasarle un texto
con el método feed(texto). Posteriormente, con parse(),
se realiza el parseo. El método parse() devuelve dos valores:
el primero es el código de retorno. Si es cero, todo ha ido
bien; si es distinto de cero, indica el error. El segundo valor
es el texto parseado.

Se incluyen unittests.


Added:
   trunk/frontend-web/parser_bbcode.py   (contents, props changed)

Added: trunk/frontend-web/parser_bbcode.py
==============================================================================

--- (empty file)
+++ trunk/frontend-web/parser_bbcode.py	Fri Jun 29 00:26:04 2007
@@ -0,0 +1,425 @@
+#!/bin/python2.5
+
+# $Id$
+
+# Allow BBCode syntax errors? When True, _parse() emits mismatched closing tags
+# verbatim and closes forgotten tags itself instead of returning an error.
+allow_errors = False
+
+allowed_html = {  # raw HTML tag kept by the pre-parser -> attributes it may carry
+    "a": ["href", "title"],
+    "p": [],
+    "b": [],
+    "s": [],
+    "i": [],
+    "em": [],
+    "strong": [],
+    "img": ["src", "alt", "title"]
+}
+
+allowed_tags = {  # BBCode tag -> (opening HTML, closing HTML or None, takes an argument?)
+    "b": ('<strong>', '</strong>', False),
+    "u": ('<span style="text-decoration:underline">', '</span>', False),
+    "s": ('<strike>', '</strike>', False),  # the closing tag must match the <strike> it opens
+    "size": ('<span style="font-size:%(arg)s">', '</span>', True),
+    "i": ('<em>', '</em>', False),
+    "url": ('<a href="%(arg)s">', '</a>', True),
+    "quote*": ('<blockquote>', '</blockquote>', False),  # "name*": variant used when the tag has no argument
+    "quote": ('<blockquote><h4>%(arg)s escribi&oacute;:</h4>', '</blockquote>', True),
+    "img": ('<img src="%(arg)s" />', None, True)  # None: nothing is emitted for [/img]
+}
+
+class BBCodeParser:
+  """BBCode-to-HTML translator: feed(text), then parse() -> (error, html)."""
+
+  _open = "["   # delimiter that starts a BBCode tag
+  _close = "]"  # delimiter that ends a BBCode tag
+
+  def __init__(self):
+    self._text = ""       # raw input stored by feed()
+    self._preparsed = ""  # input after the HTML whitelist/escape pass
+    self._tokens = []     # text chunks found between BBCode tags
+    self._tags = []       # contents of each [...] tag, in order
+    self._parsed = []     # output fragments, joined by parse()
+
+  def _reset(self):
+    """Clear the results of a previous parse(); the fed text is kept."""
+    self._preparsed = ""
+    self._tokens = []
+    self._tags = []
+    self._parsed = []
+
+  def feed(self, text):
+    """Store `text` as the input of the next parse() call."""
+    self._text = text
+
+  def _tokenize(self):
+    """Split the pre-parsed text into plain-text tokens and BBCode tag bodies.
+    Should be called after _preparse but before _parse."""
+    import re
+    self._tokens = re.split(r"\%s.*?\%s" % (self._open, self._close), self._preparsed)
+    self._tags = re.findall(r"\%s(.*?)\%s" % (self._open, self._close), self._preparsed)
+
+  def _close_tag_to_html(self, tag):
+    """Return the closing HTML of BBCode tag name `tag` ("" if it has none)."""
+    return allowed_tags[tag][1] or ""
+
+  def _tag_to_html(self, tag):
+    """Return the opening HTML for `tag`, a [name, argument-or-None] list."""
+    aux = allowed_tags[tag[0]]
+    if not tag[1]:
+      aux = allowed_tags.get(tag[0] + "*", aux) # A "tag*" entry is the argument-less variant; its HTML may differ
+    return aux[0] % {"arg": tag[1].strip('"').replace('"', '&quot;') if tag[1] else ''} # Inner quotes are escaped so the argument cannot leave its HTML attribute
+
+  def _tag_closes(self, tag):
+    """Return the closing HTML (falsy if none) of `tag`, a list starting with "name" or "/name"."""
+    if tag[0].startswith("/"):
+      return allowed_tags[tag[0][1:]][1]
+    return allowed_tags[tag[0]][1]
+
+  def _tag_needs_args(self, tag):
+    """Return whether tag name `tag` requires an argument; False when a "tag*" variant exists."""
+    if allowed_tags.get(tag + "*", False):
+      return False
+    return allowed_tags[tag][2]
+
+  def _preparse(self):
+    """Filter the fed text: keep the allowed HTML tags and escape dangerous sequences.
+    Should be called before _parse. Returns the SGMLParseError raised, or None if all went well."""
+
+    import sgmllib
+    class PreParseHTML(sgmllib.SGMLParser):
+      """Re-emits the input keeping only allowed_html tags; other markup is escaped."""
+      def reset(self):  # SGMLParser.__init__ calls reset(), so both lists always exist
+        self.text = []    # output fragments
+        self._stack = []  # allowed tags opened and not yet closed
+        sgmllib.SGMLParser.reset(self)
+
+      def escape(self, text):
+        entities = {'"': '&quot;', "'": "&apos;"}
+        import xml.sax.saxutils
+        return xml.sax.saxutils.escape(text, entities)
+
+      def do_img(self, attrs):
+        if not attrs: raise sgmllib.SGMLParseError, "Error, argumentos para <img> no validos"
+        attrs_list = []
+        for a, v in attrs:
+          if a in allowed_html["img"] and v:
+            attrs_list.append(' %s="%s"' % (a, self.escape(v)))
+          else:
+            raise sgmllib.SGMLParseError, "Error, argumentos para <img> no validos"
+        strattrs = "".join(attrs_list)
+        self.text.append("<img%(strattrs)s />" % locals())
+
+      def end_img(self):
+        pass
+
+      def do_br(self, attrs):  # sgmllib passes the attribute list to every do_<tag> handler
+        self.text.append("<br />")
+
+      def end_br(self):
+        pass
+
+      def unknown_starttag(self, tag, attrs):
+        if tag in allowed_html:
+          attrs_list = []
+          for a, v in attrs:
+            if a in allowed_html[tag] and v:
+              attrs_list.append(' %s="%s"' % (a, self.escape(v)))
+            else:
+              raise sgmllib.SGMLParseError, "Error, argumentos para <%s> no validos" % tag
+          strattrs = "".join(attrs_list)
+          self.text.append("<%(tag)s%(strattrs)s>" % locals())
+          self._stack.append(tag)
+        else:
+          strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
+          self.text.append(self.escape("<%(tag)s%(strattrs)s>" % locals()))
+
+      def unknown_endtag(self, tag):
+        if tag in allowed_html:
+          self.text.append("</%(tag)s>" % locals())
+          if self._stack and tag == self._stack[-1]:
+            self._stack.pop(-1)
+          else:
+            raise sgmllib.SGMLParseError, "Error, tag <%s> sin abrir" % tag
+        else:
+          self.text.append(self.escape("</%(tag)s>" % locals()))
+
+      def handle_charref(self, ref):
+        self.text.append("&amp;#%(ref)s;" % locals())
+
+      def handle_entityref(self, ref):
+        self.text.append("&amp;%(ref)s" % locals())
+        import htmlentitydefs
+        if ref in htmlentitydefs.entitydefs:
+          self.text.append(";")
+
+      def handle_data(self, text):
+        import xml.sax.saxutils
+        self.text.append(xml.sax.saxutils.escape(text))
+
+      def get_text(self):
+        """Return processed HTML as a single string; raises SGMLParseError if an allowed tag is still open"""
+        if self._stack:
+          raise sgmllib.SGMLParseError, "Error, tag <%s> sin cerrar" % self._stack.pop(-1)
+        return "".join(self.text)
+
+    self._text = self._text.replace("javascript","javascropt") # FIXME: is this needed?
+    html_parser = PreParseHTML()
+    try:
+      html_parser.feed(self._text)
+      self._preparsed = html_parser.get_text()
+    except sgmllib.SGMLParseError, e:
+      return e
+
+  def _parse(self):
+    """Translate tokens and tags into HTML fragments appended to self._parsed.
+    Should be called after _reset, _preparse and _tokenize. Returns 0 on success, else an error message."""
+    stack = []
+    self._parsed.append(self._tokens.pop(0))
+    while self._tags:
+      tag = self._tags.pop(0)
+      tag = tag.split("=", 1)
+      if tag[0].startswith("/") and tag[0][1:] in allowed_tags and stack:
+        if tag[0][1:] == stack[-1]:
+          self._parsed.append(self._close_tag_to_html(stack.pop(-1)))
+        elif not allow_errors:
+          return "Error, tratando de cerrar %s%s%s no abierto antes" % (self._open, tag[0], self._close)
+        else: # allow_errors is on, a hack the original author strongly advises against:
+              # the mismatched closing tag is emitted verbatim
+          self._parsed.append("%s%s%s" % (self._open, tag[0], self._close))
+        if self._tokens and self._tag_closes(tag): self._parsed.append(self._tokens.pop(0))
+
+      elif tag[0] in allowed_tags:
+        tag.append(None)
+        stack.append(tag[0])
+        if self._tag_needs_args(tag[0]) and tag[1] is None:
+          tag[1] = self._tokens[0] if self._tokens else "" # No text left (e.g. "[img]a[img]"): treat as a missing argument
+          if not allow_errors and not tag[1]:
+            return "Error, no se ha pasado el argumento a %s%s%s" % (self._open, tag[0], self._close)
+        self._parsed.append(self._tag_to_html(tag))
+        if not self._tag_closes(tag) and self._tokens:
+          self._tokens.pop(0)
+        if self._tokens: self._parsed.append(self._tokens.pop(0))
+
+      else: # Unknown tag: pass it through unchanged, "=argument" included
+        self._parsed.append("%s%s%s" % (self._open, "=".join(tag), self._close))
+        if self._tokens: self._parsed.append(self._tokens.pop(0))
+
+    if stack and allow_errors:
+      while stack: # Best effort; no guarantee about what comes out here
+        self._parsed.append(self._close_tag_to_html(stack.pop(-1)))
+    elif stack:
+      return "Error, falta por cerrar el tag %s%s%s" % (self._open, stack[-1], self._close)
+    return 0
+
+  def parse(self):
+    """Run the whole pipeline; return (0, html), or (error, None) on failure or empty input."""
+    self._reset()
+    e = self._preparse()
+    if not e and self._preparsed:
+      self._tokenize()
+      e =  self._parse()
+      if not e:
+        return e, "".join(self._parsed)
+    return e or "Error, HTML no valido", None
+
+
+import unittest
+class TestBBCode(unittest.TestCase):  # unit tests driving BBCodeParser.feed()/parse()
+  correct_bbcode = (  # (BBCode input, expected HTML) pairs
+      ('[b]hola[/b]', '<strong>hola</strong>'),
+      ('antes [b]hola[/b]', 'antes <strong>hola</strong>'),
+      ('[b]hola[/b] despues', '<strong>hola</strong> despues'),
+      ('antes [b]hola[/b] despues', 'antes <strong>hola</strong> despues'),
+
+      ('[url]la direccion[/url]', '<a href="la direccion">la direccion</a>'),
+      ('antes [url]la direccion[/url]', 'antes <a href="la direccion">la direccion</a>'),
+      ('[url]la direccion[/url] despues', '<a href="la direccion">la direccion</a> despues'),
+      ('antes [url]la direccion[/url] despues', 'antes <a href="la direccion">la direccion</a> despues'),
+
+      ('[url="la direccion"]el texto[/url]', '<a href="la direccion">el texto</a>'),
+      ('antes [url="la direccion"]el texto[/url]', 'antes <a href="la direccion">el texto</a>'),
+      ('[url="la direccion"]el texto[/url] despues', '<a href="la direccion">el texto</a> despues'),
+      ('antes [url="la direccion"]el texto[/url] despues', 'antes <a href="la direccion">el texto</a> despues'),
+
+      ('[url=la direccion]el texto[/url]', '<a href="la direccion">el texto</a>'),
+      ('antes [url=la direccion]el texto[/url]', 'antes <a href="la direccion">el texto</a>'),
+      ('[url=la direccion]el texto[/url] despues', '<a href="la direccion">el texto</a> despues'),
+      ('antes [url=la direccion]el texto[/url] despues', 'antes <a href="la direccion">el texto</a> despues'),
+
+      ('[quote=paco]Hola[/quote]', '<blockquote><h4>paco escribi&oacute;:</h4>Hola</blockquote>'),
+      ('antes [quote=paco]Hola[/quote]', 'antes <blockquote><h4>paco escribi&oacute;:</h4>Hola</blockquote>'),
+      ('[quote=paco]Hola[/quote] despues', '<blockquote><h4>paco escribi&oacute;:</h4>Hola</blockquote> despues'),
+      ('antes [quote=paco]Hola[/quote] despues', 'antes <blockquote><h4>paco escribi&oacute;:</h4>Hola</blockquote> despues'),
+
+      ('[quote]Hola[/quote]', '<blockquote>Hola</blockquote>'),
+      ('antes [quote]Hola[/quote]', 'antes <blockquote>Hola</blockquote>'),
+      ('[quote]Hola[/quote] despues', '<blockquote>Hola</blockquote> despues'),
+      ('antes [quote]Hola[/quote] despues', 'antes <blockquote>Hola</blockquote> despues'),
+  
+      ('[u]hola[/u]', '<span style="text-decoration:underline">hola</span>'),
+      
+      ('[size=20px]hola[/size]', '<span style="font-size:20px">hola</span>'),
+
+      ('[n]hola[/n]', '[n]hola[/n]'),
+
+      ("[img]laimagen1[/img]", '<img src="laimagen1" />'),
+      ("antes [img]laimagen2[/img]", 'antes <img src="laimagen2" />'),
+      ("[img]laimagen3[/img] despues", '<img src="laimagen3" /> despues'),
+      ("antes [img]laimagen4[/img] despues", 'antes <img src="laimagen4" /> despues')
+  )
+
+  correct_html = (  # (HTML input, expected output) pairs
+      ('&amp;', '&amp;amp;'),
+      ('<b>caca</b>', '<b>caca</b>'),
+      ('b>', 'b&gt;'),
+      ('<script>', '&lt;script&gt;'),
+      ('<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla</a>',
+       '<a href="http://blablabla.com/index.php?caca=1&amp;mierda=2">bla bla</a>'),
+      ('<a href="http://blablabla.com/index.php?caca=1&mierda=2" title="caca">bla bla </a>',
+       '<a href="http://blablabla.com/index.php?caca=1&amp;mierda=2" title="caca">bla bla </a>'),
+      ('<img src="http://blablabla.com/index.php?caca=1&mierda=2" />',
+       '<img src="http://blablabla.com/index.php?caca=1&amp;mierda=2" />'),
+      ('<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca">',
+       '<img src="http://blablabla.com/index.php?caca=1&amp;mierda=2" alt="caca" />'),
+      ('<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca"></img>',
+       '<img src="http://blablabla.com/index.php?caca=1&amp;mierda=2" alt="caca" />')
+  )
+
+  incorrect_html = (  # HTML inputs for which parse() must return an error
+      '<b>',
+      '<b',
+      '</b>',
+      '<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla',
+      '<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla </as>',
+      '<a href="http://blablabla.com/index.php?caca=1&mierda=2" taitle="caca">bla bla </a>',
+      '<a href="http://blablabla.com/index.php?caca=1&mierda=2" title="caca" de la vaca>bla bla <a>',
+      '<img sorcerer="http://blablabla.com/index.php?caca=1&mierda=2" />',
+      '<img src="http://blablabla.com/index.php?caca=1&mierda=2" altibajo="caca">',
+  )
+
+  incorrect_bbcode = (  # BBCode inputs for which parse() must return an error while allow_errors is False
+      '[i]bla bla bla [b]hola[/n][/i]',
+      '[b]Ay, se me ha olvidado cerrar',
+      '[url][/url]',
+      '[b][/url]hola[/url]',
+      '[b]caca[url]hola[/b][/url]',
+      """Un texto [b]mas[/b] largo, con [b]negritas y [n]cursivas[/i]
+       [/b] e incluso [url="laurl"]enlaces [/b]con negritas[/b] dentro[/url]""")
+
+  xss = (  # XSS probe strings; testIncorrectXSS only prints what parse() returns for them
+      """'';!--"<XSS>=&{()}""",
+      """<?pi ?>""",
+      """<?php ?>""",
+      """';alert(String.fromCharCode(88,83,83))//\';alert(String.fromCharCode(88,83,83))//";alert(String.fromCharCode(88,83,83))//\";alert(String.fromCharCode(88,83,83))//--></SCRIPT>">'><SCRIPT>alert(String.fromCharCode(88,83,83))</SCRIPT>""",
+      "<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
+      """<IMG SRC="javascript:alert('XSS');">""",
+      """<IMG SRC=javascript:alert('XSS')>""",
+      """<IMG SRC=JaVaScRiPt:alert('XSS')>""",
+      """<IMG SRC=javascript:alert(&quot;XSS&quot;)>""",
+      """<IMG SRC=`javascript:alert("RSnake says, 'XSS'")`>""",
+      '<IMG """><SCRIPT>alert("XSS")</SCRIPT>">',
+)
+
+
+  def testCorrectBBCode(self):  # each sample, then all samples concatenated, must translate exactly
+    parser = BBCodeParser()
+    bbcode_list = []
+    html_list = []
+    global allow_errors
+    allow_errors = False
+    print
+    for bbcode, html in self.correct_bbcode:
+      parser.feed(bbcode)
+      bbcode_list.append(bbcode)
+      html_list.append(html)
+      retval, result = parser.parse()
+#      print "-->", bbcode
+#      print "<--", html
+#      print "<--", result
+#      print
+      assert not retval
+      self.assertEqual(html, result)
+      retval, result = parser.parse()  # second call on the same text must give the same result
+      assert not retval
+      self.assertEqual(html, result)
+    parser.feed("".join(bbcode_list))
+    retval, result = parser.parse()
+    assert not retval
+    self.assertEqual("".join(html_list), result)
+    retval, result = parser.parse()
+    assert not retval
+    self.assertEqual("".join(html_list), result)
+  
+  def testCorrectHTML(self):  # same scheme as testCorrectBBCode, using the correct_html pairs
+    parser = BBCodeParser()
+    html_list = []
+    parsed_html_list = []
+    global allow_errors
+    allow_errors = False
+    print
+    for html, parsed_html in self.correct_html:
+      parser.feed(html)
+      html_list.append(html)
+      parsed_html_list.append(parsed_html)
+      retval, result = parser.parse()
+#      print "-->", html
+#      print "<--", parsed_html
+#      print "<--", result
+#      print
+      assert not retval
+      self.assertEqual(parsed_html, result)
+      retval, result = parser.parse()  # second call on the same text must give the same result
+      assert not retval
+      self.assertEqual(parsed_html, result)
+    parser.feed("".join(html_list))
+    retval, result = parser.parse()
+    assert not retval
+    self.assertEqual("".join(parsed_html_list), result)
+    retval, result = parser.parse()
+    assert not retval
+    self.assertEqual("".join(parsed_html_list), result)
+
+  def testIncorrectBBCode(self):  # every incorrect_bbcode sample must yield a truthy error, twice in a row
+    parser = BBCodeParser()
+    global allow_errors
+    allow_errors = False
+    print
+    for bbcode in self.incorrect_bbcode:
+      parser.feed(bbcode)
+      retval, result = parser.parse()
+#      print "-->", bbcode
+#      print "<--", retval, result
+      assert retval
+      retval, result = parser.parse()
+      assert retval
+
+  def testIncorrectHTML(self):  # every incorrect_html sample must yield a truthy error, twice in a row
+    parser = BBCodeParser()
+    global allow_errors
+    allow_errors = False
+    print
+    for html in self.incorrect_html:
+      parser.feed(html)
+      retval, result = parser.parse()
+#      print "-->", html
+#      print "<--", retval, result
+      assert retval
+      retval, result = parser.parse()
+      assert retval
+
+  def testIncorrectXSS(self):  # prints the outcome for each xss sample; makes no assertion
+    parser = BBCodeParser()
+    global allow_errors
+    allow_errors = False
+    print
+    for xss in self.xss:
+      parser.feed(xss)
+      retval, result = parser.parse()
+      print "-->", xss
+      print "<--", retval, result
+#      assert retval  # NOTE(review): disabled, so nothing is checked for these inputs - confirm the expected outcome and re-enable
+
+if __name__ == "__main__":  # run the unit tests when the module is executed as a script
+  unittest.main()