[cpif] r208 - trunk/frontend-web
svn at argo.es
svn at argo.es
Sat Jun 30 22:30:04 CEST 2007
Author: alvaro
Date: Sat Jun 30 22:30:03 2007
New Revision: 208
Log:
Nuevo parser, que debería solucionar todos los problemas anteriores.
Modified:
trunk/frontend-web/parser_bbcode.py
trunk/frontend-web/parser_html.py
trunk/frontend-web/parsers.py
Modified: trunk/frontend-web/parser_bbcode.py
==============================================================================
--- trunk/frontend-web/parser_bbcode.py (original)
+++ trunk/frontend-web/parser_bbcode.py Sat Jun 30 22:30:03 2007
@@ -29,7 +29,21 @@
"img": ('<img src="%(arg)s" />', None, True)
}
-from parser_html import escape, unescape
+def escape(text):
+ """Escape the conflictive characters into its entities. Please note that
+he "&" character must be traslated into "&amp;" in the URLS, as said
+in http://www.htmlhelp.com/tools/validator/problems.html#amp"""
+
+ entities = {
+ '"': '"',
+ "'": "'"
+ }
+# Si hay que hacer otras sustituciones este codigo puede ser util
+# import re
+# pat = "(%s)" % "|".join( map(re.escape, entities.keys()) )
+# return re.sub( pat, lambda m:entities[m.group()], text )
+ import xml.sax.saxutils
+ return xml.sax.saxutils.escape(text, entities)
class BBCodeParser:
@@ -40,14 +54,15 @@
_regexp_start = "\%s.*?\%s" % (_open, _close)
_regexp_end = "\%s(.*?)\%s" % (_open, _close)
- def __init__(self, allow = True):
- self._allow = allow
+ def __init__(self):
+ self._status = ""
self._text = ""
self._tokens = []
self._tags = []
self._parsed = []
def _reset(self):
"""Resets the internal arrays."""
+ self._status = ""
self._tokens = []
self._tags = []
self._parsed = []
@@ -55,6 +70,15 @@
def feed(self, text):
"""Sets the text to parse"""
self._text = text
+ self._reset()
+ self._tokenize()
+ self._status = self._parse()
+
+ def get_text(self):
+ return self._status, "".join([text for text, dummy in self._parsed])
+
+ def get_tokens(self):
+ return self._status, self._parsed
def _tokenize(self):
"""Tokenize the string and the tags in two separates lists
@@ -72,7 +96,9 @@
aux = allowed_tags[tag[0]]
if not tag[1]:
aux = allowed_tags.get(tag[0] + "*", aux) # Permitimos tag* que no lleva argumentos y puede ser distinto
- tag[1] = unescape(tag[1].strip('"')) if tag[1] else ""
+ if tag[1]:
+ tag[1] = tag[1].strip('"')
+ tag[1] = tag[1].strip("'")
return aux[0] % {"arg": escape(tag[1]) if tag[1] else '' }
def _tag_closes(self, tag):
@@ -91,25 +117,21 @@
"""Do the parsing.
Should be called after _reset, and _tokenize."""
stack = []
- self._parsed.append(self._tokens.pop(0))
+ self._parsed.append((True, escape(self._tokens.pop(0))))
while self._tags:
tag = self._tags.pop(0)
- if not self._allow:
- self._parsed.append("%s%s%s"%(self._open, escape(tag), self._close))
- self._parsed.append(self._tokens.pop(0))
- continue
tag = tag.split("=", 1)
tag[0] = tag[0].strip()
if tag[0].startswith("/") and tag[0][1:] in allowed_tags.keys() and stack:
if tag[0][1:] == stack[-1]:
- self._parsed.append(self._close_tag_to_html(stack.pop(-1)))
+ self._parsed.append((False, self._close_tag_to_html(stack.pop(-1))))
elif not allow_errors:
return "Error, tratando de cerrar %s%s%s no abierto antes" % (self._open, tag[0], self._close)
else: # Si alguien activa allow_errors mereceria la mas lenta y dolorosa muerte
# Esto es una chapuza para usuarios estupidos y chapuceros
- self._parsed.append("%s%s%s" % (self._open, tag[0], self._close))
+ self._parsed.append((False, "%s%s%s" % (self._open, tag[0], self._close)))
if self._tokens and self._tag_closes(tag):
- self._parsed.append(self._tokens.pop(0))
+ self._parsed.append((True, escape(self._tokens.pop(0))))
elif tag[0] in allowed_tags.keys():
tag.append(None)
@@ -118,39 +140,29 @@
tag[1] = self._tokens[0]
if not allow_errors and not tag[1]:
return "Error, no se ha pasado el argumento a %s%s%s" % (self._open, tag[0], self._close)
- self._parsed.append(self._tag_to_html(tag))
+ self._parsed.append((False, self._tag_to_html(tag)))
if not self._tag_closes(tag) and self._tokens:
self._tokens.pop(0)
if self._tokens:
- self._parsed.append(self._tokens.pop(0))
+ self._parsed.append((True, escape(self._tokens.pop(0))))
else: # Si no conozco el tag se lo paso tal cual
- self._parsed.append("%s%s%s" % (self._open, tag[0], self._close))
+ self._parsed.append((True, "%s%s%s" % (self._open, tag[0], self._close)))
if self._tokens:
- self._parsed.append(self._tokens.pop(0))
+ self._parsed.append((True, escape(self._tokens.pop(0))))
if stack and allow_errors:
while stack: # No me responsabilizo de lo que salga aqui...
- self._parsed.append(self._close_tag_to_html(stack.pop(-1)))
+ self._parsed.append((False, self._close_tag_to_html(stack.pop(-1))))
elif stack:
return "Error, falta cerrar el tag %s%s%s" % (self._open, stack[-1], self._close)
return False
- def parse(self):
- """Parses the string"""
- self._reset()
- self._tokenize()
- e = self._parse()
- if not e:
- return e, "".join(self._parsed)
- return e or "Error, HTML no valido", None
-
-def parse(text, allow = True):
- """Parses the text w/ BBCode into HTML. The allow argument is optional and
-tells the parser if the BBCode should be translated or no."""
- parser = BBCodeParser(allow)
+def parse(text):
+ """Parses the text w/ BBCode into HTML."""
+ parser = BBCodeParser()
parser.feed(text)
- return parser.parse()
+ return parser.get_tokens()
import unittest
class TestBBCode(unittest.TestCase):
@@ -235,62 +247,23 @@
html_list = []
global allow_errors
allow_errors = False
-# print
for bbcode, html in self.correct_bbcode:
bbcode_list.append(bbcode)
html_list.append(html)
retval, result = parse(bbcode)
-# print "-->", bbcode
-# print "<--", html
-# print "<--", result
-# print
- assert not retval
- self.assertEqual(html, result)
- retval, result = parse(bbcode)
assert not retval
- self.assertEqual(html, result)
+ self.assertEqual(html, "".join(text for dummy, text in result))
retval, result = parse("".join(bbcode_list))
assert not retval
- self.assertEqual("".join(html_list), result)
+ self.assertEqual("".join(html_list), "".join(text for dummy, text in result))
- def test_NoBBCode(self):
- """Test if the translation goes well in the case that we do not allow BBCode."""
- list = []
- global allow_errors
- allow_errors = False
- import globales
- globales.allow_bbcode = False
- globales.allow_html = True
- for code, dummy in self.correct_bbcode:
- list.append(code)
- retval, result = parse(code, False)
- assert not retval
- self.assertEqual(escape(code), result)
- retval, result = parse("".join(list), False)
- assert not retval
- self.assertEqual(escape("".join(list)), result)
-
def testIncorrectBBCode(self):
"""Test if the translation fails using bad formed BBCode."""
global allow_errors
allow_errors = False
for bbcode in self.incorrect_bbcode:
retval, result = parse(bbcode)
-# print "-->", bbcode
-# print "<--", retval, result
assert retval
- retval, result = parse(bbcode)
- assert retval
-
- def testIncorrectXSS(self):
- """Test if the code is XSS safe. Uncomment the print statents"""
- global allow_errors
- allow_errors = False
- for xss in self.xss:
- retval, result = parse(xss)
-# print "-->", xss
-# print "<--", retval, result
-# assert retval
if __name__ == "__main__":
unittest.main()
Modified: trunk/frontend-web/parser_html.py
==============================================================================
--- trunk/frontend-web/parser_html.py (original)
+++ trunk/frontend-web/parser_html.py Sat Jun 30 22:30:03 2007
@@ -12,6 +12,9 @@
}
def escape(text):
+ """Escape the conflictive characters into its entities. Please note that
+he "&" character must be traslated into "&amp;" in the URLS, as said
+in http://www.htmlhelp.com/tools/validator/problems.html#amp"""
entities = {
# '<':'<',
# '>':'>',
@@ -26,32 +29,17 @@
import xml.sax.saxutils
return xml.sax.saxutils.escape(text, entities)
-def unescape(text):
- entities = {
-# '<':'<',
-# '>':'>',
- '"': '"',
- "'": "'"
-# "&": "&"
- }
- import xml.sax.saxutils
- return xml.sax.saxutils.unescape(text, entities)
-
from sgmllib import SGMLParser, SGMLParseError
class HTMLParser(SGMLParser):
- def __init__(self, allow=True):
- self.allow = allow
+ def __init__(self):
SGMLParser.__init__(self)
def reset(self):
- self.text = []
+ self.entity = False
+ self.tokens = []
self._stack = []
SGMLParser.reset(self)
def do_img(self, attrs):
- if not self.allow:
- strattrs = "".join([' %s="%s"' % (a, v) for a, v in attrs])
- self.text.append(escape("<img%(strattrs)s />" % locals()))
- return
if not attrs: raise SGMLParseError, "Error, argumentos para <img> no validos"
attrs_list = []
for a, v in attrs:
@@ -60,81 +48,109 @@
else:
raise SGMLParseError, "Error, argumentos para <img> no validos"
strattrs = "".join(attrs_list)
- self.text.append("<img%(strattrs)s />" % locals())
+ text = "<img%(strattrs)s />" % locals()
+ self.tokens.append((False, text))
def end_img(self):
pass
def do_br(self):
- if not self.allow:
- self.text.append(escape("<br />" % locals()))
- return
- self.text.append("<br />")
+ self.tokens.append((False, "<br />"))
def end_br(self):
pass
def unknown_starttag(self, tag, attrs):
- if tag in allowed_html.keys() and self.allow:
+ if tag in allowed_html.keys():
attrs_list = []
for a, v in attrs:
- if a in allowed_html[tag] and v and self.allow:
+ if a in allowed_html[tag] and v:
attrs_list.append(' %s="%s"' % (a, escape(v)))
else:
raise SGMLParseError, "Error, argumentos para <%s> no validos" % tag
strattrs = "".join(attrs_list)
- self.text.append("<%(tag)s%(strattrs)s>" % locals())
+ text = "<%(tag)s%(strattrs)s>" % locals()
+ self.tokens.append((False, text))
self._stack.append(tag)
else:
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
- self.text.append(escape("<%(tag)s%(strattrs)s>" % locals()))
+ text = escape("<%(tag)s%(strattrs)s>" % locals())
+ self.tokens.append((False, text)) # XXX
def unknown_endtag(self, tag):
- if tag in allowed_html.keys() and self.allow:
- self.text.append("</%(tag)s>" % locals())
+ if tag in allowed_html.keys():
+ text = "</%(tag)s>" % locals()
+ self.tokens.append((False, text))
if self._stack and tag == self._stack[-1]:
self._stack.pop(-1)
else:
raise SGMLParseError, "Error, tag <%s> sin abrir" % tag
else:
- self.text.append(escape("</%(tag)s>" % locals()))
+ text = escape("</%(tag)s>" % locals())
+ self.tokens.append((False, text)) # XXX
def handle_charref(self, ref):
- self.text.append("&#%(ref)s;" % locals())
-
- def handle_entityref(self, ref):
- self.text.append("&%(ref)s" % locals())
+ self.entity = True
+ if not self.tokens: self.tokens.append((True, ""))
+ text = "%s&#%s;" % (self.tokens[-1][1], ref)
+ old_status = self.tokens[-1][0]
+ self.tokens[-1] = (old_status, text) # FIXME
+
+ def handle_entityref(self, ref): # FIXME
+ self.entity = True
+ if not self.tokens: self.tokens.append((True, ""))
+ text = "%s&%s" % (self.tokens[-1][1], ref)
import htmlentitydefs
if htmlentitydefs.entitydefs.has_key(ref):
- self.text.append(";")
+ text+=";"
+ old_status = self.tokens[-1][0]
+ self.tokens[-1] = (old_status, text) # FIXME
def handle_data(self, text):
import xml.sax.saxutils
- self.text.append(xml.sax.saxutils.escape(text))
+ if self.entity:
+ self.entity = False
+ if not self.tokens: self.tokens.append((True, ""))
+ old_status = self.tokens[-1][0]
+ text = "%s%s" % (self.tokens[-1][1], text)
+ self.tokens[-1] = (old_status, text)
+ elif text.startswith("&"):
+ self.entity = True
+ if not self.tokens: self.tokens.append((True, ""))
+ old_status = self.tokens[-1][0]
+ text = "%s%s" % (self.tokens[-1][1], text)
+ self.tokens[-1] = (old_status, text)
+ else:
+ self.tokens.append((True, text))
def get_text(self):
"""Return processed HTML as a single string"""
if self._stack:
raise SGMLParseError, "Error, tag <%s> sin cerrar" % self._stack.pop(-1)
- return "".join(self.text)
+ return "".join([text for dummy, text in self.tokens])
-def parse(text, allow = True):
- parser = HTMLParser(allow)
+ def get_tokens(self):
+ if self._stack:
+ raise SGMLParseError, "Error, tag <%s> sin cerrar" % self._stack.pop(-1)
+ return self.tokens
+
+
+def parse(text):
+ parser = HTMLParser()
parser.reset()
try:
parser.feed(text)
- text = parser.get_text()
- if not text: raise SGMLParseError, "HTML no valido"
+ tokens = parser.get_tokens()
+ if not tokens: raise SGMLParseError, "HTML no valido"
except SGMLParseError, e:
return (e or "HTML no valido", None)
- return False, text
+ return False, tokens
import unittest
class TestHTML(unittest.TestCase):
correct_html = (
- ('&', '&amp;'),
('<b>caca</b>', '<b>caca</b>'),
- ('b>', 'b>'),
+ ('b>', 'b>'),
('<script>', '<script>'),
('<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla</a>',
'<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla</a>'),
@@ -145,21 +161,10 @@
('<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca">',
'<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />'),
('<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca"></img>',
- '<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />')
+ '<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />'),
+ ('&', '&'),
)
- correct_html2 = (
- '&',
- '<b>caca</b>',
- 'b>',
- '<script>',
- '<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla</a>',
- '<a href="http://blablabla.com/index.php?caca=1&mierda=2" title="caca">bla bla </a>',
- '<img src="http://blablabla.com/index.php?caca=1&mierda=2" />',
- '<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />',
- '<img src="http://blablabla.com/index.php?caca=1&mierda=2" alt="caca" />',
- )
-
incorrect_html = (
'<b>',
'<b',
@@ -186,19 +191,6 @@
'<IMG """><SCRIPT>alert("XSS")</SCRIPT>">',
)
- def test_NoBBHTML(self):
- """Checks if the HTML is escaped when it is not allowed."""
- html_list = []
- import globales
- for code in self.correct_html2:
- html_list.append(code)
- retval, result = parse(code, False)
- assert not retval
- self.assertEqual(escape(code), result)
- retval, result = parse("".join(html_list), False)
- assert not retval
- self.assertEqual(escape("".join(html_list)), result)
-
def testCorrectHTML(self):
"""Checks if the conversion goes well."""
html_list = []
@@ -208,13 +200,11 @@
html_list.append(html)
parsed_html_list.append(parsed_html)
assert not retval
- self.assertEqual(parsed_html, result)
- retval, result = parse(html)
- assert not retval
- self.assertEqual(parsed_html, result)
+ self.assertEqual(parsed_html, "".join(text for dummy, text in result))
+
retval, result = parse("".join(html_list))
assert not retval
- self.assertEqual("".join(parsed_html_list), result)
+ self.assertEqual("".join(parsed_html_list), "".join(text for dummy, text in result))
def testIncorrectHTML(self):
"""Checks if the conversion fails w/ bad formed HTML."""
@@ -229,7 +219,7 @@
for xss in self.xss:
retval, result = parse(xss)
# print "-->", xss
-# print "<--", retval, result
+# print "<--", retval, "".join(text for dummy, text in result) if result else ""
# assert retval
if __name__ == "__main__":
Modified: trunk/frontend-web/parsers.py
==============================================================================
--- trunk/frontend-web/parsers.py (original)
+++ trunk/frontend-web/parsers.py Sat Jun 30 22:30:03 2007
@@ -7,16 +7,34 @@
def convert_to_html(text):
"""Converts the text into valid HTML"""
from globales import allow_bbcode, allow_html
+
+ tokens = [(True, text)]
- retval, text = parser_html.parse(text, allow_html)
- if retval:
- return retval, None
+ if allow_html:
+ aux = []
+ for token in tokens:
+ if token[0]:
+ retval, token = parser_html.parse(token[1])
+ if retval:
+ return retval, None
+ aux += token
+ else:
+ aux.append(token)
+ tokens = aux[:]
- retval,text=parser_bbcode.parse(text, allow_bbcode)
- if retval :
- return retval,None
+ if allow_bbcode:
+ aux = []
+ for token in tokens:
+ if token[0]:
+ retval, token=parser_bbcode.parse(token[1])
+ if retval:
+ return retval, None
+ aux += token
+ else:
+ aux.append(token)
+ tokens = aux[:]
- return parser_eol.parse(text)
+ return parser_eol.parse("".join([text for dummy, text in tokens]))
import unittest
class TestBBCodeHTML(unittest.TestCase):
@@ -25,9 +43,8 @@
('[url="http://example.org/uno&dos"]el texto[/url]', '<a href="http://example.org/uno&dos">el texto</a>'),
('[url="http://example.org/uno&dos"]el & texto[/url]', '<a href="http://example.org/uno&dos">el & texto</a>'),
('[url="http://example.org/uno&dos"]el & texto[/url]', '<a href="http://example.org/uno&dos">el &amp; texto</a>'),
- ('[url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://example.org/uno&dos">el "" texto</a>'),
+ ('[url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://example.org/uno&dos">el "" texto</a>'),
)
-
def testCorrectBBCode(self):
"""Test if the translation goes well."""
More information about the cpif
mailing list