[cpif] r239 - trunk/frontend-web
svn at argo.es
svn at argo.es
Wed Jul 4 00:09:21 CEST 2007
Author: alvaro
Date: Wed Jul 4 00:09:20 2007
New Revision: 239
Log:
Parser de tags. Los parsers han cambiado la API. Pueden recibir un
parámetro en el momento de la instanciación, que les indica el contexto
en el que se encuentran. Es útil para definir restricciones de anidamiento
de tags. Asimismo, los tokens que devuelven llevan este parámetro:
(Debe ser parseado, texto, contexto).
Added:
trunk/frontend-web/parser_urls.py
- copied, changed from r235, /trunk/frontend-web/parser_eol.py
Modified:
trunk/frontend-web/parser_bbcode.py
trunk/frontend-web/parser_entities.py
trunk/frontend-web/parser_eol.py
trunk/frontend-web/parser_html.py
trunk/frontend-web/parsers.py
Modified: trunk/frontend-web/parser_bbcode.py
==============================================================================
--- trunk/frontend-web/parser_bbcode.py (original)
+++ trunk/frontend-web/parser_bbcode.py Wed Jul 4 00:09:20 2007
@@ -3,30 +3,19 @@
# Allow BBCode syntax errors?
-allow_errors = True
allow_errors = False
-allowed_html = {
- "a": ["href", "title"],
- "p": [],
- "b": [],
- "s": [],
- "i": [],
- "em": [],
- "strong": [],
- "img": ["src", "alt", "title"]
-}
-
allowed_tags = {
- "b": ('<strong>', '</strong>', False),
- "u": ('<span style="text-decoration:underline">', '</span>', False),
- "s": ('<strike>', '</strike>', False),
- "size": ('<span style="font-size:%(arg)s">', '</span>', True),
- "i": ('<em>', '</em>', False),
- "url": ('<a href="%(arg)s">', '</a>', True),
- "quote*": ('<blockquote>', '</blockquote>', False),
- "quote": ('<blockquote><h4>%(arg)s escribió:</h4>', '</blockquote>', True),
- "img": ('<img src="%(arg)s" />', None, True)
+# "bbcode": (start tag, end tag (if needed), True if allows arguments, context)
+ "b": ('<strong>', '</strong>', False, 'strong'),
+ "u": ('<span style="text-decoration:underline">', '</span>', False, 'span'),
+ "s": ('<strike>', '</strike>', False, 'strike'),
+ "size": ('<span style="font-size:%(arg)s">', '</span>', True, 'span'),
+ "i": ('<em>', '</em>', False, 'em'),
+ "url": ('<a href="%(arg)s">', '</a>', True, 'a'),
+ "quote*": ('<blockquote>', '</blockquote>', False, 'blockquote'),
+ "quote": ('<blockquote><h4>%(arg)s escribió:</h4>', '</blockquote>', True, 'blockquote'),
+ "img": ('<img src="%(arg)s" />', None, True, None)
}
def escape(text):
@@ -54,7 +43,8 @@
_regexp_start = "\%s.*?\%s" % (_open, _close)
_regexp_end = "\%s(.*?)\%s" % (_open, _close)
- def __init__(self):
+ def __init__(self, context = None):
+ self._context = context
self._status = ""
self._text = ""
self._tokens = []
@@ -116,53 +106,64 @@
def _parse(self):
"""Do the parsing.
Should be called after _reset, and _tokenize."""
+ def get_context():
+ return allowed_tags.get(stack[-1], None)[3] if stack else None
+ def check_context(tag):
+ if allowed_tags[tag][3] == self._context and self._context:
+ return False
+ else:
+ return True
stack = []
- self._parsed.append((True, self._tokens.pop(0)))
+ self._parsed.append((True, self._tokens.pop(0), None))
while self._tags:
tag = self._tags.pop(0)
tag = tag.split("=", 1)
tag[0] = tag[0].strip()
if tag[0].startswith("/") and tag[0][1:] in allowed_tags.keys() and stack:
if tag[0][1:] == stack[-1]:
- self._parsed.append((False, self._close_tag_to_html(stack.pop(-1))))
+ self._parsed.append((False, self._close_tag_to_html(stack[-1]), get_context()))
+ stack.pop(-1)
elif not allow_errors:
return "Error, tratando de cerrar %s%s%s no abierto antes" % (self._open, tag[0], self._close)
else: # Si alguien activa allow_errors mereceria la mas lenta y dolorosa muerte
- # Esto es una chapuza para usuarios estupidos y chapuceros
- self._parsed.append((False, "%s%s%s" % (self._open, tag[0], self._close)))
+ # Esto es una chapuza para usuarios estupidos y chapuceros (y ni se si funciona)
+ self._parsed.append((False, "%s%s%s" % (self._open, tag[0], self._close), tag[0]))
if self._tokens and self._tag_closes(tag):
- self._parsed.append((True, self._tokens.pop(0)))
+ self._parsed.append((True, self._tokens.pop(0), get_context()))
elif tag[0] in allowed_tags.keys():
+ if not check_context(tag[0]):
+ return "Error, anidado no valido"
tag.append(None)
stack.append(tag[0])
if self._tag_needs_args(tag[0]) and tag[1] == None:
tag[1] = self._tokens[0]
if not allow_errors and not tag[1]:
return "Error, no se ha pasado el argumento a %s%s%s" % (self._open, tag[0], self._close)
- self._parsed.append((False, self._tag_to_html(tag)))
+ self._parsed.append((False, self._tag_to_html(tag), get_context()))
if not self._tag_closes(tag) and self._tokens:
self._tokens.pop(0)
if self._tokens:
- self._parsed.append((True, self._tokens.pop(0)))
+ self._parsed.append((True, self._tokens.pop(0), get_context()))
else: # Si no conozco el tag se lo paso tal cual
- self._parsed.append((True, "%s%s%s" % (self._open, tag[0], self._close)))
+ self._parsed.append((True, "%s%s%s" % (self._open, tag[0], self._close), get_context()))
if self._tokens:
- self._parsed.append((True, self._tokens.pop(0)))
+ self._parsed.append((True, self._tokens.pop(0), get_context()))
if stack and allow_errors:
while stack: # No me responsabilizo de lo que salga aqui...
- self._parsed.append((False, self._close_tag_to_html(stack.pop(-1))))
+ self._parsed.append((False, self._close_tag_to_html(stack[-1]), close_tag))
+ stack.pop(-1)
elif stack:
return "Error, falta cerrar el tag %s%s%s" % (self._open, stack[-1], self._close)
return False
-def parse(text):
+def parse(text, context = None):
"""Parses the text into BBCode. It returns the exit code and a list of tuples
in the form [(Boolean,"text")]. The first element tells whether "text" was
untouched or analysed."""
- parser = BBCodeParser()
+ parser = BBCodeParser(context = context)
parser.feed(text)
return parser.get_tokens()
@@ -254,10 +255,10 @@
html_list.append(html)
retval, result = parse(bbcode)
assert not retval
- self.assertEqual(html, "".join(text for dummy, text in result))
+ self.assertEqual(html, "".join(text for dummy, text, dummy in result))
retval, result = parse("".join(bbcode_list))
assert not retval
- self.assertEqual("".join(html_list), "".join(text for dummy, text in result))
+ self.assertEqual("".join(html_list), "".join(text for dummy, text, dummy in result))
def testIncorrectBBCode(self):
"""Test if the translation fails using bad formed BBCode."""
Modified: trunk/frontend-web/parser_entities.py
==============================================================================
--- trunk/frontend-web/parser_entities.py (original)
+++ trunk/frontend-web/parser_entities.py Wed Jul 4 00:09:20 2007
@@ -1,9 +1,9 @@
# $Id$
-def parse(text):
+def parse(text, context = None):
import cgi
# Devolvemos "False" para que si ponemos algun otro parser detras (!no lo hagas!)
# dicho parser no haga nada. La idea es que al investigarlo nos encontremos
# con que lo hemos puesto donde no es...
- return False,[(False, cgi.escape(text))]
+ return False,[(False, cgi.escape(text), context)]
Modified: trunk/frontend-web/parser_eol.py
==============================================================================
--- trunk/frontend-web/parser_eol.py (original)
+++ trunk/frontend-web/parser_eol.py Wed Jul 4 00:09:20 2007
@@ -1,11 +1,11 @@
# $Id$
-def parse(text):
- eol=(False,"<br/>")
+def parse(text, context = None):
+ eol=(False,"<br/>", None)
token_list=[]
text=text.replace("\r","")
for fragment in text.split("\n") :
- token_list.extend([(True,fragment),eol])
+ token_list.extend([(True,fragment, context),eol])
token_list.pop() # Al final hay un "eol" de mas
return False,token_list
Modified: trunk/frontend-web/parser_html.py
==============================================================================
--- trunk/frontend-web/parser_html.py (original)
+++ trunk/frontend-web/parser_html.py Wed Jul 4 00:09:20 2007
@@ -31,8 +31,16 @@
from sgmllib import SGMLParser, SGMLParseError
class HTMLParser(SGMLParser):
- def __init__(self):
+ def __init__(self, context = None):
+ self._context = context
SGMLParser.__init__(self)
+
+ def get_context(self):
+ return self._stack[-1] if self._stack else None
+
+ def check_context(self,tag):
+ return True
+
def reset(self):
self.entity = False
self.tokens = []
@@ -49,19 +57,21 @@
raise SGMLParseError, "Error, argumentos para <img> no validos"
strattrs = "".join(attrs_list)
text = "<img%(strattrs)s />" % locals()
- self.tokens.append((False, text))
+ self.tokens.append((False, text, "img"))
def end_img(self):
pass
def do_br(self):
- self.tokens.append((False, "<br />"))
+ self.tokens.append((False, "<br />", None))
def end_br(self):
pass
def unknown_starttag(self, tag, attrs):
if tag in allowed_html.keys():
+ if tag == self._context or tag=="a" and self._stack and self._stack[-1] == "a":
+ raise SGMLParseError, "Error, tag <%s> no permitido dentro de <%s>" % (tag, self._stack[-1])
attrs_list = []
for a, v in attrs:
if a in allowed_html[tag] and v:
@@ -70,64 +80,68 @@
raise SGMLParseError, "Error, argumentos para <%s> no validos" % tag
strattrs = "".join(attrs_list)
text = "<%(tag)s%(strattrs)s>" % locals()
- self.tokens.append((False, text))
self._stack.append(tag)
+ self.tokens.append((False, text, self.get_context()))
else:
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
text = escape("<%(tag)s%(strattrs)s>" % locals())
- self.tokens.append((False, text)) # XXX
+ self.tokens.append((False, text, self.get_context())) # XXX
def unknown_endtag(self, tag):
if tag in allowed_html.keys():
- text = "</%(tag)s>" % locals()
- self.tokens.append((False, text))
if self._stack and tag == self._stack[-1]:
+ text = "</%(tag)s>" % locals()
+ self.tokens.append((False, text, self.get_context()))
self._stack.pop(-1)
else:
raise SGMLParseError, "Error, tag <%s> sin abrir" % tag
else:
text = escape("</%(tag)s>" % locals())
- self.tokens.append((False, text)) # XXX
+ self.tokens.append((False, text, self.get_context())) # XXX
def handle_charref(self, ref):
self.entity = True
- if not self.tokens: self.tokens.append((True, ""))
+ if not self.tokens:
+ self.tokens.append((True, "", self.get_context()))
text = "%s&#%s;" % (self.tokens[-1][1], ref)
old_status = self.tokens[-1][0]
- self.tokens[-1] = (old_status, text) # FIXME
+ self.tokens[-1] = (old_status, text, self.get_context()) # FIXME
def handle_entityref(self, ref): # FIXME
self.entity = True
- if not self.tokens: self.tokens.append((True, ""))
+ if not self.tokens:
+ self.tokens.append((True, "", self.get_context()))
text = "%s&%s" % (self.tokens[-1][1], ref)
import htmlentitydefs
if htmlentitydefs.entitydefs.has_key(ref):
text+=";"
old_status = self.tokens[-1][0]
- self.tokens[-1] = (old_status, text) # FIXME
+ self.tokens[-1] = (old_status, text, self.get_context()) # FIXME
def handle_data(self, text):
import xml.sax.saxutils
if self.entity:
self.entity = False
- if not self.tokens: self.tokens.append((True, ""))
+ if not self.tokens:
+ self.tokens.append((True, "", self.get_context()))
old_status = self.tokens[-1][0]
text = "%s%s" % (self.tokens[-1][1], text)
- self.tokens[-1] = (old_status, text)
+ self.tokens[-1] = (old_status, text, self.get_context())
elif text.startswith("&"):
self.entity = True
- if not self.tokens: self.tokens.append((True, ""))
+ if not self.tokens:
+ self.tokens.append((True, "", self.get_context()))
old_status = self.tokens[-1][0]
text = "%s%s" % (self.tokens[-1][1], text)
- self.tokens[-1] = (old_status, text)
+ self.tokens[-1] = (old_status, text, self.get_context())
else:
- self.tokens.append((True, text))
+ self.tokens.append((True, text, self.get_context()))
def get_text(self):
"""Return processed HTML as a single string"""
if self._stack:
raise SGMLParseError, "Error, tag <%s> sin cerrar" % self._stack.pop(-1)
- return "".join([text for dummy, text in self.tokens])
+ return "".join([text for dummy, text, dummy in self.tokens])
def get_tokens(self):
if self._stack:
@@ -135,11 +149,11 @@
return self.tokens
-def parse(text):
+def parse(text, context = None):
"""Parses the text into HTML. It returns the exit code and a list of tuples
in the form [(Boolean,"text")]. The first element tells whether "text" was
untouched or analysed."""
- parser = HTMLParser()
+ parser = HTMLParser(context = None)
parser.reset()
try:
parser.feed(text)
@@ -175,6 +189,7 @@
'<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla',
'<a href="http://blablabla.com/index.php?caca=1&mierda=2">bla bla </as>',
'<a href="http://blablabla.com/index.php?caca=1&mierda=2" taitle="caca">bla bla </a>',
+ '<a href="http://blablabla.com/index.php?caca=1&mierda=2" taitle="caca">bla bla </a>',
'<a href="http://blablabla.com/index.php?caca=1&mierda=2" title="caca" de la vaca>bla bla <a>',
'<img sorcerer="http://blablabla.com/index.php?caca=1&mierda=2" />',
'<img src="http://blablabla.com/index.php?caca=1&mierda=2" altibajo="caca">',
@@ -203,11 +218,11 @@
html_list.append(html)
parsed_html_list.append(parsed_html)
assert not retval
- self.assertEqual(parsed_html, "".join(text for dummy, text in result))
+ self.assertEqual(parsed_html, "".join(text for dummy, text, dummy in result))
retval, result = parse("".join(html_list))
assert not retval
- self.assertEqual("".join(parsed_html_list), "".join(text for dummy, text in result))
+ self.assertEqual("".join(parsed_html_list), "".join(text for dummy, text, dummy in result))
def testIncorrectHTML(self):
"""Checks if the conversion fails w/ bad formed HTML."""
Copied: trunk/frontend-web/parser_urls.py (from r235, /trunk/frontend-web/parser_eol.py)
==============================================================================
--- /trunk/frontend-web/parser_eol.py (original)
+++ trunk/frontend-web/parser_urls.py Wed Jul 4 00:09:20 2007
@@ -1,11 +1,78 @@
# $Id$
-def parse(text):
- eol=(False,"<br/>")
- token_list=[]
- text=text.replace("\r","")
- for fragment in text.split("\n") :
- token_list.extend([(True,fragment),eol])
- token_list.pop() # Al final hay un "eol" de mas
- return False,token_list
+allowed_urls = ['http://', 'ftp://']
+def url_o_matic(url):
+ return """<a href="%(url)s" title="%(url)s">%(url)s</a>""" % locals()
+
+def parse(text, context = None):
+ if context == "a":
+ return None, [(True,text,context)]
+ import re
+ # Everybody stand back!!!
+ regexp = re.compile("|".join(["(\s|\A)(%s(?:[a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}(?:/\S+)*/{0,1})(\s|\Z)" % i for i in allowed_urls]))
+ list = regexp.split(text)
+ tokens = []
+ for i in list:
+ if i and regexp.match(i):
+ tokens.append((False, url_o_matic(i), "a"))
+ elif i:
+ tokens.append((True, i, context))
+ return None, tokens
+
+import unittest
+class Test(unittest.TestCase):
+ urls = (
+ 'http://perseverantia.com',
+ 'http://perseverantia.com/',
+ 'http://perseverantia.com/caca/',
+ 'http://perseverantia.com/caca/de/la/vaca',
+ 'http://perseverantia.com/caca?mierda=grande',
+ 'http://perseverantia.com/caca?mierda=pa&ti&comerla',
+ 'http://www.perseverantia.com',
+ 'http://www.www.perseverantia.com',
+ 'http://buh.bih.bah.beh.perseverantia.com',
+ )
+ badurls = (
+ 'htp://perseverantia.com',
+ 'http:/perseverantia.com',
+ 'http//perseverantia.com',
+ 'http/perseverantia.com',
+ 'http://perseverantia.c/',
+ 'http://perseverantia.c/',
+ 'http://.com',
+ 'hattp://perseverantia.comememela',
+ )
+
+ def testConversion(self):
+ retval, tokens = parse("http://perseverantia.com")
+ if retval:
+ self.fail()
+ self.assertEqual(tokens[0][1], """<a href="http://perseverantia.com" title="http://perseverantia.com">http://perseverantia.com</a>""")
+
+ def testURLbad(self):
+ import re
+ regexp = re.compile("|".join(["(?:\s|\A)(%s(?:[a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}(?:/\S+)*)/{0,1}(?:\s|\Z)" % i for i in allowed_urls]))
+ for i in self.badurls:
+ self.failIf(regexp.findall(i))
+ import random
+ rnd = random.sample("""?=)(/&%$"!'0987654321+-.,..,;:_"**$caca """,10)
+ self.failIf(regexp.findall("%s%s%s" % (rnd, i, rnd)))
+ for i in self.urls:
+ import random
+ rnd = random.sample("""?=)(/&%$"!'0987654321+-.,..,;:_"**$caca """,10)
+ self.failIf(regexp.findall("%s%s%s" % (rnd, i, rnd)))
+
+ def testURLok(self):
+ import re
+ regexp = re.compile("|".join(["(\s|\A)(%s(?:[a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}(?:/\S+)*/{0,1})(\s|\Z)" % i for i in allowed_urls]))
+ for i in self.urls:
+ self.assert_(regexp.findall(i))
+ import random
+ rnd = random.sample("""?=)(/&%$"!'0987654321+-.,..,;:_"**$%caca """,10)
+ self.assert_(regexp.findall("%s %s %s" % (rnd, i, rnd)))
+
+
+if __name__ == "__main__":
+ import unittest
+ unittest.main()
Modified: trunk/frontend-web/parsers.py
==============================================================================
--- trunk/frontend-web/parsers.py (original)
+++ trunk/frontend-web/parsers.py Wed Jul 4 00:09:20 2007
@@ -2,6 +2,7 @@
import parser_bbcode
import parser_html
+import parser_urls
import parser_eol
import parser_entities
@@ -9,11 +10,12 @@
"""Converts the text into valid HTML."""
from globales import allow_bbcode, allow_html
- tokens = [(True, text)]
+ tokens = [(True, text, None)]
parsers = [
(allow_html, parser_html),
(allow_bbcode, parser_bbcode),
+ (True, parser_urls),
(True, parser_eol),
# Salvo que sepas muy bien que estas haciendo,
# este parser debe ser el ultimo de todos.
@@ -25,37 +27,52 @@
aux = []
for token in tokens:
token_list=[token]
- must_reparse,fragment=token
+ must_reparse, fragment, curr_context=token
if must_reparse :
- retval, token_list = parser.parse(fragment)
+ retval, token_list= parser.parse(fragment, curr_context)
if retval :
return retval, None
aux+=token_list
tokens = aux
- return False, "".join([text for dummy, text in tokens])
+ return False, "".join([text for dummy, text, dummy in tokens])
import unittest
class TestBBCodeHTML(unittest.TestCase):
"""Test Case for the BBCode and HTML parser together"""
- correct_bbcode = (
+ correct_code = (
('[url="http://example.org/uno&dos"]el texto[/url]', '<a href="http://example.org/uno&dos">el texto</a>'),
('[url="http://example.org/uno&dos"]el & texto[/url]', '<a href="http://example.org/uno&dos">el & texto</a>'),
('[url="http://example.org/uno&dos"]el & texto[/url]', '<a href="http://example.org/uno&dos">el &amp; texto</a>'),
('[url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://example.org/uno&dos">el "" texto</a>'),
-)
+ ('[url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://example.org/uno&dos">el "" texto</a>'),
+ ('http://caca.com [url="http://example.org/uno&dos"]el "" texto[/url]', '<a href="http://caca.com" title="http://caca.com">http://caca.com</a> <a href="http://example.org/uno&dos">el "" texto</a>'),
+ )
+ incorrect = (
+ '<a>paco<a>luis</a>manolo</a>ringesvinto',
+ '<a>paco[url]luis[/url]>manolo</a>ringesvinto',
+ '[url]paco<a>luis</a>manolo[/url]ringesvinto',
+ )
+
+ def testIncorrectCode(self):
+ """Test if the translation goes well."""
+ global allow_errors
+ allow_errors = False
+ for code in self.incorrect:
+ retval, caca = convert_to_html(code)
+ assert retval
- def testCorrectBBCode(self):
+ def testCorrectCode(self):
"""Test if the translation goes well."""
global allow_errors
allow_errors = False
- for bbcode, html in self.correct_bbcode:
- retval, result = convert_to_html(bbcode)
+ for code, html in self.correct_code:
+ retval, result = convert_to_html(code)
assert not retval
self.assertEqual(html, result)
if __name__ == "__main__":
- test1 = parser_html.TestHTML
- test2 = parser_bbcode.TestBBCode
+# test1 = parser_html.TestHTML
+# test2 = parser_bbcode.TestBBCode
import unittest
unittest.main()
More information about the cpif
mailing list