I've been trying to use html5lib with lxml on Python 2.7 in Google App Engine. But when I run the following code, it gives me an error saying "NameError: global name 'etree' is not defined". Is it not possible to use lxml.etree on Google App Engine, or am I missing something?
app.yaml
application: testsite
version: 1
runtime: python27
api_version: 1
threadsafe: false

# Route every URL to index.py.
handlers:
- url: /.*
  script: index.py

# Third-party libraries requested from the runtime.
libraries:
- name: lxml
  version: "2.3" # I thought this would allow me to use lxml.etree
index.py
from testhandler import TestHandler
# NOTE(review): `webapp` is not imported in this excerpt -- presumably the full
# index.py has `from google.appengine.ext import webapp`; confirm.
# Map the root URL to TestHandler, with debug output enabled.
application = webapp.WSGIApplication([('/', TestHandler)], debug=True)
testhandler.py
import urllib2
import html5lib
from html5lib import treebuilders
# Import an ElementTree implementation, preferring lxml and falling back to the
# other known locations; print which one was picked.
# NOTE(review): the handler below asks html5lib for treebuilder='lxml'; the
# error output in this post shows the cElementTree fallback being selected and
# the lxml treebuilder then failing, so the fallbacks apparently do not help
# that code path -- confirm.
try:
    from lxml import etree
    print("running with lxml.etree")
except ImportError:
    try:
        # Python 2.5
        import xml.etree.cElementTree as etree
        print("running with cElementTree on Python 2.5+")
    except ImportError:
        try:
            # Python 2.5
            import xml.etree.ElementTree as etree
            print("running with ElementTree on Python 2.5+")
        except ImportError:
            try:
                # normal cElementTree install
                import cElementTree as etree
                print("running with cElementTree")
            except ImportError:
                try:
                    # normal ElementTree install
                    import elementtree.ElementTree as etree
                    print("running with ElementTree")
                except ImportError:
                    print("Failed to import ElementTree from any known place")

from google.appengine.ext import webapp


class TestHandler(webapp.RequestHandler):
    """Request handler that counts the <a> elements on a fetched page."""

    def get(self):
        """Fetch http://www.yahoo.com/, parse it, and write the link count.

        The page is parsed by html5lib with the 'lxml' treebuilder, and the
        number of elements whose local name is 'a' is written to the response.
        """
        f = urllib2.urlopen("http://www.yahoo.com/").read()
        doc = html5lib.parse(f, treebuilder='lxml')
        # local-name() matches <a> regardless of the element's namespace.
        elems = doc.xpath("//*[local-name() = 'a']")
        self.response.out.write(len(elems))
error
running with cElementTree on Python 2.5+
Status: 500 Internal Server Error
Content-Type: text/html; charset=utf-8
Cache-Control: no-cache
Expires: Fri, 01 Jan 1990 00:00:00 GMT
Content-Length: 769
<pre>Traceback (most recent call last):
  File "/usr/local/bin/google_appengine/google/appengine/ext/webapp/_webapp25.py", line 701, in __call__
    handler.get(*groups)
  File "/home/test/testhandler.py", line 38, in get
    parser = html5lib.HTMLParser(tree= treebuilders.getTreeBuilder('lxml'))
  File "/home/test/html5lib/html5parser.py", line 68, in __init__
    self.tree = tree(namespaceHTMLElements)
  File "/home/test/html5lib/treebuilders/etree_lxml.py", line 176, in __init__
    builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
NameError: global name 'etree' is not defined
</pre>
ADD
Nah, I tried several ways to create a doc object, but no luck. In one attempt I tried the import `from lxml.html import document_fromstring`,
and that gives me this error:
Traceback (most recent call last):
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 4143, in _HandleRequest
    self._Dispatch(dispatcher, self.rfile, outfile, env_dict)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 4049, in _Dispatch
    base_env_dict=env_dict)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 616, in Dispatch
    base_env_dict=base_env_dict)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 3120, in Dispatch
    self._module_dict)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 3024, in ExecuteCGI
    reset_modules = exec_script(handler_path, cgi_path, hook)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2887, in ExecuteOrImportScript
    exec module_code in script_module.__dict__
  File "/home/yoo/eclipse_workspace/website_checker/src/index.py", line 5, in <module>
    from handlers.updatecheck import UpdateCheckHandler
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 1538, in Decorate
    return func(self, *args, **kwargs)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2503, in load_module
    return self.FindAndLoadModule(submodule, fullname, search_path)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 1538, in Decorate
    return func(self, *args, **kwargs)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2375, in FindAndLoadModule
    description)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 1538, in Decorate
    return func(self, *args, **kwargs)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2318, in LoadModuleRestricted
    description)
  File "/home/test/updatecheck.py", line 4, in <module>
    from lxml.html import document_fromstring
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 1538, in Decorate
    return func(self, *args, **kwargs)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2503, in load_module
    return self.FindAndLoadModule(submodule, fullname, search_path)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 1538, in Decorate
    return func(self, *args, **kwargs)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2375, in FindAndLoadModule
    description)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 1538, in Decorate
    return func(self, *args, **kwargs)
  File "/usr/local/bin/google_appengine/google/appengine/tools/dev_appserver.py", line 2318, in LoadModuleRestricted
    description)
  File "/usr/lib/python2.7/dist-packages/lxml/html/__init__.py", line 12, in <module>
    from lxml import etree
ImportError: cannot import name etree
According to the error, it seems App Engine doesn't allow me to load the etree module for some reason. I wanted to use XPath with lxml, but I can't spend much time figuring out what is going on here, and I don't have enough knowledge of Python either. So I will try to find a way with the 'simpletree' version.
# Fetch the page body as a string.
f = urllib2.urlopen("http://www.yahoo.com/").read()
# Build the parser with no tree argument, i.e. without requesting the lxml
# treebuilder (presumably html5lib's default 'simpletree' builder -- confirm).
p = html5lib.HTMLParser()
doc = p.parse(f)
# do something with doc.childNodes
# Here: write the number of top-level child nodes to the response.
self.response.out.write(len(doc.childNodes))
Not really a good way, but at least it worked when I tested it on live Google App Engine.