I am following this guide for scraping data from Instagram: http://www.spataru.at/scraping-instagram-scrapy/ but when I run the spider I get this error:
mona@pascal:~/computer_vision/instagram/instagram$ ls
instagram scrapy.cfg
mona@pascal:~/computer_vision/instagram/instagram$ scrapy crawl instagramspider
2017-03-01 15:30:10-0600 [scrapy] INFO: Scrapy 0.14.4 started (bot: instagram)
2017-03-01 15:30:10-0600 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, MemoryUsage, SpiderState
Traceback (most recent call last):
  File "/usr/bin/scrapy", line 4, in <module>
    execute()
  File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 132, in execute
    _run_print_help(parser, _run_command, cmd, args, opts)
  File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 97, in _run_print_help
    func(*a, **kw)
  File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 139, in _run_command
    cmd.run(args, opts)
  File "/usr/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 43, in run
    spider = self.crawler.spiders.create(spname, **opts.spargs)
  File "/usr/lib/python2.7/dist-packages/scrapy/command.py", line 34, in crawler
    self._crawler.configure()
  File "/usr/lib/python2.7/dist-packages/scrapy/crawler.py", line 36, in configure
    self.spiders = spman_cls.from_crawler(self)
  File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 37, in from_crawler
    return cls.from_settings(crawler.settings)
  File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 33, in from_settings
    return cls(settings.getlist('SPIDER_MODULES'))
  File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 23, in __init__
    for module in walk_modules(name):
  File "/usr/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 65, in walk_modules
    submod = __import__(fullpath, {}, {}, [''])
  File "/home/mona/computer_vision/instagram/instagram/instagram/spiders/spider.py", line 3, in <module>
    from scraper_user.items import UserItem
ImportError: No module named scraper_user.items
I followed the complete tutorial, so I am not sure what is missing or how to fix it.
Here's the folder structure:
mona@pascal:~/computer_vision/instagram/instagram$ tree .
.
├── instagram
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── spider.py
│       └── spider.pyc
└── scrapy.cfg

2 directories, 11 files
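Note that there is no scraper_user package anywhere in this tree; the only package scrapy startproject created for me is called instagram. My guess (and it is only a guess) is that the tutorial author's own project was named scraper_user, so the imports in spider.py would need to be renamed to match my project name, i.e.:

from instagram.items import UserItem
from instagram.items import PostItem

But I don't know whether that is the intended fix or whether the tutorial expects a separate scraper_user module that I failed to create.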
Here's the spider.py code from the tutorial:
import scrapy
import json
from scraper_user.items import UserItem
from scraper_user.items import PostItem


class InstagramSpider(scrapy.Spider):

    name = 'instagramspider'
    allowed_domains = ['instagram.com']
    start_urls = []

    def __init__(self):
        self.start_urls = ["https://www.instagram.com/_spataru/?__a=1"]

    def parse(self, response):
        #get the json file
        json_response = {}
        try:
            json_response = json.loads(response.body_as_unicode())
        except:
            self.logger.info('%s doesnt exist', response.url)
            pass
        if json_response["user"]["is_private"]:
            return;
        #check if the username even worked
        try:
            json_response = json_response["user"]

            item = UserItem()

            #get User Info
            item["username"] = json_response["username"]
            item["follows_count"] = json_response["follows"]["count"]
            item["followed_by_count"] = json_response["followed_by"]["count"]
            item["is_verified"] = json_response["is_verified"]
            item["biography"] = json_response.get("biography")
            item["external_link"] = json_response.get("external_url")
            item["full_name"] = json_response.get("full_name")
            item["posts_count"] = json_response.get("media").get("count")

            #iterate through each post
            item["posts"] = []

            json_response = json_response.get("media").get("nodes")
            if json_response:
                for post in json_response:
                    items_post = PostItem()
                    items_post["code"] = post["code"]
                    items_post["likes"] = post["likes"]["count"]
                    items_post["caption"] = post["caption"]
                    items_post["thumbnail"] = post["thumbnail_src"]
                    item["posts"].append(dict(items_post))

            return item
        except:
            self.logger.info("Error during parsing %s", response.url)
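If I'm reading the traceback correctly, Scrapy itself locates the spider module fine through SPIDER_MODULES (walk_modules gets as far as importing instagram/spiders/spider.py), and it's only the scraper_user import inside that file that fails. For reference, my settings.py is the one scrapy startproject generated; the lines that matter here should look like this (quoting from memory, so treat as approximate):

BOT_NAME = 'instagram'
SPIDER_MODULES = ['instagram.spiders']
NEWSPIDER_MODULE = 'instagram.spiders'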
Here's the items.py code:
import scrapy

class UserItem(scrapy.Item):
    username = scrapy.Field()
    follows_count = scrapy.Field()
    followed_by_count = scrapy.Field()
    is_verified = scrapy.Field()
    biography = scrapy.Field()
    external_link = scrapy.Field()
    full_name = scrapy.Field()
    posts_count = scrapy.Field()
    posts = scrapy.Field()


class PostItem(scrapy.Item):
    code = scrapy.Field()
    likes = scrapy.Field()
    thumbnail = scrapy.Field()
    caption = scrapy.Field()
    hashtags = scrapy.Field()
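As a sanity check, would something like this from the project root confirm the diagnosis? I'm assuming (untested) that the first import succeeds and the second raises the same ImportError as the crawl:

mona@pascal:~/computer_vision/instagram/instagram$ python
>>> from instagram.items import UserItem     # should find my items.py via the instagram package
>>> from scraper_user.items import UserItem  # expecting ImportError, no such package exists here

If renaming the imports is the right fix, that's fine, but I'd like to understand whether I missed a step in the tutorial that was supposed to create scraper_user.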