• 欢迎访问小杰博客网站
  • 欢迎访问小杰博客网站哦

scrapy批量执行爬虫代码

未分类 小杰 3年前 (2016-03-09) 224次浏览 已收录 0个评论

#-*- coding: UTF-8 -*-

# 一些没爬取成功的网站重新做一遍

import json
import sys
import time

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

from scrapy import log
from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerRunner
#from scrapy.core.exceptions import DropItem
from scrapy.exceptions import DropItem, UsageError
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.project import get_project_settings

# Redirect every print to a log file so crawl results can be inspected later.
# (Copy-paste had typographic quotes here, which is a SyntaxError.)
sys.stdout = open('crawl_fail.txt', 'w')

class Command(ScrapyCommand):
    requires_project = True
  
    def syntax(self):  
        return ‘[options]’  
  
    def short_desc(self):  
        return ‘Runs fail of the spiders’  

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option(“-a”, dest=”spargs”, action=”append”, default=[], metavar=”NAME=VALUE”,
                          help=”set spider argument (may be repeated)”)
        parser.add_option(“-o”, “–output”, metavar=”FILE”,
                          help=”dump scraped items into FILE (use – for stdout)”)
        parser.add_option(“-t”, “–output-format”, metavar=”FORMAT”,
                          help=”format to use for dumping items with -o”)

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError(“Invalid -a value, use -a NAME=VALUE”, print_help=False)

    def run(self, args, opts):
#         conn=MySQLdb.connect(host=’localhost’,user=’root’,passwd=”,db=’literature’,port=3306)
        conn=MySQLdb.connect(host=’localhost’,user=’root’,passwd=’123456′,db=’literature’,port=3306)
        cur=conn.cursor()
    #settings = get_project_settings()
    spider_loader = self.crawler_process.spider_loader
        #不放这里会打印到output.txt里面
        sys.stdout=open(‘crawl_fail.txt’,’w’)
    for spidername in args or spider_loader.list():
            print time.strftime(“%Y-%m-%d %H:%M:%S”)+’ spidername:’+spidername+’ start’
            try:
                has_exist = cur.execute(“select name from books_sources where site='”+spidername+”‘ and create_time > curdate() limit 1”)
                
                if has_exist:
                    print “*********crawlsuccess spidername************” + spidername
                else:
                    print “*********crawlfail spidername************” + spidername
#                     if(spidername in [‘myuyueread’]):
                    self.crawler_process.crawl(spidername, **opts.spargs)
                    self.crawler_process.start()
            
            except MySQLdb.Error,e:
                 print “Mysql Error %d: %s” % (e.args[0], e.args[1])
     
        cur.close()
        conn.close()    

    def open_spider(self, spider):
        print “open spider”


小杰博客 , 版权所有丨如未注明 , 均为原创丨本网站采用BY-NC-SA协议进行授权
转载请注明原文链接:scrapy批量执行爬虫代码
喜欢 (0)
发表我的评论
取消评论
表情 贴图 加粗 删除线 居中 斜体 签到

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址