# -*- coding: UTF-8 -*-
# Re-crawl the sites that were not scraped successfully
import time
import sys

import MySQLdb

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict
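
# To expose this command, point the COMMANDS_MODULE setting at the package
# containing this file (for example COMMANDS_MODULE = 'literature.commands';
# the package name here is an assumption). The command name is taken from
# the module's file name, so saving this as crawl_fail.py makes it runnable
# as `scrapy crawl_fail`.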
class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Re-run the spiders that failed to crawl'
    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")
    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
    def run(self, args, opts):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='literature', port=3306)
        cur = conn.cursor()
        spider_loader = self.crawler_process.spider_loader
        # Redirect stdout inside run(); done at module level, the output
        # would end up in output.txt instead.
        sys.stdout = open('crawl_fail.txt', 'w')
        for spidername in args or spider_loader.list():
            print time.strftime("%Y-%m-%d %H:%M:%S") + ' spidername:' + spidername + ' start'
            try:
                # A spider succeeded if it wrote at least one row today;
                # use a parameterized query instead of string concatenation.
                has_exist = cur.execute(
                    "select name from books_sources where site = %s"
                    " and create_time > curdate() limit 1", (spidername,))
                if has_exist:
                    print "*********crawlsuccess spidername************" + spidername
                else:
                    print "*********crawlfail spidername************" + spidername
                    self.crawler_process.crawl(spidername, **opts.spargs)
            except MySQLdb.Error as e:
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])
        cur.close()
        conn.close()
        # All failed spiders are scheduled above; start the reactor once here,
        # since CrawlerProcess.start() blocks and cannot be restarted per iteration.
        self.crawler_process.start()
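
    # The query in run() assumes a MySQL table shaped roughly like the
    # sketch below (hypothetical; the real schema may hold more columns):
    #
    #   CREATE TABLE books_sources (
    #       name        VARCHAR(255),  -- scraped book/source name
    #       site        VARCHAR(64),   -- spider name
    #       create_time DATETIME       -- set when the row is written
    #   );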
    def open_spider(self, spider):
        print "open spider"
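
# Typical invocations (spider and argument names are illustrative):
#   scrapy crawl_fail                  # check every spider in the project
#   scrapy crawl_fail somespider       # check only the named spiders
#   scrapy crawl_fail -a key=value     # forward arguments to each re-crawl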