1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
from pyspider.libs.base_handler import * import re
class Handler(BaseHandler):
    """pyspider handler that scrapes question/answer pages from wenwen.qq.com.

    Search-result URLs are pre-built for a fixed list of keywords, each
    result page is scanned for question links, and every question page is
    reduced to a ``{"title": ..., "text": ...}`` dict.
    """

    crawl_config = {
    }

    def __init__(self):
        """Pre-compute the search-result URLs used to seed the crawl.

        Sets ``self.url_list`` to one URL per (keyword, page) pair, with
        page numbers 0..99 for every keyword.
        """
        # NOTE: '空调' appears twice, so its URLs are generated twice; the
        # list is kept as-is so that ``url_list`` is unchanged.
        key_words = ['空调', '家居装修', '家居', '家电', '电饭煲', '微波炉',
                     '烤箱', '洗衣机', '空调', '冰箱', '电视']
        self.url_list = [
            'http://wenwen.qq.com/s/?w=' + key_word + '&pg=' + str(page) + '&ch=sp.pt'
            for key_word in key_words
            for page in range(100)
        ]

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every pre-built search URL, handled by ``index_page``."""
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
        for url in self.url_list:
            # A fresh headers dict per task, exactly as the original did.
            self.crawl(url,
                       headers={'User-Agent': user_agent},
                       callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every question link found on a search-result page.

        When the page is flagged as '略懂社' content and the link carries a
        question id (``/z/q<digits>``), the dedicated question URL is fetched
        with ``fetch_type='js'`` and parsed by ``detail_page``. Otherwise the
        link itself is fetched and parsed by ``detail_page2``.
        """
        # NOTE(review): this inspects every ``a > span`` on the page rather
        # than the current result, so all links of a page take the same
        # branch -- confirm whether a per-result check was intended.
        is_special_page = response.doc('a > span').text() == u'略懂社'

        for each in response.doc('#result_list > div > h3 > a').items():
            href = each.attr.href
            if not href:
                # Anchor without a target: nothing to crawl.
                continue

            # Extract the id with a capture group instead of slicing the
            # repr of a list, which dropped the first digit whenever the
            # repr had no ``u`` prefix (e.g. on Python 3).
            match = re.search(r'/z/q(\d+)', href) if is_special_page else None
            if match:
                url = ('http://wenwen.qq.com/qunapp/world/question?qid='
                       + match.group(1) + '&viewsrc=1&ch=fromsearch.pc')
                self.crawl(url, callback=self.detail_page, fetch_type='js')
            else:
                # Also the fallback when no question id can be found, so the
                # result is still crawled instead of requesting an empty qid.
                self.crawl(href, callback=self.detail_page2)

    @config(priority=2)
    def detail_page(self, response):
        """Return title and answer text of a page queued via the qid URL."""
        return {
            "title": response.doc('.yy-mt1 > pre').text(),
            "text": response.doc('.answer_text').text(),
        }

    @config(priority=2)
    def detail_page2(self, response):
        """Return title and answer text of a directly linked question page."""
        return {
            "title": response.doc('.question-tit > h3').text(),
            "text": response.doc('.answer-con').text(),
        }

    # Disabled pagination callback, kept for reference (the original author
    # noted it is not used):
    #
    # def list_page(self, response):
    #     for each in response.doc('#wgt-list > dl > dt > a').items():
    #         self.crawl(each.attr.href, callback=self.detail_page)
    #     # Pagination: put the "next page" selector here.
    #     for each in response.doc('#page-main > div > div > div > div.list-inner > div.widget-pager.clearfix.mb-20 > div > a.pager-next').items():
    #         self.crawl(each.attr.href, callback=self.list_page)
|