#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-19 14:47:28
# Project: HBGGZY_SBJ

import hashlib
import json

import pymongo
from bs4 import BeautifulSoup
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that stores hebpr.cn notices in MongoDB.

    Flow: ``on_start`` posts a full-text search request, ``index_page``
    schedules one detail request per returned record, and ``detail_page``
    writes the page into the ``Tender.HBGGZY`` collection unless a document
    with the same title hash is already there.
    """

    crawl_config = {
    }

    def __init__(self):
        # JSON request body for the search endpoint. Only the first result
        # page is requested (pn=0, rn=10).
        self.data = json.dumps({
            "token": "",
            "pn": 0,
            "rn": 10,
            "sdt": "",
            "edt": "",
            "wd": "",
            "inc_wd": "",
            "exc_wd": "",
            "fields": "title",
            "cnum": "001",
            # The value is itself a JSON-encoded string, not a nested object.
            "sort": '{"showdate":"0"}',
            "ssort": "title",
            "cl": 200,
            "terminal": "",
            "condition": [
                {
                    "fieldName": "categorynum",
                    "isLike": "true",
                    "likeType": 2,
                    "equal": "003005",
                },
                {
                    "fieldName": "infoc",
                    "isLike": "true",
                    "likeType": 2,
                    "equal": "1300",
                },
            ],
            "time": "null",
            "highlights": "title",
            "statistics": "null",
            "unionCondition": "null",
            "accuracy": "",
            "noParticiple": "0",
            "searchRange": "null",
            "isBusiness": 1,
        })
        # MongoClient() with no arguments uses the driver's default
        # host and port.
        self.Client = pymongo.MongoClient()
        self.db = self.Client["Tender"]
        self.tb = self.db['HBGGZY']

    @every(minutes=15)
    def on_start(self):
        """Submit the search request; its response goes to ``index_page``."""
        self.crawl(
            "http://www.hebpr.cn/inteligentsearch/rest/inteligentSearch/getFullTextDataNew",
            callback=self.index_page,
            data=self.data,
            age=60,
        )

    @config(age=20 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule a detail-page crawl for every record in the search result.

        The response body is parsed as JSON and ``result.records`` is
        iterated. Each record must provide ``linkurl``, ``title``,
        ``showdate`` and ``zhuanzai``; a missing key raises ``KeyError``.
        """
        records = json.loads(response.text)['result']['records']
        for item in records:
            self.crawl(
                'http://www.hebpr.cn' + item['linkurl'],
                callback=self.detail_page,
                save={
                    'title': item['title'],
                    'show_date': item['showdate'],
                    'province': '河北省',
                    'city': '省本级',
                    'county': item['zhuanzai'],
                },
            )

    @config(priority=2)
    def detail_page(self, response):
        """Store one notice page in MongoDB unless its title hash exists.

        The stored title and the dedup key both come from the page's
        ``<title>`` element; ``response.save['title']`` is not used.

        Raises:
            IndexError: if the page has no ``<div id="hideDeil">``.
        """
        # Extract and encode the title once; it feeds both the hash and the
        # stored document.
        # NOTE(review): if detail pages share a generic <title>, every page
        # after the first would be treated as a duplicate -- confirm that
        # hashing <title> rather than save['title'] is intended.
        title = response.doc('title').text().encode('utf8')
        sha1_title = hashlib.sha1(title).hexdigest()

        # Name the parser explicitly so the parse does not depend on which
        # parsers happen to be installed and bs4 does not warn. lxml is
        # assumed to be present because response.doc (PyQuery) is built on it.
        soup = BeautifulSoup(response.text, "lxml")

        data = {
            "url": response.url,
            "title": title,
            "content": str(soup.find_all("div", id="hideDeil")[0]),
            # Keep only the part before the first space of the listing's
            # showdate value.
            "show_date": response.save["show_date"].split(" ")[0],
            "province": response.save["province"],
            "city": response.save["city"],
            "county": response.save["county"],
            "sha1_title": sha1_title,
            "is_indb": "0",
            "province_id": "130000",
            "city_id": "0",
            "county_id": "0",
        }

        # find_one with an _id-only projection answers "does it exist?"
        # without the deprecated Cursor.count() and without transferring the
        # stored content. The check and the insert are two separate
        # operations, not an atomic upsert.
        if self.tb.find_one({"sha1_title": sha1_title}, {"_id": 1}) is not None:
            print("存在了")
        else:
            # NOTE(review): Collection.insert is deprecated in PyMongo 3 in
            # favour of insert_one; kept because the installed driver version
            # is not visible from this file.
            self.tb.insert(data)