博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
pyspider--post
阅读量:5329 次
发布时间:2019-06-14

本文共 2120 字,大约阅读时间需要 7 分钟。

#!/usr/bin/env python

# -*- encoding: utf-8 -*-
# Created on 2018-08-19 14:47:28
# Project: HBGGZY_SBJ

import json
import pymongo
import hashlib
from bs4 import BeautifulSoup
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
def __init__(self):
self.data = json.dumps({"token":"","pn":0,"rn":10,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"title","cnum":"001","sort":"{\"showdate\":\"0\"}","ssort":"title","cl":200,"terminal":"","condition":[{"fieldName":"categorynum","isLike":"true","likeType":2,"equal":"003005"},{"fieldName":"infoc","isLike":"true","likeType":2,"equal":"1300"}],"time":"null","highlights":"title","statistics":"null","unionCondition":"null","accuracy":"","noParticiple":"0","searchRange":"null","isBusiness":1})
self.Client = pymongo.MongoClient()
self.db = self.Client["Tender"]
self.tb = self.db['HBGGZY']
crawl_config = {
}

@every(minutes=15)

def on_start(self):
self.crawl("http://www.hebpr.cn/inteligentsearch/rest/inteligentSearch/getFullTextDataNew",callback=self.index_page,data=self.data,age=60)
@config(age=20 * 24 *60 * 60)
def index_page(self, response):
data = json.loads(response.text)['result']['records']
for item in data:
self.crawl('http://www.hebpr.cn'+item['linkurl'],callback=self.detail_page,save={'title':item['title'],'show_date':item['showdate'],'province':'河北省','city':'省本级','county':item['zhuanzai']})

@config(priority=2)

def detail_page(self, response):
sha1_scripy = hashlib.sha1(response.doc('title').text().encode('utf8'))
sha1_title = sha1_scripy.hexdigest()
data = {
"url": response.url,
"title": response.doc('title').text().encode('utf8'),
"content":str(BeautifulSoup(response.text).find_all("div",id="hideDeil")[0]),
"show_date":response.save["show_date"].split(" ")[0],
"province":response.save["province"],
"city":response.save["city"],
"county":response.save["county"],
"sha1_title":sha1_title,
"is_indb":"0",
"province_id":"130000",
"city_id":"0",
"county_id":"0",
}
MyQuery = self.tb.find({"sha1_title":sha1_title})
if MyQuery.count()>0:
print "存在了"
else:
self.tb.insert(data)

 

转载于:https://www.cnblogs.com/pxfb/p/9502323.html

你可能感兴趣的文章
【Crash Course Psychology】2. Research & Experimentation笔记
查看>>
关于 linux 的 limit 的设置
查看>>
MTK笔记
查看>>
ERROR: duplicate key value violates unique constraint "xxx"
查看>>
激活office 365 的启动文件
查看>>
无法根据中文查找
查看>>
[简讯]phpMyAdmin项目已迁移至GitHub
查看>>
转载 python多重继承C3算法
查看>>
【题解】 bzoj1597: [Usaco2008 Mar]土地购买 (动态规划+斜率优化)
查看>>
css文本溢出显示省略号
查看>>
git安装和简单配置
查看>>
fat32转ntfs ,Win7系统提示对于目标文件系统文件过大解决教程
查看>>
Awesome Adb——一份超全超详细的 ADB 用法大全
查看>>
shell cat 合并文件,合并数据库sql文件
查看>>
Android 将drawable下的图片转换成bitmap、Drawable
查看>>
介绍Win7 win8 上Java环境的配置
查看>>
Linux设置环境变量的方法
查看>>
构建自己的项目管理方案
查看>>
利用pca分析fmri的生理噪声
查看>>
div水平居中且垂直居中
查看>>