loading...
爬虫编写中常用的cv代码
Published in:2021-06-20 | category: 搬砖
Words: 2.8k | Reading time: 15min

CV 用的好,天天下班早(想得美)

时间格式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# NOTE(review): independent copy-paste snippets — each line stands alone, not one program.
# For forum/BBS pages where the publish time needs a fallback ("N days ago" style sites)
pubtime_string = response.xpath("//div[@class='time']/span/@title").get()
pubtime_string = pubtime_string if pubtime_string != None else response.xpath("string(//div[@class='time'])").get()
pubtime = re.findall("\d{4}年\d{2}月\d{2}日\s*\d{2}\:\d{2}",str(pubtime_string))[0]
pubtime = re.search("\d{4}-\d{2}-\d{2} \d{2}:\d{2}",str(pubtime)).group()
pubtime = datetime.datetime.strptime(pubtime, "%Y年%m月%d日 %H:%M").strftime("%Y-%m-%d %H:%M")

item['pubtime'] = pubtime

# Normalize other date formats into the required "%Y-%m-%d %H:%M" form
pubtime = datetime.datetime.strptime(pubtime, "%Y年%m月%d日 %H:%M").strftime("%Y-%m-%d %H:%M")
datetime.datetime.strptime(pubtime, "%Y-%m-%d %H:%M").strftime("%Y-%m-%d %H:%M")

# When only a date is given, pad it with the current wall-clock time
+ time.strftime(' %H:%M')

# Strip whitespace and newline characters from the scraped text
pubtime = response.xpath("//div[@class='noticepubtime bshare-custom']/text()").get().replace('\r','').replace('\n','').strip()

时间戳格式转化

1
2
# Convert an epoch timestamp (seconds) into "%Y-%m-%d %H:%M" local time
timeStamp = data['createTime']
item['pubtime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime(float(timeStamp)))

需要代理的爬虫制式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
 # 爬虫文件
def __init__(self, start_url=None):
    """Build the per-channel job list, optionally overridden by a '|'-separated start_url."""
    super(JuronggovSpider, self).__init__()
    # Fall back to the class-level start_urls when no override is supplied.
    sources = start_url.split('|') if start_url else self.start_urls
    for source in sources:
        # Only the first whitespace-separated token of each entry is the URL.
        self.post_params.append({"url": source.split()[0]})
    self.proxy_list = ["一个代理","两个代理"]

def start_requests(self):
    """Issue one POST request per configured channel, each tagged with a random proxy."""
    for channel in self.post_params:
        # NOTE(review): 'channel_code' and 'channel' are read here, but the visible
        # __init__ only stores {'url': ...} — confirm post_params is enriched elsewhere.
        post_body = "wlwzTypeId=%s" % channel['channel_code']
        chosen_proxy = random.choice(self.proxy_list) if self.proxy_list else None
        yield Request(url=channel['url'], method='post', body=post_body,
                      headers=self.headers,
                      meta={"channel": channel["channel"], "proxy": chosen_proxy},
                      callback=self.parse_link)

# 后续在meta中传递
self.proxy_list = self.configure.get_proxy_from_api("common", 100, 'zm')
proxy = random.choice(self.proxy_list) if self.proxy_list else None
,"proxy":proxy
,"proxy":response.meta['proxy']
1
2
3
4
5
6
7
8
9
# special_config
# Per-spider special configuration: serves a fixed, pre-fetched proxy pool
# (run the real API fetch once, then paste the resulting IPs here verbatim).
class SpecialConfig():
    # presumably a shared service handle from the project's base_config — TODO confirm
    interface = base_config.InterfaceServe()

    @staticmethod
    def get_proxy_from_api(href=None, tp=None, response=None):
        # Arguments are accepted only for interface compatibility; the proxy
        # list below is hard-coded and returned unconditionally.
        return ['http://121.207.92.119:50269', 'http://117.94.182.206:39792', 'http://183.154.51.253:40689']

# 运行一次,将代理ip复制到这边

商情type修正

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
  @staticmethod
def get_type(title):
if u"中标" in title or u"结果" in title or u"成交" in title or u"流标" in title or u"废标" in title or u"合同" in title:
flag = "RN"
elif u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title:
flag = "CN"
elif u"预审" in title:
flag = "PF"
else:
flag = "PN"
return flag

# 使用
item['type'] = self.get_type(item['title'],item['type'])
# 加强版
@staticmethod
def get_type(title,type):
if u"中标" in title or u"结果" in title or u"成交" in title or u"流标" in title or u"废标" in title or u"合同" in title:
flag = "RN"
elif u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title:
flag = "CN"
elif u"预审" in title:
flag = "PF"
elif u'招标' in title or u'采购' in title or u'谈判' in title or u'询价' in title or u'单一来源' in title\
or u'最高限价' in title or u'磋商' in title:
flag = "PN"
else:
flag = type
return flag

3天前 2天前 昨天 3小时前 49分钟前 2021年5月12日

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
@staticmethod
def format_time(time_string):
curr_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d %H:%M"), "%Y-%m-%d %H:%M")
pubtime = curr_time
print time_string
if u"3天前" in time_string:
delta = datetime.timedelta(days=3)
pubtime = curr_time - delta
elif u"2天前" in time_string:
delta = datetime.timedelta(days=2)
pubtime = curr_time - delta
elif u"昨天" in time_string:
delta = datetime.timedelta(days=1)
pubtime = curr_time - delta
elif u"年" in time_string:
pubtime = re.findall("\d{4}年\d{2}月\d{2}日",str(time_string))[0]
pubtime = datetime.datetime.strptime(pubtime, "%Y年%m月%d日").strftime("%Y-%m-%d") + time.strftime(' %H:%M')
return pubtime
else:
if u"分钟" in time_string:
minute = re.findall("\d*",time_string)[0]
delta = datetime.timedelta(minutes=int(minute))
pubtime = curr_time - delta
elif u"小时" in time_string:
minute = re.findall("\d*",time_string)[0]
delta = datetime.timedelta(hours=int(minute))
pubtime = curr_time - delta
return pubtime.strftime("%Y-%m-%d %H:%M")

从标题筛选出商情信息

1
2
3
4
if u"采购" in title or u"成交" in title or u"询价" in title or u"中标" in title or u"招标" in title \
or u"延期" in title or u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title \
or u"流标" in title or u"废标" in title or u"合同" in title or u"预审" in title \
or u"磋商" in title or u"单一来源" in title or u"最高限价" in title or u"中标候选人" in title:

需要带着Cookie访问的

1
2
3
4
5
6
7
8
9
10
11
12
def parse_link(self, response):
    """Parse a listing page: rebuild session cookies from the Set-Cookie
    headers, then follow each news link with those cookies attached.

    :param response: listing-page response whose headers carry Set-Cookie
    :yield: one GET Request per //li[@class='news-name']/a link
    """
    cookie = {}
    for raw in response.headers.getlist('Set-Cookie'):
        # Only the leading "name=value" pair of each Set-Cookie matters.
        # Split on the FIRST '=' only, so values that themselves contain
        # '=' (common in session tokens) are no longer truncated; the
        # original also wrapped the value in a one-element list for no effect.
        name, _, value = raw.split(';')[0].partition('=')
        cookie[name] = value
    # Site-specific marker cookie required by the detail pages.
    cookie['ItDoor'] = 'wdxl'
    for link in response.xpath("//li[@class='news-name']/a"):
        href = link.xpath("./@href").get()
        yield Request(url=urlparse.urljoin(response.url, href), method='get',
                      headers=self.headers, cookies=cookie,
                      callback=self.parse_item)

判断是否是商情信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
            if u"中标" in title or u"结果" in title or u"成交" in title or u"流标" in title or u"废标" in title or u"合同" in title or\
u"变更" in title or u"答疑" in title or u"澄清" in title or u"更正" in title or u"预审" in title or u"标" in title or\
u"招标" in title or u"采购" in title or u"谈判" in title or u"询价" in title or u"磋商" in title or u"单一来源" in title or\
u"最高限价" in title or u"资格" in title:




SQ_list=['中标','结果','成交','流标','废标','合同','变更','答疑','澄清','更正','预审','标','招标','采购','谈判','询价','磋商','单一来源','最高限价','资格']

@staticmethod
def is_need(title):
if u"采购" in str(title) or u"成交" in str(title) or u"询价" in str(title) or u"中标" in str(title) or u"招标" in str(title) \
or u"延期" in str(title) or u"变更" in str(title) or u"答疑" in str(title) or u"澄清" in str(title) or u"更正" in str(title) \
or u"流标" in str(title) or u"废标" in str(title) or u"合同" in str(title) or u"预审" in str(title) or u"标" in str(title)\
or u"磋商" in str(title) or u"单一来源" in str(title) or u"最高限价" in str(title) or u"中标候选人" in str(title) :
return True
else:
return False

拼接网页

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
 @staticmethod
def get_content(jsonbody):
content = u"""
<div>
<tr>
<td>来信人:</td>
<td>{}</td>
<td>来信日期:</td>
<td>{}</td>
</tr><br>
<tr>
<td>类型:</td>
<td>{}</td>
<td>编号:</td>
<td>{}</td>
</tr><br>
<tr>
<td>来信内容:</td>
<td>{}</td>
</tr><br>
<tr>
<td>办理进程:</td>
<td>{}</td>
</tr><br>
<tr>
<td>答复单位:</td>
<td>{}</td>
</tr><br>
<tr>
<td>答复日期:</td>
<td>{}</td>
</tr><br>
<tr>
<td>答复内容:</td>
<td>{}</td>
</tr><br>
</div>
""".format(jsonbody['fromName'],jsonbody['createTime'][:10],jsonbody['objectiveType'],
jsonbody['serialNumber'],jsonbody['content'],jsonbody['status'],
jsonbody['replyContents'][0]['allName'],
jsonbody['replyContents'][0]['replyTime'][:10],
jsonbody['replyContents'][0]['replyContent'],)
return content

翻页

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

last_date = data_list[-1]['publishtime'][:10]
# Date of the last entry on the current page
last_date = datetime.datetime.strptime(last_date, "%Y-%m-%d")
# Today's date (truncated to day resolution)
curr_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d"),"%Y-%m-%d")
# Day difference, used to decide whether another page must be fetched
day = (curr_time - last_date).days
print last_date
print curr_time
print day
if day < 1:
# Last entry is from today — request the next page of the listing
next_page = response.meta['page'] + 1
next_page_url = re.sub("\d+.shtml",str(next_page) + ".shtml",response.url)
yield Request(url=next_page_url, method='get', headers=self.headers, dont_filter=True,
meta={"channel": response.meta["channel"], "start_hash": response.meta['start_hash'],
"page": next_page},
callback=self.parse_link)
# Page-turn progress hint (left disabled)
# print "翻页,打开第%s页" % str(next_page)

时间戳转换格式

  • 时间戳转标准

    1
    2
    3
    4
    5
    6
    def time_format(timestamp):
    pubtimeArray = datetime.datetime.fromtimestamp(float(int(timestamp) / 1000))
    pubtime = pubtimeArray.strftime("%Y-%m-%d %H:%M")
    return pubtime
    # 如果是精确到毫秒的,pubtime要除1000
    datetime.datetime.fromtimestamp(float(int(pubtime))).strftime("%Y-%m-%d %H:%M")

header 格式化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/python
# -*- coding: utf-8 -*-
import hashlib
import json
aaaaa ="""
Host: szdmobile.suzhou.gov.cn
X-API-TIMESTAMP: 1626249118516wjDZfe
User-Agent: su zhou dao/1.6.0 (iPhone; iOS 14.6; Scale/2.00)
cityName:
deviceName: six
deviceCode: iPhone12,8
X-API-SIGNATURE: OTQzY2E3ZGVkMDBhN2ViOTQ3YjkwMWI0NDZmYzQ1MTVmMzdmNGY2OQ==
appVersion: 1.6.0
latitude: 31.305387
accessToken: 65705a0cb0244f1fa3ad7766540a73c1
system: ios
version: 14.6
manufacturer: Apple
deviceId: E2E6A424-5EB5-4C44-A896-2788EB37136F
sign: 2xsSTEb8o/w=
Connection: keep-alive
X-AUTH-TYPE: sha1
longitude: 120.591694
Accept-Language: zh-CN
network: WIFI
Accept: */*
Accept-Encoding: gzip, deflate, br
X-API-KEY: eecca5b6365d9607ee5a9d336962c534
X-API-VERSION: 1.6.0
registrationID: 141fe1da9e7db3c637a"""


def format_header(header_str):
    """Parse a raw copy-pasted HTTP header blob into a dict.

    Each non-blank line is split on the FIRST colon only, so values that
    contain colons (e.g. "X-Key: v1: v2") are kept intact, and header lines
    with an empty value (e.g. "cityName:") map to '' instead of crashing
    with IndexError as the original did. The py2 debug print was removed
    and the parsed dict is returned (the original echoed the input string
    back, which no caller used).

    :param header_str: multi-line "Name: value" text as copied from devtools
    :return: dict mapping header name -> value
    """
    headers = {}
    for line in header_str.split("\n"):
        if not line.strip():
            continue
        name, _, value = line.partition(':')
        headers[name.strip()] = value.strip()
    return headers

format_header(aaaaa)

不同的内容 兼容性

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
@staticmethod
def get_detail_info(response):
    """Extract (title, pubtime, content) from a detail page.

    Tries each known page layout in turn; as in the original, every layout
    whose title selector matches overwrites the previous one, so the LAST
    matching layout wins. Returns empty strings when nothing matches.
    """
    pubtime = ''
    title = ''
    content = ''
    layout_a = {
        'pubtime': response.xpath("string(//div[contains(@class,'biaoti')]//following-sibling::div[1])"),
        'title': response.xpath("//div[contains(@class,'biaoti')]"),
        'content': response.xpath("//div[contains(@class,'zhenwen')][last()]")
    }
    layout_b = {
        'pubtime': response.xpath("string(//div[@class='left-time'])"),
        'title': response.xpath("//div[@class='content']/h1/text()"),
        'content': response.xpath("//div[@class='content']")
    }
    for layout in (layout_a, layout_b):
        if not layout['title']:
            continue
        raw_time = layout['pubtime'].get().strip().replace('\r', '').replace('\n', '')
        raw_time = re.findall("\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}", str(raw_time))[0]
        pubtime = datetime.datetime.strptime(raw_time, "%Y年%m月%d日 %H:%M").strftime("%Y-%m-%d %H:%M")
        title = layout['title'].get()
        content = layout['content'].get()
    return title, pubtime, content

视频插入content

1
2
3
4
5
6
7
8
9
10
11
12
"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<video width="600" height="450" preload="auto" controls>
<source src="{}" type="video/mp4">
</video>
</body>
</html>""".format(data['video_url'])

request访问时候使用代理的格式

1
2
3
4
requests.request("POST", url, data=post_data, headers=headers10, cookies=cookies,proxies={'http': proxy, 'https': proxy.replace("http","https")})

requests.request("POST", url, data=post_data, headers=headers10, cookies=cookies,proxies={'http': proxy, 'https': proxy})

Prev:
scrapy框架访问链接时,post请求的几种姿势
Next:
python 使用 openpyxl 模块控制 excel 常用操作
catalog
catalog