Using a Proxy Pool to Beat Anti-Crawling and Scrape WeChat Articles

Last updated: the evening of January 27, 2021

Sogou's WeChat search (http://weixin.sogou.com/) already does a first layer of WeChat article crawling for us: through it we can get lists of WeChat articles and some information about the official accounts that publish them.
It has plenty of anti-crawling measures, though: once it detects abnormal traffic from your IP it blocks you, and requests are answered with a 302 redirect instead of a results page.
This post works around the anti-crawling by routing requests through a proxy pool while scraping the articles.
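
The core trick is easy to state: send requests with redirects disabled, treat a 200 as a normal results page, and treat a 302 as the signal that the current IP has been flagged and a fresh proxy is needed. A minimal sketch of just that check (the query and headers here are placeholders; the full script below does the real work):

import requests

# Probe a Sogou WeChat search page without following redirects.
# A 302 here means the IP has been flagged and we should switch to a proxy.
resp = requests.get(
    'https://weixin.sogou.com/weixin?type=2&query=test',
    headers={'User-Agent': 'Mozilla/5.0'},
    allow_redirects=False,
)
if resp.status_code == 302:
    print('IP flagged, switch to a proxy from the pool')
elif resp.status_code == 200:
    print('Got a normal results page')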

Libraries used

  • urllib
  • lxml
  • pyquery
  • requests
  • re
  • pymongo
  • sys

Proxy pool
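
The script in this post does not implement the proxy pool itself. It assumes a pool service is already running locally and returns one usable proxy as a plain "ip:port" string from http://127.0.0.1:5555/random (this is the interface exposed by the ProxyPool project from the common Python3 crawler tutorials; adjust the URL if your pool uses a different endpoint). A minimal sketch of fetching a proxy from such a service and sanity-checking it:

import requests

PROXY_POOL_URL = 'http://127.0.0.1:5555/random'  # assumed local proxy pool endpoint

def fetch_proxy():
    # The pool is expected to answer with a bare "ip:port" string.
    resp = requests.get(PROXY_POOL_URL)
    if resp.status_code == 200:
        return resp.text.strip()
    return None

proxy = fetch_proxy()
if proxy:
    # Quick check: route a request through the proxy and look at the outgoing IP.
    test = requests.get('http://httpbin.org/ip',
                        proxies={'http': 'http://' + proxy},
                        timeout=10)
    print('Proxy works:', proxy, test.text)
else:
    print('No proxy available from the pool')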

Result screenshot

Implementation

# -*- coding: utf-8 -*-
"""
@Time : 2021/1/27 12:29
@Auth : Ne-21
@File :weixin_spider.py
@IDE :PyCharm
@Motto:Another me.

"""
from urllib.parse import urlencode
from lxml.etree import XMLSyntaxError
from pyquery import PyQuery as pq
import requests
import re
import pymongo
import sys

sys.setrecursionlimit(3000)  # raise the default recursion limit to 3000 (some functions below retry themselves recursively)

base_url = 'https://weixin.sogou.com/weixin?'
headers = {
    'Referer': 'https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=1&ie=utf8',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 '
                  'Safari/537.36',
    'Host': 'weixin.sogou.com',
    'Cookie': ''  # paste your own weixin.sogou.com cookie here if needed
}
keyword = '云计算'  # search keyword ("cloud computing")

# Proxy pool endpoint; `proxy` holds the proxy currently in use (None = direct connection)
proxy_pool_url = 'http://127.0.0.1:5555/random'
proxy = None

# Maximum number of attempts per URL
max_count = 5

# MongoDB connection settings
MONGO_URL = 'localhost'
MONGO_DB = 'weixin'
MONGO_TABLE = 'articles'

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Fetch a random proxy from the local proxy pool
def get_proxy():
    try:
        response = requests.get(proxy_pool_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


# Build the search URL for a keyword/page and fetch the results page
def get_index(keyword, page):
    data = {
        'query': keyword,
        'page': page,
        'type': '2'
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html


# Fetch a search results page, switching to a proxy whenever Sogou answers with a 302
def get_html(url, count=1):
    print('Requesting', url)
    print('Attempt', count)
    global proxy
    # Give up once the attempt limit is reached
    if count >= max_count:
        print('Too many attempts, giving up')
        return None
    try:
        # Use the current proxy if one has been set
        if proxy:
            proxies = {
                'http': 'http://' + proxy,
                'https': 'http://' + proxy  # the search URL is https, so cover both schemes
            }
            response = requests.get(url, headers=headers, allow_redirects=False, proxies=proxies)
        else:
            # allow_redirects=False: do not follow the anti-spider redirect automatically
            response = requests.get(url, headers=headers, allow_redirects=False)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # 302 means the current IP has been flagged: fetch a new proxy and retry
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                return get_html(url, count + 1)
            else:
                print('Get Proxy Failed')
                return None
    except ConnectionError as e:
        print('Error', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


# Parse the search results page and yield the Sogou link of each article
def parse_index(html):
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield 'https://weixin.sogou.com' + item.attr('href')


# Recover the real article URL: the Sogou link returns a JS snippet that builds
# the target URL piece by piece with "url += '...';" statements
def get_detail_true_url(url):
    global proxy
    try:
        if proxy:
            proxies = {
                'http': 'http://' + str(proxy),
                'https': 'http://' + str(proxy)
            }
            response = requests.get(url, headers=headers, allow_redirects=False, proxies=proxies)
        else:
            # allow_redirects=False: do not follow redirects automatically
            response = requests.get(url, headers=headers, allow_redirects=False)
        if response.status_code == 200:
            text = response.text
            url_param = re.compile(r"url \+= '(.*?)';", re.S).findall(text)
            true_url = ''
            for i in url_param:
                true_url += i
            # Strip the '@' characters inserted into the URL pieces
            true_url = true_url.replace('@', '')
            return true_url
    except:
        # On any failure, retry (unbounded recursion, hence the raised recursion limit above)
        return get_detail_true_url(url)


# Download the article page itself from the real URL
def get_detail(true_url):
    try:
        response = requests.get(true_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


# Extract title, body text, publish date, account name and WeChat ID from the article page
def parse_detail(html):
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#publish_time').text()
        nickname = doc('#js_name').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None



# Upsert by title: update the record if it already exists, insert it otherwise
def save_to_mongo(data):
    if db[MONGO_TABLE].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to MongoDB', data['title'])
    else:
        print('Failed to save', data['title'])

# Crawl search result pages 1-99 for the keyword and store every parsed article
def main():
    for page in range(1, 100):
        html = get_index(keyword, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                # Resolve the Sogou redirect link to the real mp.weixin.qq.com URL
                article_true_url = get_detail_true_url(article_url)
                article_html = get_detail(article_true_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()
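
After the crawler has run for a while, it is easy to check what ended up in MongoDB. A small sketch, reusing the same connection settings as above (the collection and field names are exactly the ones written by save_to_mongo):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['weixin']['articles']

# Number of articles stored so far
print('Stored articles:', collection.count_documents({}))

# Title and account name of the five most recently inserted documents
for doc in collection.find({}, {'title': 1, 'nickname': 1, '_id': 0}).sort('_id', -1).limit(5):
    print(doc)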