Commonly used Python modules for web crawling

Page downloaders

  1. requests
    Installation
    pip install requests

GET requests

  1. Basic GET requests (without and with parameters)
    import requests

    headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"}

    # Without parameters or headers
    ret = requests.get('https://github.com/timeline.json')

    # Parameters embedded directly in the URL
    ret = requests.get('https://movie.douban.com/top250?start=25', headers=headers)

    # Parameters built from a dict
    params = {'start': 25}
    ret = requests.get('https://movie.douban.com/top250', params=params, headers=headers)

    # Commonly used response attributes
    print(ret.url)
    print(ret.text)  # body decoded as str
    print(ret.json())  # body parsed as JSON
    print(ret.content)  # raw body as bytes
    print(ret.status_code)
    print(ret.encoding)
    print(ret.apparent_encoding)
    print(ret.cookies)
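
In practice it helps to set a timeout and surface HTTP errors explicitly; a minimal sketch of defensive usage (the URL is just the example above):

import requests

try:
    ret = requests.get("https://movie.douban.com/top250", timeout=5)  # give up after 5 seconds
    ret.raise_for_status()  # raise requests.exceptions.HTTPError on 4xx/5xx
except requests.exceptions.RequestException as e:
    print("request failed:", e)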

POST requests

import requests

headers = {"User-Agent": "Mozilla/5.0"}  # or reuse the headers dict from the GET section
data = {"key1": "values1", "key2": "values2"}

# Without headers
ret = requests.post("http://127.0.0.1:8000/user/login/", data=data)

# With data and headers
ret = requests.post("http://127.0.0.1:8000/user/login/", data=data, headers=headers)
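
requests can also send a JSON body directly through the json= parameter, which serializes the dict and sets Content-Type: application/json automatically; a minimal sketch against the same placeholder endpoint:

import requests

ret = requests.post("http://127.0.0.1:8000/user/login/", json={"key1": "values1"})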

Special POST request (file upload)

import requests

files = {"file": open("baidu_logo.png", "rb")}  # open for reading in binary mode ("rb", not "wb")
ret = requests.post("http://127.0.0.1:8000/upload/", files=files)
print(ret.status_code)
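
The file entry may also be a tuple to set the filename and content type explicitly, and opening the file in a with block avoids leaking the handle; a sketch against the same placeholder endpoint:

import requests

with open("baidu_logo.png", "rb") as f:
    files = {"file": ("baidu_logo.png", f, "image/png")}  # (filename, file object, content type)
    ret = requests.post("http://127.0.0.1:8000/upload/", files=files)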

Saving a binary file

# Save the Baidu logo image
import requests

ret = requests.get("https://www.baidu.com/img/pc_675fe66eab33abff35a2669768c43d95.png")
with open("baidu_logo.png", "wb") as f:
    f.write(ret.content)  # ret.content is the raw bytes; the with block closes the file automatically
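
For large files, streaming the body avoids holding it all in memory; a minimal sketch of the same download using stream=True:

import requests

ret = requests.get("https://www.baidu.com/img/pc_675fe66eab33abff35a2669768c43d95.png", stream=True)
with open("baidu_logo.png", "wb") as f:
    for chunk in ret.iter_content(chunk_size=8192):  # read the body in 8 KB chunks
        f.write(chunk)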

Getting cookies

import requests

response = requests.get("http://www.baidu.com")
print(response.cookies)

for key, value in response.cookies.items():
    print("{key} = {value}".format(key=key, value=value))

Sending requests with a Session object

import requests, json

session = requests.Session()

data = {
    "user": "root",
    "password": "xxxxxx"
}
ret = session.post("http://127.0.0.1:8000/user/login/", data=data)

# Extract the login token from the JSON response
ret = json.loads(ret.text)
sign = ret['data']['token']

# Use the token to authenticate subsequent requests
session.get("http://127.0.0.1:8000/srv", headers={"Authorization": "jwt " + sign})
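
A Session keeps cookies across requests and can also carry default headers, so the token only needs to be set once; a sketch, with <token> standing in for the value extracted above:

import requests

session = requests.Session()
session.headers.update({"Authorization": "jwt <token>"})  # sent with every request from this session
ret = session.get("http://127.0.0.1:8000/srv")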

HTTPS certificate verification

# Verify against a specified CA bundle (verify=, not cert=, which is for client certificates)
ret = requests.get("https://127.0.0.1:8000/", verify="cert_path")

# Disable certificate verification (and suppress the InsecureRequestWarning)
import urllib3
urllib3.disable_warnings()

ret = requests.get("https://127.0.0.1:8000/", verify=False)
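
When verification fails, requests raises requests.exceptions.SSLError; a minimal sketch of handling it:

import requests

try:
    ret = requests.get("https://127.0.0.1:8000/")
except requests.exceptions.SSLError as e:
    print("certificate verification failed:", e)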

Checking the status_code

import requests

ret = requests.get("http://127.0.0.1:8000")
print(ret.status_code == requests.codes.ok)
print(ret.status_code == requests.codes.unauthorized)


"""
# Informational.
100: (continue,),
101: (switching_protocols,),
102: (processing,),
103: (checkpoint,),
122: (uri_too_long, request_uri_too_long),
200: (ok, okay, all_ok, all_okay, all_good, \\o/, ✓),
201: (created,),
202: (accepted,),
203: (non_authoritative_info, non_authoritative_information),
204: (no_content,),
205: (reset_content, reset),
206: (partial_content, partial),
207: (multi_status, multiple_status, multi_stati, multiple_stati),
208: (already_reported,),
226: (im_used,),

# Redirection.
300: (multiple_choices,),
301: (moved_permanently, moved, \\o-),
302: (found,),
303: (see_other, other),
304: (not_modified,),
305: (use_proxy,),
306: (switch_proxy,),
307: (temporary_redirect, temporary_moved, temporary),
308: (permanent_redirect,
resume_incomplete, resume,), # These 2 to be removed in 3.0

# Client Error.
400: (bad_request, bad),
401: (unauthorized,),
402: (payment_required, payment),
403: (forbidden,),
404: (not_found, -o-),
405: (method_not_allowed, not_allowed),
406: (not_acceptable,),
407: (proxy_authentication_required, proxy_auth, proxy_authentication),
408: (request_timeout, timeout),
409: (conflict,),
410: (gone,),
411: (length_required,),
412: (precondition_failed, precondition),
413: (request_entity_too_large,),
414: (request_uri_too_large,),
415: (unsupported_media_type, unsupported_media, media_type),
416: (requested_range_not_satisfiable, requested_range, range_not_satisfiable),
417: (expectation_failed,),
418: (im_a_teapot, teapot, i_am_a_teapot),
421: (misdirected_request,),
422: (unprocessable_entity, unprocessable),
423: (locked,),
424: (failed_dependency, dependency),
425: (unordered_collection, unordered),
426: (upgrade_required, upgrade),
428: (precondition_required, precondition),
429: (too_many_requests, too_many),
431: (header_fields_too_large, fields_too_large),
444: (no_response, none),
449: (retry_with, retry),
450: (blocked_by_windows_parental_controls, parental_controls),
451: (unavailable_for_legal_reasons, legal_reasons),
499: (client_closed_request,),

# Server Error.
500: (internal_server_error, server_error, /o\\, ✗),
501: (not_implemented,),
502: (bad_gateway,),
503: (service_unavailable, unavailable),
504: (gateway_timeout,),
505: (http_version_not_supported, http_version),
506: (variant_also_negotiates,),
507: (insufficient_storage,),
509: (bandwidth_limit_exceeded, bandwidth),
510: (not_extended,),
511: (network_authentication_required, network_auth, network_authentication),
"""

Simple login authentication


import requests, json

# Log in and parse the token from the JSON response
ret = requests.post("http://127.0.0.1:8000/user/login/",
                    data={"username": "root", "password": "3e4r5t6y"})
a = json.loads(ret.text)
sign = a['data']['token']

# Send the token with subsequent requests (a requests.Session() works the same way)
ret1 = requests.get("http://127.0.0.1:8000/srv/", headers={"Authorization": "jwt " + sign})



  2. scrapy
  3. selenium (minimal usage sketch below)
  4. Splash
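
As an illustration of a browser-based downloader, a minimal selenium sketch; it assumes Chrome and a matching chromedriver are installed:

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://movie.douban.com/top250")
html = driver.page_source  # fully rendered HTML, including JS-generated content
driver.quit()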

Page parsers

  1. BeautifulSoup (sketch below)
  2. pyquery
  3. lxml
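
A minimal BeautifulSoup sketch, assuming beautifulsoup4 and the lxml parser are installed (pip install beautifulsoup4 lxml); the HTML snippet is made up for illustration:

from bs4 import BeautifulSoup

html = "<div><a href='/item/1'>first</a><a href='/item/2'>second</a></div>"
soup = BeautifulSoup(html, "lxml")
for a in soup.select("a"):  # CSS selector
    print(a["href"], a.get_text())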

Selector libraries
parsel (sketch below)
Scrapy's Selector
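
A minimal parsel sketch (Scrapy's Selector exposes essentially the same .css()/.xpath() API); the HTML snippet is made up for illustration:

from parsel import Selector

html = "<div><a href='/item/1'>first</a><a href='/item/2'>second</a></div>"
sel = Selector(text=html)
print(sel.css("a::attr(href)").getall())  # ['/item/1', '/item/2']
print(sel.xpath("//a/text()").getall())   # ['first', 'second']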

Data storage
csv
sqlite3 (bundled with Python; sketch below)
pymysql
pymongo
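
A minimal sqlite3 sketch for persisting scraped records; the database file and table are hypothetical:

import sqlite3

conn = sqlite3.connect("spider.db")  # hypothetical database file
conn.execute("CREATE TABLE IF NOT EXISTS movies (title TEXT, rating REAL)")
conn.execute("INSERT INTO movies VALUES (?, ?)", ("肖申克的救赎", 9.7))  # parameterized insert
conn.commit()
print(conn.execute("SELECT * FROM movies").fetchall())
conn.close()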

Other
execjs (sketch below)
pyv8
html5lib
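
A minimal PyExecJS sketch for calling JavaScript from Python, e.g. to reproduce a site's signing function; it requires a JS runtime such as Node.js on the machine:

import execjs

ctx = execjs.compile("function add(x, y) { return x + y; }")
print(ctx.call("add", 1, 2))  # -> 3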