Python Web Scraping

URL

General format of a URL (parts in square brackets [] are optional)

protocol://hostname[:port]/path/[;Parameters][?query]#fragment

A URL consists of three parts:

  • The first part is the protocol: http, https, ftp, file, ed2k, …
  • The second part is the domain name system or IP address of the server hosting the resource
    (sometimes a port number is included; every transfer protocol has a default port, e.g. HTTP's default port is 80)
  • The third part is the concrete address of the resource, such as a directory or file name
    (the urlparse sketch below shows how Python splits these parts apart)
    url + lib = urllib
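A quick way to see these parts in practice is urllib.parse.urlparse (a minimal sketch; the example URL is made up):

from urllib.parse import urlparse

# Split an example URL into its components
parts = urlparse('http://www.example.com:80/path/index.html;params?key=value#section')
print(parts.scheme)    # 'http'               -- the protocol
print(parts.netloc)    # 'www.example.com:80' -- hostname[:port]
print(parts.path)      # '/path/index.html'   -- the resource address
print(parts.params)    # 'params'
print(parts.query)     # 'key=value'
print(parts.fragment)  # 'section'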

Python Example 1: Downloading a Cat Picture

# Import the os module and the urllib.request module
# (os is used for file-system access; urllib.request is an extensible library for opening URLs)
import os
import urllib.request

# Open the URL directly
response = urllib.request.urlopen('http://placekitten.com/g/700/600')
cat_img = response.read()
# With the "with open" syntax, Python calls close() on the file automatically
with open('cat_700_600.jpg', 'wb') as f:
    f.write(cat_img)

# Instantiate the Request class to get a req object
req = urllib.request.Request('http://placekitten.com/g/900/600')
response = urllib.request.urlopen(req)
cat_img = response.read()
with open('cat_900_600.jpg', 'wb') as f:
    f.write(cat_img)

# geturl() returns the URL of the resource actually retrieved
print(response.geturl())
# info() returns the response headers / page metadata
print(response.info())
# getcode() returns the HTTP status code; 200 means OK
print(response.getcode())

Python Example 2: Youdao Translate

In the browser, right-click and choose Inspect, open the Network tab, check the Method column, and select the Name entry whose method is POST.

General
# The actual address that implements the translation request
Request URL: http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule
# Request method
Request Method: POST
Status Code: 200 OK
# Server IP and the open port
Remote Address: 220.181.76.83:80
Referrer Policy: no-referrer-when-downgrade

# The headers the client sends with the request; servers often use them to detect non-human visitors
Request Headers
# The User-Agent is the usual tell: a plain urllib request identifies itself as Python
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36

# The main payload the client submits
Form Data

# Open and read the content at a URL
import urllib.request
# Parse, join, encode and decode URLs
import urllib.parse
import json

# Note: the '_o' seen in the captured Request URL is dropped here; the plain
# /translate endpoint does not enforce the salt/sign check
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
content = input('Enter the text to translate: ')
# Build a dict from the Form Data shown in the browser's inspector
data = {'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15739061823836',
        'sign': '682b8fd39bb15ad56d2a3b1e9b0c54f1',
        'ts': '1573906182383',
        'bv': '75551116684a442e8625ebfc9e5af1ba',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION'}

# URL-encode the form data, then encode it to UTF-8 bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# Passing data makes this a POST request
response = urllib.request.urlopen(url, data)
html = response.read().decode('utf-8')
# Parse the JSON response
target = json.loads(html)
print('Translation result: %s' % (target['translateResult'][0][0]['tgt']))
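The indexing target['translateResult'][0][0]['tgt'] follows the shape of the JSON the API returns: translateResult is a list of paragraphs, each a list of segments with src/tgt fields. An illustrative example (the sample body below is assumed, not captured from a live request):

import json

# Assumed response shape, for illustration only
sample = '{"type": "ZH_CN2EN", "errorCode": 0, "translateResult": [[{"src": "你好", "tgt": "hello"}]]}'
target = json.loads(sample)
print(target['translateResult'][0][0]['tgt'])  # -> hello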

Modifying the Header

Modify the headers before the Request object is created, by passing the head argument:
import urllib.request
import urllib.parse
import json

content = input('Enter the text to translate: ')
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'

data = {'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15739061823836',
        'sign': '682b8fd39bb15ad56d2a3b1e9b0c54f1',
        'ts': '1573906182383',
        'bv': '75551116684a442e8625ebfc9e5af1ba',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION'}

data = urllib.parse.urlencode(data).encode('utf-8')
# Pass the headers dict as the third argument of Request
req = urllib.request.Request(url, data, head)

response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
target = json.loads(html)
print('Translation result: %s' % (target['translateResult'][0][0]['tgt']))
Or modify the headers after the Request object is created, with add_header():
import urllib.request
import urllib.parse
import json

content = input('Enter the text to translate: ')
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

data = {'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15739061823836',
        'sign': '682b8fd39bb15ad56d2a3b1e9b0c54f1',
        'ts': '1573906182383',
        'bv': '75551116684a442e8625ebfc9e5af1ba',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION'}

data = urllib.parse.urlencode(data).encode('utf-8')
req = urllib.request.Request(url, data)
# add_header() takes the header name and value as two arguments
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36')

response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
target = json.loads(html)
print('Translation result: %s' % (target['translateResult'][0][0]['tgt']))

How to Counter Websites That Limit Crawlers by Per-IP Request Counts

Set a delay between submissions
import urllib.request
import urllib.parse
import json
import time


while True:
    content = input('Enter the text to translate (enter "Q" to quit): ')
    if content == 'Q':
        print('Program finished!')
        break
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    data = {'i': content,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': '15739061823836',
            'sign': '682b8fd39bb15ad56d2a3b1e9b0c54f1',
            'ts': '1573906182383',
            'bv': '75551116684a442e8625ebfc9e5af1ba',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_CLICKBUTTION'}

    data = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, data)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36')

    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    target = json.loads(html)
    print('Translation result: %s' % (target['translateResult'][0][0]['tgt']))

    # Put the program to sleep before the next request
    time.sleep(5)
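A fixed 5-second delay is easy for a server to spot. One common refinement is to add random jitter (a minimal sketch; the 3-8 second range is an arbitrary choice):

import random
import time

# Sleep a random interval instead of a fixed one, so the request
# pattern looks less mechanical
time.sleep(random.uniform(3, 8))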
Use a proxy
Steps to set up a proxy (each step appears in the code below):
  1. The argument is a dict of the form {'protocol': 'proxy-ip:port'}
     proxy_support = urllib.request.ProxyHandler({})
  2. Build a customized opener
     opener = urllib.request.build_opener(proxy_support)
  3. Install the opener
     urllib.request.install_opener(opener)
  4. Call the opener
     opener.open(url)
import urllib.request
import random


url = 'http://www.gpsspg.com/ip/'

iplist = ['125.43.64.150:28027',
          '125.43.64.150:28028',
          '183.52.107.77:37083',
          '183.52.107.77:37084',
          '58.19.82.115:60838']

# The argument is a dict {protocol: 'proxy-ip:port'}; pick a proxy at random
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})

opener = urllib.request.build_opener(proxy_support)

# The attribute is addheaders, a list of (name, value) tuples;
# assigning to opener.add_header would have no effect
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36')]

urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')

print(html)
Free public proxies are unreliable; if the chosen proxy happens to be dead, the request times out with an error like this:

Traceback (most recent call last):
  File "E:\Program File\python\python38\lib\urllib\request.py", line 1317, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "E:\Program File\python\python38\lib\http\client.py", line 1230, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "E:\Program File\python\python38\lib\http\client.py", line 1276, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "E:\Program File\python\python38\lib\http\client.py", line 1225, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "E:\Program File\python\python38\lib\http\client.py", line 1004, in _send_output
    self.send(msg)
  File "E:\Program File\python\python38\lib\http\client.py", line 944, in send
    self.connect()
  File "E:\Program File\python\python38\lib\http\client.py", line 915, in connect
    self.sock = self._create_connection(
  File "E:\Program File\python\python38\lib\socket.py", line 807, in create_connection
    raise err
  File "E:\Program File\python\python38\lib\socket.py", line 796, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "e:/Data/py_data/Untitled-2.py", line 21, in <module>
    response = urllib.request.urlopen(url)
  File "E:\Program File\python\python38\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "E:\Program File\python\python38\lib\urllib\request.py", line 525, in open
    response = self._open(req, data)
  File "E:\Program File\python\python38\lib\urllib\request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "E:\Program File\python\python38\lib\urllib\request.py", line 502, in _call_chain
    result = func(*args)
  File "E:\Program File\python\python38\lib\urllib\request.py", line 1346, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "E:\Program File\python\python38\lib\urllib\request.py", line 1320, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.>
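Because free proxies come and go, a single random.choice() crashes the moment the chosen proxy is dead, as the traceback above shows. A minimal sketch (reusing the iplist from the example) that falls back to the next proxy on failure instead:

import random
import urllib.error
import urllib.request

url = 'http://www.gpsspg.com/ip/'
iplist = ['125.43.64.150:28027',
          '183.52.107.77:37083',
          '58.19.82.115:60838']

# Try the proxies in random order; move on to the next one on failure
random.shuffle(iplist)
for ip in iplist:
    proxy_support = urllib.request.ProxyHandler({'http': ip})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    try:
        with opener.open(url, timeout=10) as response:
            print(response.read().decode('utf-8'))
        break  # success, stop trying other proxies
    except (urllib.error.URLError, OSError) as e:
        print('Proxy %s failed: %s' % (ip, e))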