-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Expand file tree
/
Copy pathtaiyangdaili.py
More file actions
31 lines (25 loc) · 876 Bytes
/
taiyangdaili.py
File metadata and controls
31 lines (25 loc) · 876 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from pyquery import PyQuery as pq
BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
# Number of free-proxy listing pages to fetch. The original code set this to
# 5 * 2 (= 10) but then hard-coded range(1, 6) in `urls`, leaving the constant
# dead and contradictory; it now reflects the actual page count and drives
# `urls` directly.
MAX_PAGE = 5


class TaiyangdailiCrawler(BaseCrawler):
    """
    taiyangdaili crawler, http://www.taiyanghttp.com/free/
    """
    # Derive the page URLs from MAX_PAGE so adjusting one constant changes
    # how many listing pages get crawled.
    urls = [BaseUrl.format(num=i) for i in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        Parse one listing page and yield the proxies it contains.

        :param html: raw HTML of a free-proxy listing page
        :return: generator of Proxy objects; host and port are the text of
                 the first and second column of each table row
        """
        doc = pq(html)
        # Each proxy occupies one ".tr.ip_tr" row inside the "#ip_list" table.
        for row in doc('#ip_list .tr.ip_tr').items():
            host = row.find('div:nth-child(1)').text()
            port = row.find('div:nth-child(2)').text()
            yield Proxy(host=host, port=port)
if __name__ == '__main__':
    # Manual smoke test: crawl all pages and dump each proxy to stdout.
    for proxy in TaiyangdailiCrawler().crawl():
        print(proxy)