import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def main():
    # Pretend to be the Baidu crawler so Zhihu is less likely to block the request.
    headers = {'user-agent': 'Baiduspider'}
    # Route the request through an HTTP proxy.
    proxies = {
        'http': 'http://122.114.31.177:808'
    }
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    resp = requests.get(seed_url,
                        headers=headers,
                        proxies=proxies)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Match links that point to question pages, e.g. /question/123456.
    href_regex = re.compile(r'^/question')
    link_set = set()
    for a_tag in soup.find_all('a', {'href': href_regex}):
        if 'href' in a_tag.attrs:
            href = a_tag.attrs['href']
            # Turn the relative href into an absolute URL and de-duplicate via the set.
            full_url = urljoin(base_url, href)
            link_set.add(full_url)
    print('Total %d question pages found.' % len(link_set))


if __name__ == '__main__':
    main()
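

# A possible extension (not part of the original script): fetch each collected
# question page and print its <title>. This is a minimal sketch; the helper
# name fetch_question_title and the reuse of the Baiduspider header are
# assumptions, and the proxy is omitted for brevity.
#
# def fetch_question_title(url):
#     resp = requests.get(url, headers={'user-agent': 'Baiduspider'})
#     soup = BeautifulSoup(resp.text, 'lxml')
#     return soup.title.string if soup.title else None
#
# Inside main(), after link_set is built:
#     for url in link_set:
#         print(url, fetch_question_title(url))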