Python/爬虫集合/lagou.py

83 lines
3.1 KiB
Python

import random
import time
import requests
from openpyxl import Workbook
import pymysql.cursors
def get_conn():
'''建立数据库连接'''
conn = pymysql.connect(host='localhost',
user='root',
password='root',
db='python',
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
return conn
def insert(conn, info):
'''数据写入数据库'''
with conn.cursor() as cursor:
sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
cursor.execute(sql, info)
conn.commit()
def get_json(url, page, lang_name):
'''返回当前页面的信息列表'''
headers = {
'Host': 'www.lagou.com',
'Connection': 'keep-alive',
'Content-Length': '23',
'Origin': 'https://www.lagou.com',
'X-Anit-Forge-Code': '0',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'X-Anit-Forge-Token': 'None',
'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}
data = {'first': 'false', 'pn': page, 'kd': lang_name}
json = requests.post(url, data, headers=headers).json()
list_con = json['content']['positionResult']['result']
info_list = []
for i in list_con:
info = []
info.append(i.get('companyShortName', ''))
info.append(i.get('companyFullName', ''))
info.append(i.get('industryField', ''))
info.append(i.get('companySize', ''))
info.append(i.get('salary', ''))
info.append(i.get('city', ''))
info.append(i.get('education', ''))
info_list.append(info)
return info_list
def main():
lang_name = 'python'
wb = Workbook() # 打开 excel 工作簿
conn = get_conn() # 建立数据库连接 不存数据库 注释此行
for i in ['北京', '上海', '广州', '深圳', '杭州']: # 五个城市
page = 1
ws1 = wb.active
ws1.title = lang_name
url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
while page < 31: # 每个城市30页信息
info = get_json(url, page, lang_name)
page += 1
print(i, 'page', page)
time.sleep(random.randint(10, 20))
for row in info:
insert(conn, tuple(row)) # 插入数据库,若不想存入 注释此行
ws1.append(row)
conn.close() # 关闭数据库连接,不存数据库 注释此行
wb.save('{}职位信息.xlsx'.format(lang_name))
if __name__ == '__main__':
main()