From 2520ad914317994d92c92b970fd8bc4f965fe880 Mon Sep 17 00:00:00 2001
From: injetlee
Date: Sun, 2 Sep 2018 17:35:12 +0800
Subject: [PATCH] lagou.py

---
 Crawer/README.MD        |  2 -
 Crawer/meizitu.py       | 77 --------------------------------------
 Crawer/qiubai_crawer.py | 54 ---------------------------
 爬虫集合/lagou.py       | 83 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 133 deletions(-)
 delete mode 100644 Crawer/README.MD
 delete mode 100644 Crawer/meizitu.py
 delete mode 100644 Crawer/qiubai_crawer.py
 create mode 100644 爬虫集合/lagou.py

diff --git a/Crawer/README.MD b/Crawer/README.MD
deleted file mode 100644
index b73228f..0000000
--- a/Crawer/README.MD
+++ /dev/null
@@ -1,2 +0,0 @@
-# See the article for a detailed explanation of the code
-
diff --git a/Crawer/meizitu.py b/Crawer/meizitu.py
deleted file mode 100644
index e26a83d..0000000
--- a/Crawer/meizitu.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import requests
-import os
-import time
-import threading
-from bs4 import BeautifulSoup
-
-
-def download_page(url):
-    '''
-    Download a page.
-    '''
-    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
-    r = requests.get(url, headers=headers)
-    r.encoding = 'gb2312'
-    return r.text
-
-
-def get_pic_list(html):
-    '''
-    Get the list of photo sets on the page, then call get_pic for each set to fetch its images.
-    '''
-    soup = BeautifulSoup(html, 'html.parser')
-    pic_list = soup.find_all('li', class_='wp-item')
-    for i in pic_list:
-        a_tag = i.find('h3', class_='tit').find('a')
-        link = a_tag.get('href')
-        text = a_tag.get_text()
-        get_pic(link, text)
-
-
-def get_pic(link, text):
-    '''
-    Download and save the images on the current page.
-    '''
-    html = download_page(link)  # download the page
-    soup = BeautifulSoup(html, 'html.parser')
-    pic_list = soup.find('div', id="picture").find_all('img')  # find all images on the page
-    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
-    create_dir('pic/{}'.format(text))
-    for i in pic_list:
-        pic_link = i.get('src')  # get the actual image URL
-        r = requests.get(pic_link, headers=headers)  # download the image, then save it to a file
-        with open('pic/{}/{}'.format(text, link.split('/')[-1]), 'wb') as f:
-            f.write(r.content)
-            time.sleep(1)  # pause briefly so we don't put too much load on the site and get banned
-
-
-def create_dir(name):
-    if not os.path.exists(name):
-        os.makedirs(name)
-
-
-def execute(url):
-    page_html = download_page(url)
-    get_pic_list(page_html)
-
-
-def main():
-    create_dir('pic')
-    queue = [i for i in range(1, 72)]  # page numbers used to build the URLs
-    threads = []
-    while len(queue) > 0:
-        for thread in threads:
-            if not thread.is_alive():
-                threads.remove(thread)
-        while len(threads) < 5 and len(queue) > 0:  # cap the number of worker threads at 5
-            cur_page = queue.pop(0)
-            url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
-            thread = threading.Thread(target=execute, args=(url,))
-            thread.setDaemon(True)
-            thread.start()
-            print('{}正在下载{}页'.format(threading.current_thread().name, cur_page))
-            threads.append(thread)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/Crawer/qiubai_crawer.py b/Crawer/qiubai_crawer.py
deleted file mode 100644
index e37e7e7..0000000
--- a/Crawer/qiubai_crawer.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-
-def download_page(url):
-    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
-    r = requests.get(url, headers=headers)
-    return r.text
-
-
-def get_content(html, page):
-    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
-    soup = BeautifulSoup(html, 'html.parser')
-    con = soup.find(id='content-left')
-    con_list = con.find_all('div', class_="article")
-    for i in con_list:
-        author = i.find('h2').string  # get the author's name
-        content = i.find('div', class_='content').find('span').get_text()  # get the post content
-        stats = i.find('div', class_='stats')
-        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
-        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
-        author_info = i.find('div', class_='articleGender')  # get the author's age and gender
-        if author_info is not None:  # non-anonymous user
-            class_list = author_info['class']
-            if "womenIcon" in class_list:
-                gender = '女'
-            elif "manIcon" in class_list:
-                gender = '男'
-            else:
-                gender = ''
-            age = author_info.string  # get the age
-        else:  # anonymous user
-            gender = ''
-            age = ''
-
-        save_txt(output.format(page, author, gender, age, vote, comment, content))
-
-
-def save_txt(*args):
-    for i in args:
-        with open('qiubai.txt', 'a', encoding='utf-8') as f:
-            f.write(i)
-
-
-def main():
-    # Click the link below; at the bottom of the page you can see there are 13 pages in total, so we can construct the URLs directly,
-    # although ideally we would use Beautiful Soup to read how many pages there are from the bottom of the page.
-    for i in range(1, 14):
-        url = 'https://qiushibaike.com/text/page/{}'.format(i)
-        html = download_page(url)
-        get_content(html, i)
-
-if __name__ == '__main__':
-    main()
diff --git a/爬虫集合/lagou.py b/爬虫集合/lagou.py
new file mode 100644
index 0000000..3a7a55c
--- /dev/null
+++ b/爬虫集合/lagou.py
@@ -0,0 +1,83 @@
+import random
+import time
+
+import requests
+from openpyxl import Workbook
+import pymysql.cursors
+
+
+def get_conn():
+    '''Create a MySQL database connection.'''
+    conn = pymysql.connect(host='localhost',
+                           user='root',
+                           password='root',
+                           db='python',
+                           charset='utf8mb4',
+                           cursorclass=pymysql.cursors.DictCursor)
+    return conn
+
+
+def insert(conn, info):
+    '''Write one row of job data to the database.'''
+    with conn.cursor() as cursor:
+        sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
+        cursor.execute(sql, info)
+        conn.commit()
+
+
+def get_json(url, page, lang_name):
+    '''Return the list of job postings on the given results page.'''
+    headers = {
+        'Host': 'www.lagou.com',
+        'Connection': 'keep-alive',
+        'Content-Length': '23',
+        'Origin': 'https://www.lagou.com',
+        'X-Anit-Forge-Code': '0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'X-Requested-With': 'XMLHttpRequest',
+        'X-Anit-Forge-Token': 'None',
+        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
+    }
+    data = {'first': 'false', 'pn': page, 'kd': lang_name}
+    json = requests.post(url, data, headers=headers).json()
+    list_con = json['content']['positionResult']['result']
+    info_list = []
+    for i in list_con:
+        info = []
+        info.append(i.get('companyShortName', '无'))
+        info.append(i.get('companyFullName', '无'))
+        info.append(i.get('industryField', '无'))
+        info.append(i.get('companySize', '无'))
+        info.append(i.get('salary', '无'))
+        info.append(i.get('city', '无'))
+        info.append(i.get('education', '无'))
+        info_list.append(info)
+    return info_list
+
+
+def main():
+    lang_name = 'python'
+    wb = Workbook()  # create the Excel workbook
+    conn = get_conn()  # open the database connection; comment this line out if you are not storing to MySQL
+    for i in ['北京', '上海', '广州', '深圳', '杭州']:  # the five cities to crawl
+        page = 1
+        ws1 = wb.active
+        ws1.title = lang_name
+        url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
+        while page < 31:  # 30 pages of listings per city
+            info = get_json(url, page, lang_name)
+            page += 1
+            print(i, 'page', page)
+            time.sleep(random.randint(10, 20))
+            for row in info:
+                # insert(conn, tuple(row))  # write the row to the database; comment this line out if you do not want to store the data
+                ws1.append(row)
+    conn.close()  # close the database connection; comment this line out if you are not storing to MySQL
+    wb.save('{}职位信息.xlsx'.format(lang_name))
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
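
Note on the MySQL path in lagou.py: insert() writes into a table named `python` inside the `python` database that get_conn() connects to, but the patch itself does not define that table. The sketch below is one way to create a matching table before enabling the insert(conn, tuple(row)) line; only the column names are taken from the INSERT statement in the patch, while the column types and lengths are assumptions that can be adjusted freely.

import pymysql

# Connection parameters mirror get_conn() in lagou.py; adjust them to your setup.
conn = pymysql.connect(host='localhost', user='root', password='root',
                       db='python', charset='utf8mb4')

# Hypothetical schema: the column names follow the INSERT statement in insert();
# the types and lengths are guesses, not part of the original patch.
create_sql = """
CREATE TABLE IF NOT EXISTS `python` (
    `id` INT AUTO_INCREMENT PRIMARY KEY,
    `shortname` VARCHAR(64),
    `fullname` VARCHAR(128),
    `industryfield` VARCHAR(128),
    `companySize` VARCHAR(64),
    `salary` VARCHAR(32),
    `city` VARCHAR(32),
    `education` VARCHAR(32)
) DEFAULT CHARSET=utf8mb4
"""

with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()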