Python/douban_movie.py

#!/usr/bin/env python
# encoding=utf-8
import requests
import re
import codecs
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()
dest_filename = '电影.xlsx'
ws1 = wb.active
ws1.title = "电影top250"

DOWNLOAD_URL = 'http://movie.douban.com/top250/'


def download_page(url):
    """获取url地址页面内容"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    data = requests.get(url, headers=headers).content
    return data


def get_li(doc):
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    name = []  # 名字
    star_con = []  # 评价人数
    score = []  # 评分
    info_list = []  # 短评
    for i in ol.find_all('li'):
        detail = i.find('div', attrs={'class': 'hd'})
        movie_name = detail.find(
            'span', attrs={'class': 'title'}).get_text()  # 电影名字
        level_star = i.find(
            'span', attrs={'class': 'rating_num'}).get_text()  # 评分
        star = i.find('div', attrs={'class': 'star'})
        star_num = star.find(text=re.compile('评价'))  # 评价

        info = i.find('span', attrs={'class': 'inq'})  # 短评
        if info:  # 判断是否有短评
            info_list.append(info.get_text())
        else:
            info_list.append('无')
        score.append(level_star)

        name.append(movie_name)
        star_con.append(star_num)
    page = soup.find('span', attrs={'class': 'next'}).find('a')  # 获取下一页
    if page:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None


def main():
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name = name + movie
        star_con = star_con + star
        score = score + level_num
        info = info + info_list
    for (i, m, o, p) in zip(name, star_con, score, info):
        col_A = 'A%s' % (name.index(i) + 1)
        col_B = 'B%s' % (name.index(i) + 1)
        col_C = 'C%s' % (name.index(i) + 1)
        col_D = 'D%s' % (name.index(i) + 1)
        ws1[col_A] = i
        ws1[col_B] = m
        ws1[col_C] = o
        ws1[col_D] = p
    wb.save(filename=dest_filename)


if __name__ == '__main__':
    main()
0 2016-06-23 23:27:21 +08:00			`#!/usr/bin/env python`
			`# encoding=utf-8`
12.9 2016-12-09 14:22:52 +08:00			`import requests`
			`import re`
0 2016-06-23 23:27:21 +08:00			`import codecs`
			`from bs4 import BeautifulSoup`
			`from openpyxl import Workbook`
			`wb = Workbook()`
			`dest_filename = '电影.xlsx'`
12.9 2016-12-09 14:22:52 +08:00			`ws1 = wb.active`
0 2016-06-23 23:27:21 +08:00			`ws1.title = "电影top250"`

			`DOWNLOAD_URL = 'http://movie.douban.com/top250/'`


			`def download_page(url):`
			`"""获取url地址页面内容"""`
			`headers = {`
			`'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'`
			`}`
			`data = requests.get(url, headers=headers).content`
			`return data`


			`def get_li(doc):`
			`soup = BeautifulSoup(doc, 'html.parser')`
			`ol = soup.find('ol', class_='grid_view')`
12.9 2016-12-09 14:22:52 +08:00			`name = [] # 名字`
			`star_con = [] # 评价人数`
			`score = [] # 评分`
			`info_list = [] # 短评`
0 2016-06-23 23:27:21 +08:00			`for i in ol.find_all('li'):`
			`detail = i.find('div', attrs={'class': 'hd'})`
12.9 2016-12-09 14:22:52 +08:00			`movie_name = detail.find(`
			`'span', attrs={'class': 'title'}).get_text() # 电影名字`
			`level_star = i.find(`
			`'span', attrs={'class': 'rating_num'}).get_text() # 评分`
			`star = i.find('div', attrs={'class': 'star'})`
			`star_num = star.find(text=re.compile('评价')) # 评价`
0 2016-06-23 23:27:21 +08:00
12.9 2016-12-09 14:22:52 +08:00			`info = i.find('span', attrs={'class': 'inq'}) # 短评`
			`if info: # 判断是否有短评`
0 2016-06-23 23:27:21 +08:00			`info_list.append(info.get_text())`
			`else:`
			`info_list.append('无')`
			`score.append(level_star)`

			`name.append(movie_name)`
			`star_con.append(star_num)`
12.9 2016-12-09 14:22:52 +08:00			`page = soup.find('span', attrs={'class': 'next'}).find('a') # 获取下一页`
0 2016-06-23 23:27:21 +08:00			`if page:`
12.9 2016-12-09 14:22:52 +08:00			`return name, star_con, score, info_list, DOWNLOAD_URL + page['href']`
			`return name, star_con, score, info_list, None`
0 2016-06-23 23:27:21 +08:00

			`def main():`
			`url = DOWNLOAD_URL`
			`name = []`
12.9 2016-12-09 14:22:52 +08:00			`star_con = []`
0 2016-06-23 23:27:21 +08:00			`score = []`
			`info = []`
			`while url:`
			`doc = download_page(url)`
12.9 2016-12-09 14:22:52 +08:00			`movie, star, level_num, info_list, url = get_li(doc)`
0 2016-06-23 23:27:21 +08:00			`name = name + movie`
			`star_con = star_con + star`
12.9 2016-12-09 14:22:52 +08:00			`score = score + level_num`
			`info = info + info_list`
			`for (i, m, o, p) in zip(name, star_con, score, info):`
			`col_A = 'A%s' % (name.index(i) + 1)`
			`col_B = 'B%s' % (name.index(i) + 1)`
			`col_C = 'C%s' % (name.index(i) + 1)`
			`col_D = 'D%s' % (name.index(i) + 1)`
			`ws1[col_A] = i`
0 2016-06-23 23:27:21 +08:00			`ws1[col_B] = m`
			`ws1[col_C] = o`
			`ws1[col_D] = p`
			`wb.save(filename=dest_filename)`

12.9 2016-12-09 14:22:52 +08:00
0 2016-06-23 23:27:21 +08:00			`if __name__ == '__main__':`
			`main()`