banner
肥皂的小屋

肥皂的小屋

github
steam
bilibili
douban
tg_channel

Python--爬取豆瓣TOP250電影資訊

起因#

最近在實習,要求自己找一個網站爬一下存儲數據到excel表格中

我看電影也是按豆瓣TOP250來篩選的,手動翻頁太麻煩,於是爬一下

[2019-09-02 更新] 後面要做作業答辯,改為存儲到mysql數據庫中

代碼實現#

和網上大部分爬取的文章不同,我想要的是每部電影的劇情簡介資訊

所以需要先獲取每部電影的鏈接,再單獨爬取每部電影

全部代碼如下:

# -*- coding: utf-8 -*-
'''
@author: soapffz
@function: 豆瓣TOP250電影資訊爬取並存儲到mysql數據庫(多線程)
@time: 2019-09-01
'''

import requests
from fake_useragent import UserAgent
from lxml import etree
from tqdm import tqdm
import threading
import pymysql
from re import split

"""
提示庫找不到可複製以下語句解決
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple fake_useragent
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tqdm
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple threading
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pymysql
"""


class Top250(object):
    """Crawl douban.com's Top-250 movie chart, including each movie's
    synopsis, using one thread per URL, and store the rows in a local
    MySQL database (database ``douban_top250``, table ``movies_info``).

    Instantiating the class runs the whole pipeline:
    connect -> (re)create schema -> collect detail URLs -> crawl -> insert.
    """

    def __init__(self):
        ua = UserAgent()  # random User-Agent generator
        self.headers = {"User-Agent": ua.random}
        self.bangdang_l = []  # chart (榜单) page URLs, 10 pages of 25
        self.subject_url_l = []  # per-movie detail-page URLs
        # pymysql connections/cursors are NOT thread-safe; all DB access
        # from worker threads is serialized through this lock.
        self.db_lock = threading.Lock()
        self.connect_mysql()

    def connect_mysql(self):
        """Connect to the local MySQL server; exit the program on failure."""
        try:
            # BUGFIX: PyMySQL 1.0 removed positional connect() arguments,
            # so host/user/password must be passed as keywords.
            self.mysql_conn = pymysql.connect(
                host='localhost', user='root', password='root',
                charset='utf8')
            # Cursor results are returned as tuples by default.
            self.cursor = self.mysql_conn.cursor()
            print("數據庫連接成功")
        except Exception as e:
            print("數據庫連接出錯:{}\n你讓我爬完存哪?不爬了,退出程序!".format(e))
            exit(1)  # non-zero exit code: this is an error, not success
        else:
            self.create_db()

    def create_db(self):
        """Drop any existing ``douban_top250`` database, then recreate the
        database and the ``movies_info`` table; exit on failure."""
        sql_drop_db = "DROP DATABASE IF EXISTS `douban_top250`"
        sql_create_db = "CREATE DATABASE `douban_top250` default charset utf8 COLLATE utf8_general_ci;"
        sql_create_table = """
            CREATE TABLE `movies_info` (
                `電影名稱` varchar(255) NOT NULL,
                `導演` varchar(511) NOT NULL,
                `主演資訊` varchar(511) NOT NULL,
                `類型` varchar(255) NOT NULL,
                `上映日期` varchar(255) NOT NULL,
                `劇情簡介` varchar(511) NOT NULL,
                `排名` varchar(255) NOT NULL,
                `片長` varchar(255) NOT NULL,
                PRIMARY KEY (`排名`)
            )DEFAULT CHARSET=utf8;
            """
        try:
            self.cursor.execute(sql_drop_db)
            self.cursor.execute(sql_create_db)
            # Make the freshly created database the current one.
            self.cursor.execute('use douban_top250;')
            self.cursor.execute(sql_create_table)
        except Exception as e:
            print("數據庫創建出錯:{}\n退出程序!".format(e))
            self.mysql_conn.rollback()  # roll back on error
            self.mysql_conn.close()  # close the connection
            exit(1)
        else:
            print("創建數據庫和表成功,開始爬取每部電影的鏈接..")
            self.get_subject_url()

    def get_subject_url(self):
        """Walk the 10 chart pages to collect every movie's detail URL,
        then crawl each detail page. Exits if nothing was collected
        (usually means douban blocked our IP)."""
        self.bangdang_l = [
            "https://movie.douban.com/top250?start={}&filter=".format(i) for i in range(0, 250, 25)]
        self.multi_thread(self.bangdang_l, self.crawl_bangdang)
        if not self.subject_url_l:
            print("ip被封了,程序退出")
            exit(1)
        else:
            print("{}部TOP電影的鏈接已獲取完畢,開始爬取單部電影,請稍後...".format(
                len(self.subject_url_l)))
            self.multi_thread(self.subject_url_l, self.get_one_movie_info)

    def crawl_bangdang(self, url):
        """Fetch one chart page and append its 25 detail-page links to
        ``self.subject_url_l`` (list.extend is atomic under the GIL)."""
        try:
            req = requests.get(url, headers=self.headers)
            req.encoding = "utf-8"
            html = etree.HTML(req.text)
            # Every movie entry's <a> lives under <div class="hd">.
            url_l = html.xpath('//div[@class="hd"]//a/@href')
            self.subject_url_l.extend(url_l)
        except Exception as e:
            print("爬取榜單資訊時報錯:{}".format(e))
        else:
            # Recover the 1-based page number from the "start=N" parameter.
            print("獲取第{}頁的榜單數據成功...\n".format(
                int(int(split(r"=|&", url)[1])/25 + 1)))

    def multi_thread(self, url_l, target_func):
        """Run ``target_func(url)`` concurrently, one thread per URL,
        and block until all threads have finished.

        :param url_l: iterable of URLs (one thread is spawned per item)
        :param target_func: callable invoked with a single URL
        """
        threads = [threading.Thread(target=target_func, args=(url,))
                   for url in url_l]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print("爬取完畢")

    def get_one_movie_info(self, subject):
        """Scrape one movie detail page and insert the extracted row.

        Extracted fields (in order): title, director(s), cast, genre(s),
        release date(s), synopsis, rank, runtime in minutes.
        """
        try:
            req = requests.get(subject, headers=self.headers)
            html = etree.HTML(req.content)
        except Exception as e:
            # BUGFIX: the original format string had no {} placeholder,
            # so the exception text was silently dropped.
            print("爬取報錯:{}".format(e))
        else:
            info = []  # one movie's fields, in table-column order
            # Movie title
            movie_name = html.xpath(
                "//span[@property='v:itemreviewed']/text()")
            info.append(" ".join(movie_name))
            # Director(s)
            director = html.xpath("//a[@rel='v:directedBy']//text()")
            info.append(" ".join(director))
            # Cast
            actor = html.xpath("//a[@rel='v:starring']//text()")
            info.append(" ".join(actor))
            # Genre(s)
            genre = html.xpath("//span[@property='v:genre']/text()")
            info.append(" ".join(genre))
            # Release date(s)
            initialReleaseDate = html.xpath(
                "//span[@property='v:initialReleaseDate']/text()")
            info.append(" ".join(initialReleaseDate))
            # Synopsis: prefer the hidden "full" span; fall back to the
            # visible summary when the page doesn't collapse it.
            reated_info = html.xpath("//span[@class='all hidden']/text()")
            if not reated_info:
                reated_info = html.xpath(
                    "//span[@property='v:summary']/text()")
            # NOTE: no manual quote-escaping here any more — db_insert now
            # uses a parameterized query, so raw text is stored verbatim.
            info.append("".join(s.strip() for s in reated_info).strip("\\"))
            # Rank, e.g. "No.1" -> "1"
            no = html.xpath("//span[@class='top250-no']/text()")
            if len(no) == 1:
                info.append(no[0].split(".")[-1])
            else:
                info.append("獲取失敗")
            # Runtime, e.g. "142分鐘" -> "142"
            runtime = html.xpath("//span[@property='v:runtime']/text()")
            if len(runtime) == 1:
                info.append(runtime[0].split("分鐘")[0])
            else:
                info.append("獲取失敗")
            self.db_insert(info)

    def db_insert(self, info_l):
        """Insert one movie row; rows with a duplicate rank (the primary
        key) are silently ignored. Exits the program on DB errors.

        :param info_l: 8-element list matching the table's column order
        """
        # SECURITY FIX: the original interpolated scraped text straight
        # into the SQL string, so any quote in a title/synopsis broke the
        # statement (SQL injection). A parameterized query lets the driver
        # do all quoting/escaping.
        sql_insert = (
            "insert ignore into `douban_top250`.`movies_info` "
            "(`電影名稱`,`導演`,`主演資訊`,`類型`,`上映日期`,`劇情簡介`,`排名`,`片長`) "
            "values (%s,%s,%s,%s,%s,%s,%s,%s);"
        )
        try:
            # Serialize access: the shared connection/cursor is not
            # thread-safe and this runs on many worker threads.
            with self.db_lock:
                self.cursor.execute(sql_insert, info_l)
                self.mysql_conn.commit()
        except Exception as e:
            self.mysql_conn.rollback()
            print("導入數據到數據庫時報錯:{}".format(e))
            exit(1)
        else:
            print("{}獲取資訊成功...\n".format(info_l[0]))

    def transferContent(self, content):
        # Legacy helper: backslash-escapes ', " and \ in a string.
        # No longer needed now that db_insert uses a parameterized query,
        # but kept (behavior unchanged) for backward compatibility.
        if content is None:
            return None
        else:
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string


if __name__ == "__main__":
    Top250()

演示 gif 如下:

image

參考文章:

載入中......
此文章數據所有權由區塊鏈加密技術和智能合約保障僅歸創作者所有。