
Python - Scraping Douban TOP 250 Movie Information

Background#

Recently, during my internship, I was asked to pick a website, scrape it, and store the data in an Excel spreadsheet.

I also pick movies to watch from Douban's TOP 250 list, and flipping through the pages by hand is tedious, so I decided to scrape it.

[2019-09-02 Update] Later I needed this for an assignment defense, so I changed the script to store the data in a MySQL database instead.

Code Implementation#

Unlike most scraping write-ups online, what I want is the plot summary of each movie.

So I first have to collect the link to each movie's detail page, and then scrape every movie individually.

The complete code is as follows:

# -*- coding: utf-8 -*-
'''
@author: soapffz
@function: Scraping Douban TOP250 movie information and storing it in a MySQL database (multithreaded)
@time: 2019-09-01
'''

import requests
from fake_useragent import UserAgent
from lxml import etree
from tqdm import tqdm
import threading
import pymysql
from re import split

"""
If the library is not found, you can copy the following statements to solve it
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple fake_useragent
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tqdm
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple threading
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pymysql
"""


class Top250(object):
    def __init__(self):
        ua = UserAgent()  # Used to generate User-Agent
        self.headers = {"User-Agent": ua.random}  # Get a random User-Agent
        self.bangdang_l = []  # Store the pages of the list
        self.subject_url_l = []  # Store the link for each movie
        self.connect_mysql()

    def connect_mysql(self):
        # Connect to MySQL; a database name could also be passed after the password
        try:
            self.mysql_conn = pymysql.connect(
                host='localhost', user='root', password='root', charset='utf8')
            # Get a cursor object that can execute SQL statements, the results returned by default are displayed as tuples
            self.cursor = self.mysql_conn.cursor()
            print("Database connection successful")
        except Exception as e:
            print("Database connection error:{}\nWhere do you want me to store the scraped data? Exiting the program!".format(e))
            exit(0)
        else:
            self.create_db()

    def create_db(self):
        # Create database and table
        # Check if the database exists, if it does, delete it and then create it
        sql_drop_db = "DROP DATABASE IF EXISTS `douban_top250`"
        sql_create_db = "CREATE DATABASE `douban_top250` default charset utf8 COLLATE utf8_general_ci;"
        sql_create_table = """
            CREATE TABLE `movies_info` (
                `Movie Name` varchar(255) NOT NULL,
                `Director` varchar(511) NOT NULL,
                `Starring Information` varchar(511) NOT NULL,
                `Genre` varchar(255) NOT NULL,
                `Release Date` varchar(255) NOT NULL,
                `Plot Summary` varchar(511) NOT NULL,
                `Ranking` varchar(255) NOT NULL,
                `Duration` varchar(255) NOT NULL,
                PRIMARY KEY (`Ranking`)
            )DEFAULT CHARSET=utf8;
            """
        try:
            self.cursor.execute(sql_drop_db)
            self.cursor.execute(sql_create_db)
            self.cursor.execute('use douban_top250;')  # Set the current database to the newly created database
            self.cursor.execute(sql_create_table)
        except Exception as e:  # Catch and print any exception (Python 2 used the syntax "except Exception, e")
            print("Database creation error:{}\nExiting the program!".format(e))
            self.mysql_conn.rollback()  # Rollback on error
            self.mysql_conn.close()  # Close the database connection
            exit(0)
        else:
            print("Database and table created successfully, starting to scrape each movie's link..")
            self.get_subject_url()

    def get_subject_url(self):
        # Iterate through the 250 list to get individual links for each movie
        self.bangdang_l = [
            "https://movie.douban.com/top250?start={}&filter=".format(i) for i in range(0, 250, 25)]
        self.multi_thread(self.bangdang_l, self.crawl_bangdang)
        if len(self.subject_url_l) == 0:
            print("IP has been blocked, exiting the program")
            exit(0)
        else:
            print("{} TOP movie links have been successfully obtained, starting to scrape individual movies, please wait...".format(
                len(self.subject_url_l)))
            self.multi_thread(self.subject_url_l, self.get_one_movie_info)

    def crawl_bangdang(self, url):
        # Scrape the movie links from each list page
        try:
            req = requests.get(url, headers=self.headers, timeout=10)
            req.encoding = "utf-8"
            html = etree.HTML(req.text)
            # Get the list of all a tags
            url_l = html.xpath('//div[@class="hd"]//a/@href')
            self.subject_url_l.extend(url_l)
        except Exception as e:
            print("Error while scraping list information: {}".format(e))
        else:
            print("Successfully obtained list data from page {}...\n".format(
                int(int(split(r"=|&", url)[1])/25 + 1)))

    def multi_thread(self, url_l, target_func):
        # Multithreading scraping function, passing in the list of URLs to scrape and the function to use for scraping
        threads = [threading.Thread(target=target_func, args=(url,))
                   for url in url_l]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print("Scraping completed")

    def get_one_movie_info(self, subject):
        # Function to scrape a single URL
        try:
            req = requests.get(subject, headers=self.headers, timeout=10)
            html = etree.HTML(req.content)
        except Exception as e:
            print("Scraping error:".format(e))
        else:
            # Used to store information about a single movie
            info = []
            # Movie name information
            movie_name = html.xpath(
                "//span[@property='v:itemreviewed']/text()")
            info.append(" ".join(movie_name))
            # Director information
            director = html.xpath("//a[@rel='v:directedBy']//text()")
            info.append(" ".join(director))
            # Starring information
            actor = html.xpath("//a[@rel='v:starring']//text()")
            info.append(" ".join(actor))
            # Genre
            genre = html.xpath("//span[@property='v:genre']/text()")
            info.append(" ".join(genre))
            # Release date
            initialReleaseDate = html.xpath(
                "//span[@property='v:initialReleaseDate']/text()")
            info.append(" ".join(initialReleaseDate))
            # Plot summary
            related_info = html.xpath("//span[@class='all hidden']/text()")
            # Some plot summaries are collapsed: try the hidden tag first,
            # and fall back to the visible one if nothing is collapsed
            if len(related_info) == 0:
                related_info = html.xpath(
                    "//span[@property='v:summary']/text()")
            related_info = "".join([s.strip()
                                    for s in related_info]).strip("\\")
            related_info = self.transferContent(related_info)
            info.append(related_info)
            # Ranking
            no = html.xpath("//span[@class='top250-no']/text()")
            if len(no) == 1:
                info.append(no[0].split(".")[-1])
            else:
                info.append("Failed to retrieve")
            # Duration: the page text reads like "142分钟", keep only the number
            runtime = html.xpath("//span[@property='v:runtime']/text()")
            if len(runtime) == 1:
                info.append(runtime[0].split("分钟")[0])
            else:
                info.append("Failed to retrieve")
            self.db_insert(info)

    def db_insert(self, info_l):
        sql_insert_detection = """
            insert ignore into `douban_top250`.`movies_info` (`Movie Name`,`Director`,`Starring Information`,`Genre`,`Release Date`,`Plot Summary`,`Ranking`,`Duration`)
            values ("{l[0]}","{l[1]}","{l[2]}","{l[3]}","{l[4]}","{l[5]}","{l[6]}","{l[7]}");
        """.format(l=info_l)
        try:
            self.cursor.execute(sql_insert_detection)
            self.mysql_conn.commit()
        except Exception as e:
            self.mysql_conn.rollback()
            print("Error while importing data to the database:{}".format(e))
            exit(0)
        else:
            print("{} information retrieved successfully...\n".format(info_l[0]))

    def transferContent(self, content):
        # Escape quotes and backslashes so the value can sit safely inside the SQL string
        if content is None:
            return None
        else:
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string


if __name__ == "__main__":
    Top250()

The demonstration gif is as follows:

(demonstration GIF)
