mirror of
https://github.com/zebrajr/imdbscrapper.git
synced 2025-12-06 00:20:21 +01:00
Updated: MariaDB as Back Completed
This commit is contained in:
parent
96e29fd448
commit
6419ba97a7
237
src/back/imdbscrapper.sql
Normal file
237
src/back/imdbscrapper.sql
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
-- phpMyAdmin SQL Dump
|
||||
-- version 5.0.1
|
||||
-- https://www.phpmyadmin.net/
|
||||
--
|
||||
-- Host: imdbdb
|
||||
-- Generation Time: Jul 29, 2021 at 02:26 PM
|
||||
-- Server version: 10.6.3-MariaDB-1:10.6.3+maria~focal
|
||||
-- PHP Version: 7.4.1
|
||||
|
||||
-- Session settings applied for the duration of the import.
-- NO_AUTO_VALUE_ON_ZERO: an explicit 0 written to an AUTO_INCREMENT column is
-- stored as 0 instead of triggering generation of the next sequence value.
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
-- Run the import as one explicit transaction (closed by the COMMIT near the
-- end of this dump).
SET AUTOCOMMIT = 0;
START TRANSACTION;
SET time_zone = "+00:00";

-- Remember the client character-set settings so they can be restored at the
-- end of the dump, then switch the connection to utf8mb4.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;

--
-- Database: `imdbscrapper`
--
CREATE DATABASE IF NOT EXISTS `imdbscrapper` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
USE `imdbscrapper`;
|
||||
|
||||
DELIMITER $$
|
||||
--
|
||||
-- Procedures
|
||||
--
|
||||
-- insertMovie: adds one row to `movies` from the supplied values.
-- Each parameter is written to the column of the same name; no validation or
-- duplicate handling happens here.
CREATE DEFINER=`root`@`%` PROCEDURE `insertMovie` (
    IN `idMovie` BIGINT(20),
    IN `name` VARCHAR(255),
    IN `description` LONGTEXT,
    IN `imdbURL` VARCHAR(255),
    IN `rating` DOUBLE,
    IN `ratingCount` BIGINT(20),
    IN `releaseDate` DATE
)
BEGIN
    INSERT INTO movies
        (`idmovie`, `name`, `description`, `imdbURL`, `rating`, `ratingCount`, `releaseDate`)
    VALUES
        (idMovie, name, description, imdbURL, rating, ratingCount, releaseDate);
END$$
|
||||
|
||||
-- insertMovieGenre: links a movie id to one genre value by adding a row to
-- `moviesGenre`. The row's own `id` is left to the table's AUTO_INCREMENT.
CREATE DEFINER=`root`@`%` PROCEDURE `insertMovieGenre` (
    IN `idMovie` BIGINT(20),
    IN `idGenre` VARCHAR(255)
)
BEGIN
    INSERT INTO moviesGenre
        (`idMovie`, `idGenre`)
    VALUES
        (idMovie, idGenre);
END$$
|
||||
|
||||
-- insertRecheck: records a number in `recheck`.
-- NOTE(review): presumably the title number to scan again later - confirm
-- against the caller.
CREATE DEFINER=`root`@`%` PROCEDURE `insertRecheck` (
    IN `inNumber` BIGINT(20)
)
BEGIN
    INSERT INTO recheck
        (`number`)
    VALUES
        (inNumber);
END$$
|
||||
|
||||
-- insertSerie: adds one row to `series` from the supplied values.
-- Each parameter is written to the column of the same name; no validation or
-- duplicate handling happens here.
CREATE DEFINER=`root`@`%` PROCEDURE `insertSerie` (
    IN `idSerie` BIGINT(20),
    IN `name` VARCHAR(255),
    IN `description` LONGTEXT,
    IN `imdbURL` VARCHAR(255),
    IN `rating` DOUBLE,
    IN `ratingCount` BIGINT(20),
    IN `releaseDate` DATE
)
BEGIN
    INSERT INTO series
        (`idserie`, `name`, `description`, `imdbURL`, `rating`, `ratingCount`, `releaseDate`)
    VALUES
        (idSerie, name, description, imdbURL, rating, ratingCount, releaseDate);
END$$
|
||||
|
||||
-- insertSerieGenre: links a series id to one genre value by adding a row to
-- `seriesGenre`. The row's own `id` is left to the table's AUTO_INCREMENT.
CREATE DEFINER=`root`@`%` PROCEDURE `insertSerieGenre` (
    IN `idSerie` BIGINT(20),
    IN `idGenre` VARCHAR(255)
)
BEGIN
    INSERT INTO seriesGenre
        (`idSerie`, `idGenre`)
    VALUES
        (idSerie, idGenre);
END$$
|
||||
|
||||
-- retrieveMovieByName: returns every column of the `movies` rows whose `name`
-- equals `movieName`.
CREATE DEFINER=`root`@`%` PROCEDURE `retrieveMovieByName` (
    IN `movieName` VARCHAR(255)
)
BEGIN
    SELECT
        movies.idMovie,
        movies.name,
        movies.description,
        movies.imdbURL,
        movies.rating,
        movies.ratingCount,
        movies.releaseDate
    FROM movies
    WHERE movies.name = movieName;
END$$
|
||||
|
||||
-- retrieveSerieByName: returns every column of the `series` rows whose `name`
-- equals `serieName`.
CREATE DEFINER=`root`@`%` PROCEDURE `retrieveSerieByName` (
    IN `serieName` VARCHAR(255)
)
BEGIN
    SELECT
        series.idSerie,
        series.name,
        series.description,
        series.imdbURL,
        series.rating,
        series.ratingCount,
        series.releaseDate
    FROM series
    WHERE series.name = serieName;
END$$
|
||||
|
||||
DELIMITER ;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
--
|
||||
-- Table structure for table `genres`
|
||||
--
|
||||
|
||||
-- Genre catalogue. The primary key and AUTO_INCREMENT on `id` are attached by
-- ALTER TABLE statements further down in this dump.
CREATE TABLE `genres` (
    `id`          BIGINT(20)   NOT NULL,
    `name`        VARCHAR(255) NOT NULL,
    `description` LONGTEXT     DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
--
|
||||
-- Table structure for table `movies`
|
||||
--
|
||||
|
||||
-- One row per movie, populated through the `insertMovie` procedure.
-- The primary key and AUTO_INCREMENT on `idMovie` are attached by ALTER TABLE
-- statements further down in this dump.
CREATE TABLE `movies` (
    `idMovie`     BIGINT(20)   NOT NULL,
    `name`        VARCHAR(255) NOT NULL,
    `description` LONGTEXT     DEFAULT NULL,
    `imdbURL`     VARCHAR(255) NOT NULL,
    `rating`      DOUBLE       DEFAULT NULL,
    `ratingCount` BIGINT(20)   DEFAULT NULL,
    `releaseDate` DATE         DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
--
|
||||
-- Table structure for table `moviesGenre`
|
||||
--
|
||||
|
||||
-- Movie-to-genre link rows, populated through `insertMovieGenre`.
-- `idGenre` is a VARCHAR and receives whatever genre value the caller passes;
-- no foreign key to `genres` or `movies` is declared in this dump.
CREATE TABLE `moviesGenre` (
    `id`      BIGINT(20)   NOT NULL,
    `idMovie` BIGINT(20)   NOT NULL,
    `idGenre` VARCHAR(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
--
|
||||
-- Table structure for table `recheck`
|
||||
--
|
||||
|
||||
-- Numbers recorded through the `insertRecheck` procedure.
-- The primary key and AUTO_INCREMENT on `id` are attached by ALTER TABLE
-- statements further down in this dump.
CREATE TABLE `recheck` (
    `id`     BIGINT(20) NOT NULL,
    `number` BIGINT(20) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
--
|
||||
-- Table structure for table `series`
|
||||
--
|
||||
|
||||
-- One row per series/show, populated through the `insertSerie` procedure.
-- The primary key and AUTO_INCREMENT on `idSerie` are attached by ALTER TABLE
-- statements further down in this dump.
CREATE TABLE `series` (
    `idSerie`     BIGINT(20)   NOT NULL,
    `name`        VARCHAR(255) NOT NULL,
    `description` LONGTEXT     DEFAULT NULL,
    `imdbURL`     VARCHAR(255) NOT NULL,
    `rating`      DOUBLE       DEFAULT NULL,
    `ratingCount` BIGINT(20)   DEFAULT NULL,
    `releaseDate` DATE         DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
--
|
||||
-- Table structure for table `seriesGenre`
|
||||
--
|
||||
|
||||
-- Series-to-genre link rows, populated through `insertSerieGenre`.
-- `idGenre` is a VARCHAR and receives whatever genre value the caller passes;
-- no foreign key to `genres` or `series` is declared in this dump.
CREATE TABLE `seriesGenre` (
    `id`      BIGINT(20)   NOT NULL,
    `idSerie` BIGINT(20)   NOT NULL,
    `idGenre` VARCHAR(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
--
|
||||
-- Indexes for dumped tables
|
||||
--
|
||||
|
||||
--
|
||||
-- Indexes for table `genres`
|
||||
--
|
||||
ALTER TABLE `genres` ADD PRIMARY KEY (`id`);

--
-- Indexes for table `movies`
--
ALTER TABLE `movies` ADD PRIMARY KEY (`idMovie`);

--
-- Indexes for table `moviesGenre`
--
ALTER TABLE `moviesGenre` ADD PRIMARY KEY (`id`);

--
-- Indexes for table `recheck`
--
ALTER TABLE `recheck` ADD PRIMARY KEY (`id`);

--
-- Indexes for table `series`
--
ALTER TABLE `series` ADD PRIMARY KEY (`idSerie`);

--
-- Indexes for table `seriesGenre`
--
ALTER TABLE `seriesGenre` ADD PRIMARY KEY (`id`);
|
||||
|
||||
--
|
||||
-- AUTO_INCREMENT for dumped tables
|
||||
--
|
||||
|
||||
--
|
||||
-- AUTO_INCREMENT for table `genres`
|
||||
--
|
||||
ALTER TABLE `genres` MODIFY `id` bigint(20) NOT NULL AUTO_INCREMENT;

--
-- AUTO_INCREMENT for table `movies`
--
ALTER TABLE `movies` MODIFY `idMovie` bigint(20) NOT NULL AUTO_INCREMENT;

--
-- AUTO_INCREMENT for table `moviesGenre`
--
ALTER TABLE `moviesGenre` MODIFY `id` bigint(20) NOT NULL AUTO_INCREMENT;

--
-- AUTO_INCREMENT for table `recheck`
--
ALTER TABLE `recheck` MODIFY `id` bigint(20) NOT NULL AUTO_INCREMENT;

--
-- AUTO_INCREMENT for table `series`
--
ALTER TABLE `series` MODIFY `idSerie` bigint(20) NOT NULL AUTO_INCREMENT;

--
-- AUTO_INCREMENT for table `seriesGenre`
--
ALTER TABLE `seriesGenre` MODIFY `id` bigint(20) NOT NULL AUTO_INCREMENT;
|
||||
-- Close the transaction opened at the top of the dump.
COMMIT;

-- Put back the client character-set settings saved at the top of the dump.
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
||||
|
|
@ -5,7 +5,7 @@ import json
|
|||
import requests
|
||||
import csv
|
||||
import logging
|
||||
import mysql.connector
|
||||
import mysql.connector as mariadb
|
||||
from multiprocessing import Process
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
|
@ -39,6 +39,68 @@ def saveToFile(dataTable, dataPath):
|
|||
except Exception as e:
|
||||
print("Retrying updating - %s - %s" % (dataPath, e))
|
||||
|
||||
def checkForDuplicate(idCheck):
    """Report whether a title id still needs to be scraped.

    Calls the ``checkDuplicateMovie`` stored procedure with ``idCheck`` and
    inspects the rows of the last result set it returns.

    Args:
        idCheck: Title id forwarded as the procedure's only argument.

    Returns:
        False when the procedure returned at least one row (the id is already
        stored), True otherwise. Callers test the result with ``is False``.

    Raises:
        Whatever the database driver raises; the connection is closed first.
    """
    mydb = createDBConnection()
    # Start from "no rows" so a procedure that yields no result set counts as
    # "not a duplicate" instead of hitting an unbound local below.
    rows = []
    try:
        cursor = mydb.cursor(buffered=True)
        # [TODO] Should be changed to one procedure call only
        # NOTE(review): no `checkDuplicateMovie` procedure is visible in the
        # SQL dump that accompanies this code - confirm it exists in the DB.
        cursor.callproc('checkDuplicateMovie', [idCheck,])
        for resultSet in cursor.stored_results():
            rows = resultSet.fetchall()
    except Exception:
        # Do not leak the connection when the lookup fails.
        mydb.close()
        raise
    commitDBConnection(mydb)
    # Explicit boolean on both paths (previously the non-duplicate path fell
    # off the end and returned None).
    return len(rows) == 0
|
||||
|
||||
|
||||
# Creates and returns a mariadb connection object
|
||||
def createDBConnection():
    """Create and return a new MariaDB connection.

    Connection settings are read from the environment so they can be supplied
    by the deployment (for example docker-compose) instead of being edited in
    code. Each variable falls back to the value that used to be hard-coded,
    so behaviour is unchanged when none of them is set:

        DB_HOST      (default 'imdbdb')
        DB_USER      (default 'root')
        DB_PASSWORD  (default 'secret')
        DB_NAME      (default 'imdbscrapper')

    Returns:
        The connection object returned by ``mariadb.connect``; the caller is
        responsible for closing it.
    """
    # Local import keeps this function self-contained regardless of how the
    # module-level imports are organised.
    import os

    return mariadb.connect(
        host=os.getenv('DB_HOST', 'imdbdb'),
        user=os.getenv('DB_USER', 'root'),
        password=os.getenv('DB_PASSWORD', 'secret'),
        database=os.getenv('DB_NAME', 'imdbscrapper'),
    )
|
||||
|
||||
def commitDBConnection(database):
    """Commit pending work on a connection and then close it.

    Args:
        database: Connection object exposing ``commit()`` and ``close()``.

    Raises:
        Whatever ``commit()`` raises; the connection is still closed in that
        case so a failed commit does not leak it.
    """
    try:
        database.commit()
    finally:
        database.close()
|
||||
|
||||
|
||||
def saveToDatabase(dataTable, inTable):
    """Insert scraped rows and their genres through stored procedures.

    Each row is expected to be indexable with this layout:
        row[0] idMovie
        row[1] name
        row[2] description
        row[3] url
        row[4] genres
        row[5] rating
        row[6] ratingCount
        row[7] releaseDate

    Args:
        dataTable: Iterable of rows in the layout above.
        inTable: 'movies' or 'series'; selects which insert procedures run.

    Raises:
        ValueError: if ``inTable`` is neither 'movies' nor 'series'.
        Whatever the database driver raises; in that case the connection is
        closed without committing.
    """
    # Defines which procedures to call
    procedures = {
        'movies': ('insertMovie', 'insertMovieGenre'),
        'series': ('insertSerie', 'insertSerieGenre'),
    }
    if inTable not in procedures:
        # Previously this surfaced later as an unbound-local error.
        raise ValueError("Unknown table for saveToDatabase: %r" % (inTable,))
    mainTable, genreTable = procedures[inTable]

    # [TODO] Change to dynamic values from docker-compose.yml
    mydb = createDBConnection()
    try:
        cursor = mydb.cursor(buffered=True)
        for row in dataTable:
            cursor.callproc(mainTable, [row[0], row[1], row[2], row[3], row[5], row[6], row[7],])

            genres = row[4]
            if isinstance(genres, (list, tuple)):
                # One link row per genre, including single-element lists.
                for genre in genres:
                    cursor.callproc(genreTable, [row[0], genre,])
            else:
                # A lone value (a plain string, or the scraper's placeholder)
                # is stored once as text rather than iterated per character.
                cursor.callproc(genreTable, [row[0], str(genres),])
    except Exception:
        # Close without committing so a failed batch is not half-persisted
        # and the connection is not leaked.
        mydb.close()
        raise
    commitDBConnection(mydb)
|
||||
|
||||
|
||||
'''
|
||||
Main Function for the scrapper
|
||||
It will prepare the URL, request it, get the answer, parse the information to a list,
|
||||
|
|
@ -47,11 +109,12 @@ def saveToFile(dataTable, dataPath):
|
|||
Requires a starting and an ending URL (in Int), going in decreasing order (eg: 1000, 999, 998, etc)
|
||||
'''
|
||||
def imdbscrapper(startURL, endURL):
|
||||
|
||||
# Configuration values for the scrapper
|
||||
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
|
||||
debugLevel = 40 # 20 will display Info messages, 40 errors
|
||||
logFile = "/opt/storage/info.log" # Log output
|
||||
#counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||
#logFile = "/opt/storage/info.log" # Log output
|
||||
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
|
||||
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
|
||||
moviesFile = "/opt/storage/movies.csv" # Where to store movie info
|
||||
seriesFile = "/opt/storage/series.csv" # Where to store shows/series info
|
||||
|
|
@ -69,7 +132,17 @@ def imdbscrapper(startURL, endURL):
|
|||
dataRow = [] # Initializes the dataRow list
|
||||
errorRow = [] # Initializes the errorRow list
|
||||
reCheckRow = [] # Initializes the reCheckRow list
|
||||
|
||||
# Assume Non Duplicate
|
||||
duplicateTest = True
|
||||
# Test for Duplicate
|
||||
duplicateTest = checkForDuplicate(titleFixed)
|
||||
|
||||
# If a duplicate is found, skip number
|
||||
if duplicateTest is False:
|
||||
continue
|
||||
try:
|
||||
dataRow.append(titleFixed)
|
||||
# Requests, parses and loads into JSON the HTML response
|
||||
html = requests.get(url).text
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
|
@ -89,19 +162,19 @@ def imdbscrapper(startURL, endURL):
|
|||
try:
|
||||
dataRow.append(data['genre'])
|
||||
except Exception as e:
|
||||
dataRow.append("Genre Unknown")
|
||||
dataRow.append(0)
|
||||
try:
|
||||
dataRow.append(data['aggregateRating']['ratingValue'])
|
||||
except Exception as e:
|
||||
dataRow.append("No Rating")
|
||||
dataRow.append(0)
|
||||
try:
|
||||
dataRow.append(data['aggregateRating']['ratingCount'])
|
||||
except Exception as e:
|
||||
dataRow.append("Total Rating Count N/A")
|
||||
dataRow.append(0)
|
||||
try:
|
||||
dataRow.append(data['datePublished'])
|
||||
except Exception as e:
|
||||
dataRow.append("Unknown")
|
||||
dataRow.append('1000-01-01')
|
||||
|
||||
# Checks if its a movie or a serie/show, and append the list to the list of lists
|
||||
if(data['@type'] == 'Movie'):
|
||||
|
|
@ -119,13 +192,18 @@ def imdbscrapper(startURL, endURL):
|
|||
reCheckRow.append(recheckString)
|
||||
reCheckTable.append(reCheckRow)
|
||||
# Writes the list of lists to each correct file
|
||||
saveToFile(movieTable, moviesFile)
|
||||
saveToFile(serieTable, seriesFile)
|
||||
saveToFile(errorTable, logFile)
|
||||
saveToFile(reCheckTable, reCheckFile)
|
||||
#saveToFile(movieTable, moviesFile)
|
||||
#saveToFile(serieTable, seriesFile)
|
||||
#saveToFile(errorTable, logFile)
|
||||
#saveToFile(reCheckTable, reCheckFile)
|
||||
saveToDatabase(movieTable, 'movies')
|
||||
saveToDatabase(serieTable, 'series')
|
||||
|
||||
def main():
|
||||
cls()
|
||||
|
||||
#imdbscrapper(903747,903743)
|
||||
|
||||
nrProcesses = int(os.getenv('PROCESSES', 5)) # Number of Processes to start in parallel
|
||||
startURL = int(os.getenv('START_URL', 10000000)) # Starting Number
|
||||
endURL = int(os.getenv('END_URL', 0)) # Ending Number
|
||||
|
|
@ -157,5 +235,6 @@ def main():
|
|||
process.join()
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user