Updated: MariaDB as Back Completed

This commit is contained in:
Carlos Sousa 2021-07-29 17:42:13 +02:00
parent 96e29fd448
commit 6419ba97a7
2 changed files with 327 additions and 11 deletions

237
src/back/imdbscrapper.sql Normal file
View File

@ -0,0 +1,237 @@
-- phpMyAdmin SQL Dump
-- version 5.0.1
-- https://www.phpmyadmin.net/
--
-- Host: imdbdb
-- Generation Time: Jul 29, 2021 at 02:26 PM
-- Server version: 10.6.3-MariaDB-1:10.6.3+maria~focal
-- PHP Version: 7.4.1
-- Session setup for loading this dump.
-- NO_AUTO_VALUE_ON_ZERO: an explicit 0 written to an AUTO_INCREMENT column is
-- stored as 0 instead of triggering generation of the next value.
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
-- Disable autocommit and open one transaction that is committed at the end of the file.
-- NOTE(review): DDL statements (CREATE TABLE, ALTER TABLE, CREATE PROCEDURE) cause
-- implicit commits in MariaDB, so this wrapper does not make the schema load atomic.
SET AUTOCOMMIT = 0;
START TRANSACTION;
-- Interpret and emit time values in UTC for the duration of this session.
SET time_zone = "+00:00";
-- Version-conditional statements (executed only by servers at version 4.1.1 or newer):
-- save the current client character set, result character set and connection
-- collation in user variables, then switch the connection to utf8mb4.
-- The saved values are restored at the end of the file.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
--
-- Database: `imdbscrapper`
--
-- Created only when missing, with utf8mb4 / utf8mb4_general_ci as the defaults
-- inherited by tables that do not override them. Every following statement runs
-- inside this database.
CREATE DATABASE IF NOT EXISTS `imdbscrapper` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
USE `imdbscrapper`;
-- Switch the client statement delimiter so that the semicolons inside the procedure
-- bodies below do not terminate the surrounding CREATE PROCEDURE statements.
DELIMITER $$
--
-- Procedures
--
-- insertMovie: inserts one row into `movies` from the seven values supplied.
--   idMovie      value stored in `movies`.`idMovie` (the primary key column)
--   name         value stored in `movies`.`name`
--   description  value stored in `movies`.`description`
--   imdbURL      value stored in `movies`.`imdbURL`
--   rating       value stored in `movies`.`rating`
--   ratingCount  value stored in `movies`.`ratingCount`
--   releaseDate  value stored in `movies`.`releaseDate`
-- Returns no result set.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
-- NOTE: every parameter shares its name with a column of `movies`. Inside a
-- routine an ambiguous identifier resolves to the parameter, which is what the
-- VALUES list relies on.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `insertMovie` (
    IN `idMovie` BIGINT(20),
    IN `name` VARCHAR(255),
    IN `description` LONGTEXT,
    IN `imdbURL` VARCHAR(255),
    IN `rating` DOUBLE,
    IN `ratingCount` BIGINT(20),
    IN `releaseDate` DATE
)
BEGIN
    INSERT INTO `movies`
        (`idMovie`, `name`, `description`, `imdbURL`, `rating`, `ratingCount`, `releaseDate`)
    VALUES
        (idMovie, name, description, imdbURL, rating, ratingCount, releaseDate);
END$$
-- insertMovieGenre: inserts one (movie, genre) row into `moviesGenre`.
--   idMovie  value stored in `moviesGenre`.`idMovie`
--   idGenre  value stored in `moviesGenre`.`idGenre` (a VARCHAR(255), not a numeric key)
-- Returns no result set. `moviesGenre`.`id` is not supplied here.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
-- NOTE: both parameters share their names with columns of `moviesGenre`. Inside
-- a routine an ambiguous identifier resolves to the parameter.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `insertMovieGenre` (
    IN `idMovie` BIGINT(20),
    IN `idGenre` VARCHAR(255)
)
BEGIN
    INSERT INTO `moviesGenre`
        (`idMovie`, `idGenre`)
    VALUES
        (idMovie, idGenre);
END$$
-- insertRecheck: inserts one row into `recheck`.
--   inNumber  value stored in `recheck`.`number`
-- Returns no result set. `recheck`.`id` is not supplied here.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `insertRecheck` (
    IN `inNumber` BIGINT(20)
)
BEGIN
    INSERT INTO `recheck`
        (`number`)
    VALUES
        (inNumber);
END$$
-- insertSerie: inserts one row into `series` from the seven values supplied.
--   idSerie      value stored in `series`.`idSerie` (the primary key column)
--   name         value stored in `series`.`name`
--   description  value stored in `series`.`description`
--   imdbURL      value stored in `series`.`imdbURL`
--   rating       value stored in `series`.`rating`
--   ratingCount  value stored in `series`.`ratingCount`
--   releaseDate  value stored in `series`.`releaseDate`
-- Returns no result set.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
-- NOTE: every parameter shares its name with a column of `series`. Inside a
-- routine an ambiguous identifier resolves to the parameter, which is what the
-- VALUES list relies on.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `insertSerie` (
    IN `idSerie` BIGINT(20),
    IN `name` VARCHAR(255),
    IN `description` LONGTEXT,
    IN `imdbURL` VARCHAR(255),
    IN `rating` DOUBLE,
    IN `ratingCount` BIGINT(20),
    IN `releaseDate` DATE
)
BEGIN
    INSERT INTO `series`
        (`idSerie`, `name`, `description`, `imdbURL`, `rating`, `ratingCount`, `releaseDate`)
    VALUES
        (idSerie, name, description, imdbURL, rating, ratingCount, releaseDate);
END$$
-- insertSerieGenre: inserts one (serie, genre) row into `seriesGenre`.
--   idSerie  value stored in `seriesGenre`.`idSerie`
--   idGenre  value stored in `seriesGenre`.`idGenre` (a VARCHAR(255), not a numeric key)
-- Returns no result set. `seriesGenre`.`id` is not supplied here.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
-- NOTE: both parameters share their names with columns of `seriesGenre`. Inside
-- a routine an ambiguous identifier resolves to the parameter.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `insertSerieGenre` (
    IN `idSerie` BIGINT(20),
    IN `idGenre` VARCHAR(255)
)
BEGIN
    INSERT INTO `seriesGenre`
        (`idSerie`, `idGenre`)
    VALUES
        (idSerie, idGenre);
END$$
-- retrieveMovieByName: returns every row of `movies` whose `name` equals movieName.
--   movieName  the name to match with an equality comparison
-- Result set columns: idMovie, name, description, imdbURL, rating, ratingCount,
-- releaseDate. Zero rows are returned when nothing matches.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `retrieveMovieByName` (
    IN `movieName` VARCHAR(255)
)
BEGIN
    SELECT
        movies.idMovie,
        movies.name,
        movies.description,
        movies.imdbURL,
        movies.rating,
        movies.ratingCount,
        movies.releaseDate
    FROM movies
    WHERE movies.name = movieName;
END$$
-- retrieveSerieByName: returns every row of `series` whose `name` equals serieName.
--   serieName  the name to match with an equality comparison
-- Result set columns: idSerie, name, description, imdbURL, rating, ratingCount,
-- releaseDate. Zero rows are returned when nothing matches.
-- CREATE OR REPLACE lets this file be loaded again without failing on an
-- already existing routine.
CREATE OR REPLACE DEFINER=`root`@`%` PROCEDURE `retrieveSerieByName` (
    IN `serieName` VARCHAR(255)
)
BEGIN
    SELECT
        series.idSerie,
        series.name,
        series.description,
        series.imdbURL,
        series.rating,
        series.ratingCount,
        series.releaseDate
    FROM series
    WHERE series.name = serieName;
END$$
-- Restore the default statement delimiter for the rest of the file.
DELIMITER ;
-- --------------------------------------------------------
--
-- Table structure for table `genres`
--
-- Genre table: a numeric id, a required name and an optional free-text description.
-- The primary key and AUTO_INCREMENT on `id` are added by ALTER TABLE statements
-- further down in this file. None of the procedures defined above reference this table.
CREATE TABLE `genres` (
    `id`          BIGINT(20)   NOT NULL,
    `name`        VARCHAR(255) NOT NULL,
    `description` LONGTEXT     DEFAULT NULL
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `movies`
--
-- One row per movie. Rows are written by `insertMovie` and read by `retrieveMovieByName`.
-- `name` and `imdbURL` are required. The remaining descriptive columns are nullable.
-- The primary key and AUTO_INCREMENT on `idMovie` are added by ALTER TABLE
-- statements further down in this file.
CREATE TABLE `movies` (
    `idMovie`     BIGINT(20)   NOT NULL,
    `name`        VARCHAR(255) NOT NULL,
    `description` LONGTEXT     DEFAULT NULL,
    `imdbURL`     VARCHAR(255) NOT NULL,
    `rating`      DOUBLE       DEFAULT NULL,
    `ratingCount` BIGINT(20)   DEFAULT NULL,
    `releaseDate` DATE         DEFAULT NULL
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `moviesGenre`
--
-- Associates a movie id with a genre value. Rows are written by `insertMovieGenre`.
-- `idGenre` is a VARCHAR(255), so it does not have the same type as `genres`.`id`.
-- No foreign key is declared on either column in this file.
-- The primary key and AUTO_INCREMENT on `id` are added by ALTER TABLE statements
-- further down in this file.
CREATE TABLE `moviesGenre` (
    `id`      BIGINT(20)   NOT NULL,
    `idMovie` BIGINT(20)   NOT NULL,
    `idGenre` VARCHAR(255) NOT NULL
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `recheck`
--
-- Numbers queued by `insertRecheck`, one row per call.
-- The primary key and AUTO_INCREMENT on `id` are added by ALTER TABLE statements
-- further down in this file.
CREATE TABLE `recheck` (
    `id`     BIGINT(20) NOT NULL,
    `number` BIGINT(20) NOT NULL
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `series`
--
-- One row per serie. Rows are written by `insertSerie` and read by `retrieveSerieByName`.
-- `name` and `imdbURL` are required. The remaining descriptive columns are nullable.
-- The primary key and AUTO_INCREMENT on `idSerie` are added by ALTER TABLE
-- statements further down in this file.
CREATE TABLE `series` (
    `idSerie`     BIGINT(20)   NOT NULL,
    `name`        VARCHAR(255) NOT NULL,
    `description` LONGTEXT     DEFAULT NULL,
    `imdbURL`     VARCHAR(255) NOT NULL,
    `rating`      DOUBLE       DEFAULT NULL,
    `ratingCount` BIGINT(20)   DEFAULT NULL,
    `releaseDate` DATE         DEFAULT NULL
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `seriesGenre`
--
-- Associates a serie id with a genre value. Rows are written by `insertSerieGenre`.
-- `idGenre` is a VARCHAR(255), so it does not have the same type as `genres`.`id`.
-- No foreign key is declared on either column in this file.
-- The primary key and AUTO_INCREMENT on `id` are added by ALTER TABLE statements
-- further down in this file.
CREATE TABLE `seriesGenre` (
    `id`      BIGINT(20)   NOT NULL,
    `idSerie` BIGINT(20)   NOT NULL,
    `idGenre` VARCHAR(255) NOT NULL
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
--
-- Indexes for dumped tables
--
-- Every table gets its primary key here. `movies` and `series` additionally get
-- a secondary index on `name`, because `retrieveMovieByName` and
-- `retrieveSerieByName` filter on that column with an equality comparison and
-- would otherwise scan the whole table on every call.
--

--
-- Indexes for table `genres`
--
ALTER TABLE `genres`
    ADD PRIMARY KEY (`id`);

--
-- Indexes for table `movies`
--
ALTER TABLE `movies`
    ADD PRIMARY KEY (`idMovie`),
    ADD KEY `movies_name_idx` (`name`);

--
-- Indexes for table `moviesGenre`
--
ALTER TABLE `moviesGenre`
    ADD PRIMARY KEY (`id`);

--
-- Indexes for table `recheck`
--
ALTER TABLE `recheck`
    ADD PRIMARY KEY (`id`);

--
-- Indexes for table `series`
--
ALTER TABLE `series`
    ADD PRIMARY KEY (`idSerie`),
    ADD KEY `series_name_idx` (`name`);

--
-- Indexes for table `seriesGenre`
--
ALTER TABLE `seriesGenre`
    ADD PRIMARY KEY (`id`);
--
-- AUTO_INCREMENT for dumped tables
--
-- Each statement re-declares the key column of one table with its existing type
-- and NOT NULL, and adds AUTO_INCREMENT to it.
--

-- Table `genres`, column `id`
ALTER TABLE `genres` MODIFY `id` BIGINT(20) NOT NULL AUTO_INCREMENT;

-- Table `movies`, column `idMovie`
ALTER TABLE `movies` MODIFY `idMovie` BIGINT(20) NOT NULL AUTO_INCREMENT;

-- Table `moviesGenre`, column `id`
ALTER TABLE `moviesGenre` MODIFY `id` BIGINT(20) NOT NULL AUTO_INCREMENT;

-- Table `recheck`, column `id`
ALTER TABLE `recheck` MODIFY `id` BIGINT(20) NOT NULL AUTO_INCREMENT;

-- Table `series`, column `idSerie`
ALTER TABLE `series` MODIFY `idSerie` BIGINT(20) NOT NULL AUTO_INCREMENT;

-- Table `seriesGenre`, column `id`
ALTER TABLE `seriesGenre` MODIFY `id` BIGINT(20) NOT NULL AUTO_INCREMENT;
-- Ends the transaction opened by START TRANSACTION at the top of this dump.
COMMIT;
-- Version-conditional statements: put back the client character set, result
-- character set and connection collation that were saved in the @OLD_* user
-- variables at the top of this dump.
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

View File

@ -5,7 +5,7 @@ import json
import requests
import csv
import logging
import mysql.connector
import mysql.connector as mariadb
from multiprocessing import Process
from bs4 import BeautifulSoup
@ -39,6 +39,68 @@ def saveToFile(dataTable, dataPath):
except Exception as e:
print("Retrying updating - %s - %s" % (dataPath, e))
def checkForDuplicate(idCheck):
    """Report whether idCheck is still unknown to the database.

    Calls the stored procedure 'checkDuplicateMovie' with idCheck and inspects
    the rows of its result set(s).

    Returns:
        False when the procedure returned at least one row (a duplicate exists),
        True otherwise, including when the procedure produced no result set.
        Callers that test `is False` keep working unchanged.

    Raises:
        Whatever the database driver raises. The connection is closed first.
    """
    mydb = createDBConnection()
    try:
        cursor = mydb.cursor(buffered=True)
        # [TODO] Should be changed to one procedure call only
        # NOTE(review): 'checkDuplicateMovie' is not among the procedures created by
        # src/back/imdbscrapper.sql in this commit - confirm it exists on the server.
        cursor.callproc('checkDuplicateMovie', [idCheck,])
        # Start from "no rows" so a procedure without any result set does not
        # leave the variable undefined.
        result = []
        for results in cursor.stored_results():
            result = results.fetchall()
    except Exception:
        # Do not leak the connection when the lookup fails.
        mydb.close()
        raise
    commitDBConnection(mydb)
    return len(result) == 0
# Creates and returns a mariadb connection object
def createDBConnection():
    """Open a new database connection and return it.

    Connection settings are read from the environment so they can be supplied by
    the container configuration. Each one falls back to the value that used to be
    hard-coded here:

        DB_HOST      (default 'imdbdb')
        DB_USER      (default 'root')
        DB_PASSWORD  (default 'secret')
        DB_NAME      (default 'imdbscrapper')

    Returns:
        The connection object produced by mariadb.connect(). The caller is
        responsible for committing and closing it.
    """
    import os  # function-scope import keeps this block self-contained

    mydb = mariadb.connect(
        host = os.getenv('DB_HOST', 'imdbdb'),
        user = os.getenv('DB_USER', 'root'),
        password = os.getenv('DB_PASSWORD', 'secret'),
        database = os.getenv('DB_NAME', 'imdbscrapper')
    )
    return mydb
def commitDBConnection(database):
    """Commit the pending work on `database`, then close it.

    Args:
        database: an object exposing commit() and close(), such as the
            connection returned by createDBConnection().

    Raises:
        Whatever commit() raises. close() is still called in that case so the
        connection is never left open.
    """
    try:
        database.commit()
    finally:
        database.close()
def _genreValues(rawGenres):
    # Turns the genre field of a row into the list of values to insert:
    # a list/tuple yields its elements, anything else yields one str() value.
    if isinstance(rawGenres, (list, tuple)):
        return list(rawGenres)
    return [str(rawGenres)]


def saveToDatabase(dataTable, inTable):
    """Insert every row of dataTable through the stored procedures for inTable.

    Args:
        dataTable: iterable of rows, each indexable as
            row[0] idMovie
            row[1] name
            row[2] description
            row[3] url
            row[4] genres
            row[5] rating
            row[6] ratingCount
            row[7] releaseDate
        inTable: 'movies' (insertMovie / insertMovieGenre) or
            'series' (insertSerie / insertSerieGenre).

    For each row the main procedure is called once, then the genre procedure is
    called once per genre value. A list or tuple in row[4] contributes each of
    its elements (a single-element list included). Any other value, such as a
    plain string or a numeric placeholder, contributes exactly one value, its
    str() form, instead of being iterated character by character.

    Raises:
        ValueError: inTable is neither 'movies' nor 'series'.
        Whatever the database driver raises. The connection is then closed
        without committing.
    """
    # Defines which procedures to call
    procedures = {
        'movies': ('insertMovie', 'insertMovieGenre'),
        'series': ('insertSerie', 'insertSerieGenre'),
    }
    if inTable not in procedures:
        raise ValueError("Unknown table %r, expected 'movies' or 'series'" % (inTable,))
    mainTable, genreTable = procedures[inTable]

    # [TODO] Change to dynamic values from docker-compose.yml
    mydb = createDBConnection()
    try:
        cursor = mydb.cursor(buffered=True)
        for row in dataTable:
            cursor.callproc(mainTable, [row[0],row[1],row[2],row[3],row[5],row[6],row[7],])
            for genre in _genreValues(row[4]):
                cursor.callproc(genreTable, [row[0],genre,])
    except Exception:
        # Do not leak the connection when an insert fails.
        mydb.close()
        raise
    commitDBConnection(mydb)
'''
Main Function for the scrapper
It will prepare the URL, request it, get the answer, parse the information to a list,
@ -47,11 +109,12 @@ def saveToFile(dataTable, dataPath):
Requires a starting and an ending URL (in Int), going in decreasing order (eg: 1000, 999, 998, etc)
'''
def imdbscrapper(startURL, endURL):
# Configuration values for the scrapper
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
debugLevel = 40 # 20 will display Info messages, 40 errors
logFile = "/opt/storage/info.log" # Log output
#counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
#logFile = "/opt/storage/info.log" # Log output
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
moviesFile = "/opt/storage/movies.csv" # Where to store movie info
seriesFile = "/opt/storage/series.csv" # Where to store shows/series info
@ -69,7 +132,17 @@ def imdbscrapper(startURL, endURL):
dataRow = [] # Initializes the dataRow list
errorRow = [] # Initializes the errorRow list
reCheckRow = [] # Initializes the reCheckRow list
# Assume Non Duplicate
duplicateTest = True
# Test for Duplicate
duplicateTest = checkForDuplicate(titleFixed)
# If a duplicate is found, skip number
if duplicateTest is False:
continue
try:
dataRow.append(titleFixed)
# Requests, parses and loads into JSON the HTML response
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
@ -89,19 +162,19 @@ def imdbscrapper(startURL, endURL):
try:
dataRow.append(data['genre'])
except Exception as e:
dataRow.append("Genre Unknown")
dataRow.append(0)
try:
dataRow.append(data['aggregateRating']['ratingValue'])
except Exception as e:
dataRow.append("No Rating")
dataRow.append(0)
try:
dataRow.append(data['aggregateRating']['ratingCount'])
except Exception as e:
dataRow.append("Total Rating Count N/A")
dataRow.append(0)
try:
dataRow.append(data['datePublished'])
except Exception as e:
dataRow.append("Unknown")
dataRow.append('1000-01-01')
# Checks if its a movie or a serie/show, and append the list to the list of lists
if(data['@type'] == 'Movie'):
@ -119,13 +192,18 @@ def imdbscrapper(startURL, endURL):
reCheckRow.append(recheckString)
reCheckTable.append(reCheckRow)
# Writes the list of lists to each correct file
saveToFile(movieTable, moviesFile)
saveToFile(serieTable, seriesFile)
saveToFile(errorTable, logFile)
saveToFile(reCheckTable, reCheckFile)
#saveToFile(movieTable, moviesFile)
#saveToFile(serieTable, seriesFile)
#saveToFile(errorTable, logFile)
#saveToFile(reCheckTable, reCheckFile)
saveToDatabase(movieTable, 'movies')
saveToDatabase(serieTable, 'series')
def main():
cls()
#imdbscrapper(903747,903743)
nrProcesses = int(os.getenv('PROCESSES', 5)) # Number of Processes to start in parallel
startURL = int(os.getenv('START_URL', 10000000)) # Starting Number
endURL = int(os.getenv('END_URL', 0)) # Ending Number
@ -157,5 +235,6 @@ def main():
process.join()
if __name__ == "__main__":
main()