Updated: MariaDB as Back Completed

2025-12-06 00:20:21 +01:00 · 2021-07-29 17:42:13 +02:00 · 2021-07-29 17:42:13 +02:00 · 6419ba97a7
commit 6419ba97a7
parent 96e29fd448
2 changed files with 327 additions and 11 deletions
--- a/src/back/imdbscrapper.sql
+++ b/src/back/imdbscrapper.sql
@ -0,0 +1,237 @@
+-- phpMyAdmin SQL Dump
+-- version 5.0.1
+-- https://www.phpmyadmin.net/
+--
+-- Host: imdbdb
+-- Generation Time: Jul 29, 2021 at 02:26 PM
+-- Server version: 10.6.3-MariaDB-1:10.6.3+maria~focal
+-- PHP Version: 7.4.1
+
+-- With NO_AUTO_VALUE_ON_ZERO an explicit 0 is stored as 0 in an AUTO_INCREMENT
+-- column; only NULL asks the server to generate the next value.
+SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
+-- Disable autocommit and open an explicit transaction; the matching COMMIT is
+-- issued after the last ALTER TABLE near the end of this script.
+SET AUTOCOMMIT = 0;
+START TRANSACTION;
+-- Use UTC as the session time zone while the script runs.
+SET time_zone = "+00:00";
+
+
+-- Version-gated executable comments: remember the client's character set and
+-- collation (restored by the matching statements at the end of the script),
+-- then switch the connection to utf8mb4.
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+
+--
+-- Database: `imdbscrapper`
+--
+-- Created only when missing; everything below is created inside it.
+CREATE DATABASE IF NOT EXISTS `imdbscrapper` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
+USE `imdbscrapper`;
+
+DELIMITER $$
+--
+-- Procedures
+--
+-- insertMovie: adds one row to `movies`.
+-- The arguments map one-to-one onto the table's columns, in the order
+-- idMovie, name, description, imdbURL, rating, ratingCount, releaseDate.
+-- No duplicate handling is done here; `idMovie` is the table's primary key.
+CREATE DEFINER=`root`@`%` PROCEDURE `insertMovie` (
+    IN `idMovie`     BIGINT(20),
+    IN `name`        VARCHAR(255),
+    IN `description` LONGTEXT,
+    IN `imdbURL`     VARCHAR(255),
+    IN `rating`      DOUBLE,
+    IN `ratingCount` BIGINT(20),
+    IN `releaseDate` DATE
+)
+BEGIN
+    INSERT INTO `movies` (
+        `idMovie`,
+        `name`,
+        `description`,
+        `imdbURL`,
+        `rating`,
+        `ratingCount`,
+        `releaseDate`
+    )
+    VALUES (
+        idMovie,
+        name,
+        description,
+        imdbURL,
+        rating,
+        ratingCount,
+        releaseDate
+    );
+END$$
+
+-- insertMovieGenre: links a movie to one genre by adding a row to `moviesGenre`.
+-- `idGenre` is a VARCHAR and is stored exactly as passed by the caller.
+-- The row's own `id` is not supplied here and is left to AUTO_INCREMENT.
+CREATE DEFINER=`root`@`%` PROCEDURE `insertMovieGenre` (
+    IN `idMovie` BIGINT(20),
+    IN `idGenre` VARCHAR(255)
+)
+BEGIN
+    INSERT INTO `moviesGenre` (
+        `idMovie`,
+        `idGenre`
+    )
+    VALUES (
+        idMovie,
+        idGenre
+    );
+END$$
+
+-- insertRecheck: records one number in `recheck`.`number`.
+-- The row's own `id` is not supplied here and is left to AUTO_INCREMENT.
+CREATE DEFINER=`root`@`%` PROCEDURE `insertRecheck` (
+    IN `inNumber` BIGINT(20)
+)
+BEGIN
+    INSERT INTO `recheck` (`number`)
+    VALUES (inNumber);
+END$$
+
+-- insertSerie: adds one row to `series`.
+-- The arguments map one-to-one onto the table's columns, in the order
+-- idSerie, name, description, imdbURL, rating, ratingCount, releaseDate.
+-- No duplicate handling is done here; `idSerie` is the table's primary key.
+CREATE DEFINER=`root`@`%` PROCEDURE `insertSerie` (
+    IN `idSerie`     BIGINT(20),
+    IN `name`        VARCHAR(255),
+    IN `description` LONGTEXT,
+    IN `imdbURL`     VARCHAR(255),
+    IN `rating`      DOUBLE,
+    IN `ratingCount` BIGINT(20),
+    IN `releaseDate` DATE
+)
+BEGIN
+    INSERT INTO `series` (
+        `idSerie`,
+        `name`,
+        `description`,
+        `imdbURL`,
+        `rating`,
+        `ratingCount`,
+        `releaseDate`
+    )
+    VALUES (
+        idSerie,
+        name,
+        description,
+        imdbURL,
+        rating,
+        ratingCount,
+        releaseDate
+    );
+END$$
+
+-- insertSerieGenre: links a serie to one genre by adding a row to `seriesGenre`.
+-- `idGenre` is a VARCHAR and is stored exactly as passed by the caller.
+-- The row's own `id` is not supplied here and is left to AUTO_INCREMENT.
+CREATE DEFINER=`root`@`%` PROCEDURE `insertSerieGenre` (
+    IN `idSerie` BIGINT(20),
+    IN `idGenre` VARCHAR(255)
+)
+BEGIN
+    INSERT INTO `seriesGenre` (
+        `idSerie`,
+        `idGenre`
+    )
+    VALUES (
+        idSerie,
+        idGenre
+    );
+END$$
+
+-- retrieveMovieByName: returns every `movies` row whose name equals `movieName`.
+-- The result set carries all seven columns of the table.
+CREATE DEFINER=`root`@`%` PROCEDURE `retrieveMovieByName` (
+    IN `movieName` VARCHAR(255)
+)
+BEGIN
+    SELECT
+        m.idMovie,
+        m.name,
+        m.description,
+        m.imdbURL,
+        m.rating,
+        m.ratingCount,
+        m.releaseDate
+    FROM movies AS m
+    WHERE m.name = movieName;
+END$$
+
+-- retrieveSerieByName: returns every `series` row whose name equals `serieName`.
+-- The result set carries all seven columns of the table.
+CREATE DEFINER=`root`@`%` PROCEDURE `retrieveSerieByName` (
+    IN `serieName` VARCHAR(255)
+)
+BEGIN
+    SELECT
+        s.idSerie,
+        s.name,
+        s.description,
+        s.imdbURL,
+        s.rating,
+        s.ratingCount,
+        s.releaseDate
+    FROM series AS s
+    WHERE s.name = serieName;
+END$$
+
+DELIMITER ;
+
+-- --------------------------------------------------------
+
+--
+-- Tables
+--
+-- Each table declares its primary key and AUTO_INCREMENT inline and is created
+-- with IF NOT EXISTS, so the table section can be replayed against a database
+-- that already contains the tables without failing on "table exists" or
+-- "multiple primary key" errors.
+--
+
+--
+-- Table structure for table `genres`
+--
+
+CREATE TABLE IF NOT EXISTS `genres` (
+  `id` bigint(20) NOT NULL AUTO_INCREMENT,
+  `name` varchar(255) NOT NULL,
+  `description` longtext DEFAULT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `movies`
+--
+-- `movies_name_idx` backs the equality lookup done by retrieveMovieByName.
+--
+
+CREATE TABLE IF NOT EXISTS `movies` (
+  `idMovie` bigint(20) NOT NULL AUTO_INCREMENT,
+  `name` varchar(255) NOT NULL,
+  `description` longtext DEFAULT NULL,
+  `imdbURL` varchar(255) NOT NULL,
+  `rating` double DEFAULT NULL,
+  `ratingCount` bigint(20) DEFAULT NULL,
+  `releaseDate` date DEFAULT NULL,
+  PRIMARY KEY (`idMovie`),
+  KEY `movies_name_idx` (`name`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `moviesGenre`
+--
+-- `idGenre` is a VARCHAR filled by insertMovieGenre with whatever value the
+-- caller passes; it is not declared as a foreign key to `genres`.`id`.
+--
+
+CREATE TABLE IF NOT EXISTS `moviesGenre` (
+  `id` bigint(20) NOT NULL AUTO_INCREMENT,
+  `idMovie` bigint(20) NOT NULL,
+  `idGenre` varchar(255) NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `recheck`
+--
+
+CREATE TABLE IF NOT EXISTS `recheck` (
+  `id` bigint(20) NOT NULL AUTO_INCREMENT,
+  `number` bigint(20) NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `series`
+--
+-- `series_name_idx` backs the equality lookup done by retrieveSerieByName.
+--
+
+CREATE TABLE IF NOT EXISTS `series` (
+  `idSerie` bigint(20) NOT NULL AUTO_INCREMENT,
+  `name` varchar(255) NOT NULL,
+  `description` longtext DEFAULT NULL,
+  `imdbURL` varchar(255) NOT NULL,
+  `rating` double DEFAULT NULL,
+  `ratingCount` bigint(20) DEFAULT NULL,
+  `releaseDate` date DEFAULT NULL,
+  PRIMARY KEY (`idSerie`),
+  KEY `series_name_idx` (`name`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `seriesGenre`
+--
+-- `idGenre` is a VARCHAR filled by insertSerieGenre with whatever value the
+-- caller passes; it is not declared as a foreign key to `genres`.`id`.
+--
+
+CREATE TABLE IF NOT EXISTS `seriesGenre` (
+  `id` bigint(20) NOT NULL AUTO_INCREMENT,
+  `idSerie` bigint(20) NOT NULL,
+  `idGenre` varchar(255) NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- Closes the transaction opened by START TRANSACTION at the top of the script.
+COMMIT;
+
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
--- a/src/scrapper/scrapper.py
+++ b/src/scrapper/scrapper.py
@ -5,7 +5,7 @@ import json
 import requests
 import csv
 import logging
-import mysql.connector
+import mysql.connector as mariadb
 from multiprocessing import Process
 from bs4 import BeautifulSoup

@ -39,6 +39,68 @@ def saveToFile(dataTable, dataPath):
        except Exception as e:
            print("Retrying updating - %s - %s" % (dataPath, e))

+def checkForDuplicate(idCheck):
+    '''
+        Checks whether a title ID is already stored, by calling the
+        'checkDuplicateMovie' stored procedure with the ID.
+
+        Returns False when the procedure returned at least one row (duplicate),
+        True otherwise. Callers test the result with "is False".
+
+        NOTE(review): 'checkDuplicateMovie' is not among the procedures created
+        by src/back/imdbscrapper.sql - confirm it exists in the database.
+    '''
+    mydb = createDBConnection()
+    # Stays empty when the procedure produces no result set at all
+    result = []
+    try:
+        cursor = mydb.cursor(buffered=True)
+        # [TODO] Should be changed to one procedure call only
+        cursor.callproc('checkDuplicateMovie', [idCheck,])
+        for results in cursor.stored_results():
+            result = results.fetchall()
+        mydb.commit()
+    finally:
+        # Always release the connection, even when the call above fails
+        mydb.close()
+    return len(result) == 0
+
+
+# Creates and returns a mariadb connection object
+# Connection settings are read from the environment (DB_HOST, DB_USER,
+# DB_PASSWORD, DB_NAME); when a variable is not set, the previous hard-coded
+# value is used, so existing deployments keep working unchanged
+def createDBConnection():
+    mydb = mariadb.connect(
+        host = os.getenv('DB_HOST', 'imdbdb'),
+        user = os.getenv('DB_USER', 'root'),
+        password = os.getenv('DB_PASSWORD', 'secret'),
+        database = os.getenv('DB_NAME', 'imdbscrapper')
+    )
+    return mydb
+
+# Commits the pending work on the given connection and closes it
+# The connection is closed even when the commit raises; the commit error
+# still propagates to the caller
+def commitDBConnection(database):
+    try:
+        database.commit()
+    finally:
+        database.close()
+
+
+def saveToDatabase(dataTable, inTable):
+    '''
+        Stores every row of dataTable through the insert procedures of inTable.
+
+        dataTable is a list of rows laid out as:
+            row[0] idMovie
+            row[1] name
+            row[2] description
+            row[3] url
+            row[4] genres (a list of genres, a single genre, or a placeholder)
+            row[5] rating
+            row[6] ratingCount
+            row[7] releaseDate
+        inTable is either 'movies' or 'series'; anything else raises ValueError.
+
+        All rows are committed together; if any call fails nothing is committed
+        and the error propagates to the caller.
+    '''
+    # Defines which procedures to call: (main row, genre row)
+    procedures = {
+        'movies': ('insertMovie', 'insertMovieGenre'),
+        'series': ('insertSerie', 'insertSerieGenre'),
+    }
+    if inTable not in procedures:
+        raise ValueError("Unknown destination table - %s" % inTable)
+    mainTable, genreTable = procedures[inTable]
+
+    # Nothing to store, no need to open a connection
+    if not dataTable:
+        return
+
+    # [TODO] Change to dynamic values from docker-compose.yml
+    mydb = createDBConnection()
+    try:
+        cursor = mydb.cursor(buffered=True)
+        for row in dataTable:
+            cursor.callproc(mainTable, [row[0],row[1],row[2],row[3],row[5],row[6],row[7],])
+            # A list stores one genre row per entry (including one-entry lists);
+            # any other value is stored once, as text, instead of being
+            # iterated character by character
+            genres = row[4]
+            if not isinstance(genres, (list, tuple)):
+                genres = [str(genres)]
+            for genre in genres:
+                cursor.callproc(genreTable, [row[0],genre,])
+        mydb.commit()
+    finally:
+        # Always release the connection, even when an insert fails
+        mydb.close()
+
+
 '''
    Main Function for the scrapper
    It will prepare the URL, request it, get the answer, parse the information to a list,
@ -47,11 +109,12 @@ def saveToFile(dataTable, dataPath):
    Requires a starting and an ending URL (in Int), going in decreasing order (eg: 1000, 999, 998, etc)
 '''
 def imdbscrapper(startURL, endURL):
+
    # Configuration values for the scrapper
    baseURL         = "https://www.imdb.com/title/tt"       # Base URL for each title
    debugLevel      = 40                                    # 20 will display Info messages, 40 errors
-    logFile         = "/opt/storage/info.log"               # Log output
-    #counterFile     = "/opt/storage/counter.txt"            # Which ID was last scanned
+    #logFile         = "/opt/storage/info.log"               # Log output
+    counterFile     = "/opt/storage/counter.txt"            # Which ID was last scanned
    reCheckFile     = "/opt/storage/recheck.txt"            # Which IDs to recheck
    moviesFile      = "/opt/storage/movies.csv"             # Where to store movie info
    seriesFile      = "/opt/storage/series.csv"             # Where to store shows/series info
@ -69,7 +132,17 @@ def imdbscrapper(startURL, endURL):
        dataRow    = []                            # Initializes the dataRow list
        errorRow   = []                            # Initializes the errorRow list
        reCheckRow = []                            # Initializes the reCheckRow list
+
+        # Assume Non Duplicate
+        duplicateTest = True
+        # Test for Duplicate
+        duplicateTest = checkForDuplicate(titleFixed)
+
+        # If a duplicate is found, skip number
+        if duplicateTest is False:
+            continue
        try:
+            dataRow.append(titleFixed)
            # Requests, parses and loads into JSON the HTML response
            html = requests.get(url).text
            soup = BeautifulSoup(html, 'html.parser')
@ -89,19 +162,19 @@ def imdbscrapper(startURL, endURL):
            try:
                dataRow.append(data['genre'])
            except Exception as e:
-                dataRow.append("Genre Unknown")
+                dataRow.append(0)
            try:
                dataRow.append(data['aggregateRating']['ratingValue'])
            except Exception as e:
-                dataRow.append("No Rating")
+                dataRow.append(0)
            try:
                dataRow.append(data['aggregateRating']['ratingCount'])
            except Exception as e:
-                dataRow.append("Total Rating Count N/A")
+                dataRow.append(0)
            try:
                dataRow.append(data['datePublished'])
            except Exception as e:
-                dataRow.append("Unknown")
+                dataRow.append('1000-01-01')

            # Checks if its a movie or a serie/show, and append the list to the list of lists
            if(data['@type'] == 'Movie'):
@ -119,13 +192,18 @@ def imdbscrapper(startURL, endURL):
                reCheckRow.append(recheckString)
                reCheckTable.append(reCheckRow)
    # Writes the list of lists to each correct file
-    saveToFile(movieTable, moviesFile)
-    saveToFile(serieTable, seriesFile)
-    saveToFile(errorTable, logFile)
-    saveToFile(reCheckTable, reCheckFile)
+    #saveToFile(movieTable, moviesFile)
+    #saveToFile(serieTable, seriesFile)
+    #saveToFile(errorTable, logFile)
+    #saveToFile(reCheckTable, reCheckFile)
+    saveToDatabase(movieTable, 'movies')
+    saveToDatabase(serieTable, 'series')

 def main():
    cls()
+
+    #imdbscrapper(903747,903743)
+
    nrProcesses     = int(os.getenv('PROCESSES', 5))         # Number of Processes to start in parallel
    startURL        = int(os.getenv('START_URL', 10000000))  # Starting Number
    endURL          = int(os.getenv('END_URL', 0))           # Ending Number
@ -157,5 +235,6 @@ def main():
            process.join()


+
 if __name__ == "__main__":
    main()