From d7249753594e97a71d9e64dd26fc363cf928a1e9 Mon Sep 17 00:00:00 2001 From: NSawyer <33187059+GShadow5@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:53:40 -0500 Subject: [PATCH] Initial commit --- .gitattributes | 2 + .gitignore | 14 +++++ .vscode/settings.json | 5 ++ Cargo.toml | 35 +++++++++++ LICENSE | 21 +++++++ README.md | 2 + config.json | 7 +++ log/my.log | 0 notes.txt | 14 +++++ src/argparse.rs | 108 +++++++++++++++++++++++++++++++ src/download.rs | 36 +++++++++++ src/fileio.rs | 137 ++++++++++++++++++++++++++++++++++++++++ src/image_data.rs | 41 ++++++++++++ src/json_code.rs | 107 +++++++++++++++++++++++++++++++ src/logging.rs | 56 +++++++++++++++++ src/main.rs | 82 ++++++++++++++++++++++++ src/percentage.rs | 69 ++++++++++++++++++++ src/scan.rs | 53 ++++++++++++++++ src/web.rs | 84 +++++++++++++++++++++++++ src/web/fetch.rs | 74 ++++++++++++++++++++++ src/web/html.rs | 143 ++++++++++++++++++++++++++++++++++++++++++ 21 files changed, 1090 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 config.json create mode 100644 log/my.log create mode 100644 notes.txt create mode 100644 src/argparse.rs create mode 100644 src/download.rs create mode 100644 src/fileio.rs create mode 100644 src/image_data.rs create mode 100644 src/json_code.rs create mode 100644 src/logging.rs create mode 100644 src/main.rs create mode 100644 src/percentage.rs create mode 100644 src/scan.rs create mode 100644 src/web.rs create mode 100644 src/web/fetch.rs create mode 100644 src/web/html.rs diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6985cf1 --- /dev/null +++ b/.gitignore @@ 
-0,0 +1,14 @@ +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ec17418 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "rust-analyzer.linkedProjects": [ + ".\\Cargo.toml" + ] +} \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d46d66c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,35 @@ +[package] +authors = ["Nayan Sawyer"] +name = "w10s_webscraper" +version = "0.1.0" +description = "----\n\nAn example webscraper\nfolder defaults to current directory" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = { version = "0.4.26", features = ["clock"] } +clap = { version = "4.3.9", features = ["derive"] } +env_logger = "0.10.0" +filetime_creation = "0.1.5" +function_name = "0.3.0" +hex-literal = "0.4.1" +json = "0.12.4" +lazy_static = "1.4.0" +log = "0.4.19" +log4rs = "1.2.0" +md5 = "0.7.0" +reqwest = { version = "0.11.18", features = ["blocking"] } +scraper = "0.16.0" +serde-value = "0.7.0" + +[profile.dev] +opt-level = 0 +debug = true +debug-assertions = true +overflow-checks = true +lto = false +panic = 'unwind' +incremental = true +codegen-units = 256 +rpath = false diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1d0f42a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 NSawyer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation 
files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..48cd680 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# webscraper_clean + An example webscraper written in Rust diff --git a/config.json b/config.json new file mode 100644 index 0000000..9c2fb84 --- /dev/null +++ b/config.json @@ -0,0 +1,7 @@ +{ + "download": true, + "url": "", + "scan": true, + "image_directory": "images", + "test": true +} \ No newline at end of file diff --git a/log/my.log b/log/my.log new file mode 100644 index 0000000..e69de29 diff --git a/notes.txt b/notes.txt new file mode 100644 index 0000000..d23e859 --- /dev/null +++ b/notes.txt @@ -0,0 +1,14 @@ +arguments + +debug +target directory +scan directory +-vertical + + +File format +------------ +name: +hash: +date_added: +blacklist: \ No newline at end of file diff --git a/src/argparse.rs b/src/argparse.rs new file mode 100644 index 0000000..b7a758c --- /dev/null +++ b/src/argparse.rs @@ -0,0 +1,108 @@ +use crate::{json_code, named}; +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser)] +#[command(author, version, 
about, long_about = None)] +#[derive(Debug)] +pub struct Cli { + #[arg(short, long, value_name = "FILE")] + pub folder: Option, + + #[arg(short, long, value_name = "URL")] + pub url: Option, + + #[arg(short, long)] + pub scan: bool, + + // Turn debugging information on + #[arg(short, long, action = clap::ArgAction::Count)] + pub verbosity: u8, + + #[arg(short, long)] + pub download: bool, + + // Turn testing mode on + #[arg(short, long)] + pub test: bool, + + #[arg(short, long)] + pub create_config_file: bool, +} + +#[derive(Debug)] +pub struct Config { + pub verbosity: u8, + pub url: String, + pub scan: bool, + pub image_directory: PathBuf, + pub download: bool, + pub test: bool, +} + +#[named] +pub fn parse_args() -> Config { + // If config file is present, read it and use it to override the command line arguments + if let Ok(json) = std::fs::read_to_string("config.json") { + let json: json::JsonValue = match json::parse(&json) { + Ok(json) => json, + Err(e) => { + println!("fn {} - Error parsing config.json: {}", function_name!(), e); + std::process::exit(1); + } + }; + let config = json_code::parse_config_json(json); + return config; + } + // Parse command line arguments + let cli = Cli::parse(); + let mut config = Config { + download: false, + url: String::from(""), + scan: false, + image_directory: PathBuf::from("images"), + test: false, + verbosity: 0, + }; + + if cli.scan == false && cli.download == false { + println!( + "fn {} - You must specify either --scan, --download, or --help", + function_name!() + ); + std::process::exit(1); + } + if let Some(path) = cli.folder.as_deref() { + config.image_directory = path.to_path_buf(); + } + if cli.scan { + config.scan = cli.scan; + } + if let Some(url) = cli.url.as_deref() { + config.url = url.to_string(); + } + if cli.test { + config.test = cli.test; + } + + if cli.download { + config.download = cli.download; + } + + // If the image directory is the default, create it if it doesn't exist + if 
config.image_directory.eq(&PathBuf::from("images")) { + if config.image_directory.exists() == false { + std::fs::create_dir(&config.image_directory).unwrap(); + } + } + + // If create_config_file is true, create the config file and exit + if cli.create_config_file { + let json = json_code::make_config_json(config); + let json = json.pretty(2); + std::fs::write("config.json", json).unwrap(); + println!("fn {} - Created config file", function_name!()); + std::process::exit(0); + } + config +} diff --git a/src/download.rs b/src/download.rs new file mode 100644 index 0000000..af14cbc --- /dev/null +++ b/src/download.rs @@ -0,0 +1,36 @@ +use crate::{est_time, fileio, named, percentage, web, ImageBox, CONFIG}; +use log::{error, info, warn}; + +#[named] +pub fn download_images(old_images: &Vec) -> Vec { + info!(target: "w10s_webscraper", "fn {} - Downloading images", function_name!()); + // Get the page links from the website + let page_links = web::get_page_links(&CONFIG.url, &old_images); + // Get the image data from each page of the website, keeping data on only the new images + let images: Vec = web::get_image_data(page_links); + + let mut percent = percentage::Percentage::new(images.len() as usize); + info!(target: "w10s_webscraper", "fn {} - Downloading and writing {} images", function_name!(), images.len()); + info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(images.len())); + // Download and write each new image + for image in &images { + match fileio::write_image(&image, &CONFIG.image_directory) { + Ok(_) => {} + Err(error) => { + match error.kind() { + std::io::ErrorKind::AlreadyExists => { + warn!(target: "w10s_webscraper", "fn {} - Image already exists, skipping: {}", function_name!(), image.hash); + } + std::io::ErrorKind::Other => { + warn!(target: "w10s_webscraper", "fn {} - Error fetching image bytes, skipping: {}", function_name!(), error); + } + _ => { + error!(target: "w10s_webscraper", "fn {} - Error writing 
image, skipping: {}", function_name!(), error); + } // No need "continue", as the image is not written + } + } + }; + percent.update(function_name!()); + } + images +} diff --git a/src/fileio.rs b/src/fileio.rs new file mode 100644 index 0000000..2de561c --- /dev/null +++ b/src/fileio.rs @@ -0,0 +1,137 @@ +use chrono::Local; +use filetime_creation::{set_file_ctime, FileTime}; +use json::JsonValue; +use log::{debug, error, info, trace}; +use md5; +use std::fs::{self, File}; +use std::io::prelude::*; +use std::io::Error; +use std::path::Path; + +use crate::named; +use crate::percentage::Percentage; +use crate::web::fetch::fetch_image_bytes; +use crate::ImageBox; + +#[named] +pub fn write_image(image: &ImageBox, image_directory: &Path) -> Result<(), Error> { + // Join the image_directory path with the image title and .jpg + let image_path = image_directory.join(&image.title).with_extension(".jpg"); + // Create the image file + let mut out = File::create(&image_path)?; + // Fetch the image bytes + let mut content = match fetch_image_bytes(&image.url) { + Ok(content) => content, + Err(error) => { + return Err(Error::new( + std::io::ErrorKind::Other, + format!("Error fetching image bytes from {}: {}", &image.url, error), + )); + } + }; + // Write the image bytes to the image file + out.write_all(&mut content)?; + + // Next we need to set the creation date of the file to the date of the image + let image_time = FileTime::from_unix_time(image.date.timestamp(), 0); + set_file_ctime(&image_path, image_time)?; + trace!(target: "w10s_webscraper", "{} - Image {} written", function_name!(), image.title); + Ok(()) +} + +#[named] +pub fn read_images(image_directory: &Path) -> Vec { + // reads the image directory and returns a vector of ImageBox structs with hashes from the actual images + let mut images: Vec = Vec::new(); + // Iterate over the files in the image directory + let files = fs::read_dir(&image_directory) + .unwrap_or_else(|error| { + error!(target: "w10s_webscraper", 
"fn {} - Error reading image directory: {}", function_name!(), error); + panic!( + "{} - Error reading image directory: {}", + function_name!(), + error + ) + }) + .collect::>(); + let mut percent = Percentage::new(files.len()); + info!(target: "w10s_webscraper", "fn {} - Reading {} files", function_name!(), files.len()); + for file in files { + let file = match file { + Ok(file) => file, + Err(error) => { + error!(target: "w10s_webscraper", "fn {} - Error reading file, skipping: {}", function_name!(), error); + continue; + } + }; + // If the file is a jpg, read the hash from the file and add it to the vector + let jpg = match file.path().extension() { + Some(str) => { + if str == "jpg" { + true + } else { + trace!(target: "w10s_webscraper", "fn {} - File is not a jpg, skipping: {}", function_name!(), file.path().display()); + false + } + } + _ => { + trace!(target: "w10s_webscraper", "fn {} - File has no extension?, skipping: {}", function_name!(), file.path().display()); + false + } + }; + if jpg { + let hash = md5::compute(std::fs::read(file.path()).unwrap()); + let image_box = ImageBox { + url: "scanned".to_string(), + date: Local::now(), + title: file.file_name().into_string().unwrap(), + hash: format!("{:x}", hash), + blacklisted: false, + }; + images.push(image_box); + trace!( + target: "w10s_webscraper", + "fn {} - Image {} read", + function_name!(), + &file.file_name().into_string().unwrap() + ); + } + percent.update(function_name!()); + } + images +} + +#[named] +pub fn read_json(image_directory: &Path) -> Result { + let path = image_directory.join("hashes.json"); + // Read the json from the file and return it + trace!( + target: "w10s_webscraper", + "fn {} - Reading json file, expect confirmation", + function_name!() + ); + let mut file = File::open(path)?; + trace!(target: "w10s_webscraper", "fn {} - json file read", function_name!()); + + let mut buf = String::new(); + file.read_to_string(&mut buf)?; + let json = json::parse(&buf).unwrap(); + 
debug!(target: "w10s_webscraper", "fn {} - Loaded json file", function_name!()); + Ok(json) +} + +#[named] +pub fn write_json(path: &Path, json: JsonValue) { + let json = json.pretty(2); + let path = path.join("hashes.json"); + // Create file, and overwrite it if it exists + let mut file = match File::create(path) { + Ok(file) => file, + Err(error) => panic!("{} - Error creating json file: {}", function_name!(), error), + }; + match file.write_all(json.as_bytes()) { + Ok(_) => (), + Err(error) => panic!("{} - Error writing json file: {}", function_name!(), error), + }; + debug!(target: "w10s_webscraper", "fn {} - Wrote json file", function_name!()); +} diff --git a/src/image_data.rs b/src/image_data.rs new file mode 100644 index 0000000..13ee0c9 --- /dev/null +++ b/src/image_data.rs @@ -0,0 +1,41 @@ +use chrono::{DateTime, Local}; + +pub struct ImageBox { + pub url: String, + pub date: DateTime, + pub title: String, + pub hash: String, + pub blacklisted: bool, +} + +impl std::fmt::Display for ImageBox { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "{{\n\turl: {}\n\tdate: {}\n\ttitle: {}\n\thash: {}\n\tblacklisted: {}\n}}", + self.url, + self.date.date_naive(), + self.title, + self.hash, + self.blacklisted + ) + } +} + +impl PartialEq for ImageBox { + fn eq(&self, other: &Self) -> bool { + self.hash == other.hash + } +} + +impl Clone for ImageBox { + fn clone(&self) -> Self { + ImageBox { + url: self.url.clone(), + date: self.date.clone(), + title: self.title.clone(), + hash: self.hash.clone(), + blacklisted: self.blacklisted.clone(), + } + } +} diff --git a/src/json_code.rs b/src/json_code.rs new file mode 100644 index 0000000..780ffff --- /dev/null +++ b/src/json_code.rs @@ -0,0 +1,107 @@ +use std::path::PathBuf; + +use crate::{image_data::ImageBox, named, Config}; +use chrono::DateTime; +use json::{array, object, JsonValue}; +use log::{debug, trace, warn}; + +/* +The json code is pretty much all just converting structs to 
json and back again. +*/ + +#[named] +pub fn make_config_json(config: Config) -> JsonValue { + debug!(target: "w10s_webscraper", "{} - Converting config data to json", function_name!()); + let download = config.download; + let url = &config.url; + let scan = config.scan; + let image_directory = &config.image_directory; + let image_directory = image_directory.to_str().unwrap(); + let test = config.test; + let verbosity = config.verbosity; + + + let json = object! { + "download": download, + "url": url.to_string(), + "scan": scan, + "image_directory": image_directory, + "test": test, + "verbosity": verbosity, + }; + json +} + +#[named] +pub fn parse_config_json(json: JsonValue) -> Config { + debug!(target: "w10s_webscraper", "{} - Parsing config json", function_name!()); + let download = json["download"].as_bool().unwrap(); + let url = json["url"].to_string(); + let scan = json["scan"].as_bool().unwrap(); + let image_directory = json["image_directory"].to_string(); + let test = json["test"].as_bool().unwrap(); + let verbosity = json["verbosity"].as_u8().unwrap(); + + let config = Config { + download, + url, + scan, + image_directory: PathBuf::from(image_directory), + test, + verbosity, + }; + debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!()); + config +} + +#[named] +pub fn make_image_json(images: Vec) -> JsonValue { + trace!(target: "w10s_webscraper", "{} - Converting image data to json", function_name!()); + let mut json = object! { + "info": r#"A file with "blacklist" = "true" means that the image entry will remain in the database, but will not be downloaded. This allows you to delete a photo and not download it again. Blacklisted images will keep their entries when the image is absent and you run a file scan."#, + "images": array![] + }; + for image in images { + let image_json = object! 
{ + "hash": image.hash, + "date_added": image.date.to_rfc2822(), + "url": image.url, + "title": image.title, + "blacklisted": image.blacklisted, + }; + json["images"].push(image_json).unwrap(); + } + trace!(target: "w10s_webscraper", "{} - Finished conversion", function_name!()); + json +} + +#[named] +pub fn parse_image_json(json: JsonValue) -> Vec { + debug!(target: "w10s_webscraper", "{} - Parsing image json", function_name!()); + let mut images: Vec = Vec::new(); + for image in json["images"].members() { + let image_box = ImageBox { + url: image["url"].to_string(), + date: DateTime::from( + match DateTime::parse_from_rfc2822(image["date_added"].to_string().as_str()) { + Ok(date) => date, + Err(error) => { + warn!( + target: "w10s_webscraper", + "{} - Error parsing date, defaulting to unix 0: {}", + function_name!(), + error + ); + DateTime::parse_from_rfc2822("Thu, 01 Jan 1970 00:00:00 +0000").unwrap() + } + }, + ), + title: image["title"].to_string(), + hash: image["hash"].to_string(), + blacklisted: image["blacklisted"].as_bool().unwrap(), + }; + images.push(image_box); + } + debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!()); + images +} diff --git a/src/logging.rs b/src/logging.rs new file mode 100644 index 0000000..c5e40e6 --- /dev/null +++ b/src/logging.rs @@ -0,0 +1,56 @@ +use crate::CONFIG; +use log::LevelFilter; +use log4rs::append::console::ConsoleAppender; +use log4rs::append::rolling_file::policy::compound::roll::fixed_window::FixedWindowRoller; +use log4rs::append::rolling_file::policy::compound::trigger::size::SizeTrigger; +use log4rs::append::rolling_file::policy::compound::CompoundPolicy; +use log4rs::append::rolling_file::RollingFileAppender; +use log4rs::config::{Appender, Config, Logger, Root}; +use log4rs::encode::pattern::PatternEncoder; +use log4rs::filter::threshold::ThresholdFilter; +//use crate::CONFIG; + +pub fn initialize_logging() -> log4rs::Handle { + // Initialize logging + + let level: LevelFilter = 
match &CONFIG.verbosity { + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + 2 => LevelFilter::Trace, + _ => LevelFilter::Info, + }; + let stdout = ConsoleAppender::builder() + .encoder(Box::new(PatternEncoder::new("{h({l})}: {m}{n}"))) + .build(); // This appender is filtered, but only later + + let roller = FixedWindowRoller::builder() + .build("log/my{}.log", 50) + .unwrap(); + let policy: CompoundPolicy = + CompoundPolicy::new(Box::new(SizeTrigger::new(50 * 1024)), Box::new(roller)); + let file_logger = RollingFileAppender::builder() + .encoder(Box::new(PatternEncoder::new( + "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}", + ))) + .build("log/my.log", Box::new(policy)) + .unwrap(); + + let config = Config::builder() + .appender( + Appender::builder() + .filter(Box::new(ThresholdFilter::new(level))) // This is the filter + .build("stdout", Box::new(stdout)), + ) + .appender(Appender::builder().build("file_logger", Box::new(file_logger))) + .logger( + Logger::builder() + .additive(false) // If additive is true, you get double output from the stdout appender + .appender("stdout") + .appender("file_logger") + .build("w10s_webscraper", LevelFilter::Trace), + ) + .build(Root::builder().appender("stdout").build(LevelFilter::Warn)) + .unwrap(); + let handle = log4rs::init_config(config).unwrap(); + handle +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..9435c6c --- /dev/null +++ b/src/main.rs @@ -0,0 +1,82 @@ +use argparse::Config; +pub use function_name::named; +use json::JsonValue; +use lazy_static::lazy_static; +pub use log::{debug, error, info, trace, warn}; +pub use log::{Level, LevelFilter}; + +mod download; +mod fileio; +mod image_data; +mod logging; +pub mod percentage; +mod scan; +pub use image_data::ImageBox; +pub mod argparse; +mod json_code; +mod web; +use json_code::make_image_json; + +lazy_static! 
{ + static ref CONFIG: Config = argparse::parse_args(); +} + +#[named] +fn main() { + logging::initialize_logging(); + + info!(target: "w10s_webscraper", "{} - Starting", function_name!()); + trace!(target: "w10s_webscraper", "{} - Beginning of json loading", function_name!()); + // Load json database of existing images in the image directory + let json = match fileio::read_json(&CONFIG.image_directory) { + Ok(json) => json, + Err(error) => { + if error.kind() == std::io::ErrorKind::NotFound { + warn!(target: "w10s_webscraper", "{} - Json file not found, will create one", function_name!()); + JsonValue::new_object() + } else { + error!(target: "w10s_webscraper", "{} - Error reading json file: {}", function_name!(), error); + panic!("{} - Error reading json file: {}", function_name!(), error); + } + } + }; + // Parse json into a vector of ImageBox structs + let mut old_images: Vec = json_code::parse_image_json(json); + trace!(target: "w10s_webscraper", "{} - End of json loading", function_name!()); + + // Create an empty vector of ImageBox structs to hold the new images + let mut images: Vec = Vec::new(); + + trace!(target: "w10s_webscraper", + "{} - CONFIG.scan={}, 0: download only, 2: scan only, 1: do both", + function_name!(), + CONFIG.scan + ); + // Determine if we need to download images, scan the image directory, or both + if CONFIG.scan { + old_images = scan::scan(&mut old_images); + } + if CONFIG.download { + images = download::download_images(&old_images); + } + + trace!(target: "w10s_webscraper", "{} - Merging old and new image data", function_name!()); + // Merge the old and new image data + images.append(&mut old_images); + trace!(target: "w10s_webscraper", "{} - Writing json", function_name!()); + + // Write the new json file + let json = make_image_json(images); + trace!(target: "w10s_webscraper", "{} - Writing json to file", function_name!()); + fileio::write_json(&CONFIG.image_directory, json); + info!(target: "w10s_webscraper", "{} - Finished", 
function_name!()); +} + +pub fn est_time(vector: usize) -> String { + let est_time = (vector as f64 * 5.1) / 60 as f64; + if est_time > 120.0 { + format!("{} hours", est_time / 60.0) + } else { + format!("{} minutes", est_time) + } +} diff --git a/src/percentage.rs b/src/percentage.rs new file mode 100644 index 0000000..9bbd73c --- /dev/null +++ b/src/percentage.rs @@ -0,0 +1,69 @@ +use function_name::named; +use log::{debug, info}; +pub struct Percentage { + /* + This struct is used to track the percentage of a task that has been completed. + It is used to print progress to the console. + */ + threshold: f32, + step_size: f32, + total: usize, + count: usize, + percent: f32, +} + +impl Percentage { + #[named] + pub fn new(total: usize) -> Percentage { + let mut step = 0.1; + let mut threshold = 0.1; + if total == 0 { + debug!( + "fn {} - Percentage::new() called with total = 0", + function_name!() + ); + } + // If there are less than 10 items, set the step size to whatever percentage of the total each item represents + if total < 10 { + step = 1.0 / total as f32; + } + // If there are greater than 100 items, set the step size and threshold to 5% + if total > 100 { + step = 0.05; + threshold = 0.05; + } + // If there are greater than 1000 items, set the step size and threshold to 1% + if total > 1000 { + step = 0.01; + threshold = 0.01; + } + Percentage { + threshold, + step_size: step, + total, + count: 0, + percent: 0.0, + } + } + pub fn get_percent(&self) -> f32 { + self.percent + } + pub fn get_total(&self) -> usize { + self.total + } + pub fn update(&mut self, fn_name: &str) { + if self.total == 0 { + info!(target: "w10s_webscraper", "fn {} - Percentage done: zero items", fn_name); + } + // Update the progress + self.count += 1; + // Calculate the percentage + let percent: f32 = self.count as f32 / self.total as f32; + // If the percentage is greater than the threshold, print the percentage + if percent >= self.threshold { + info!(target: "w10s_webscraper", "fn 
{} - {:.0}%", fn_name, percent * 100.0); + // Update the threshold + self.threshold += self.step_size; + } + } +} diff --git a/src/scan.rs b/src/scan.rs new file mode 100644 index 0000000..18be4ad --- /dev/null +++ b/src/scan.rs @@ -0,0 +1,53 @@ +use crate::{named, CONFIG, ImageBox, fileio}; +use log::info; + +#[named] +pub fn scan(old_images: &mut Vec) -> Vec { + // Scan image directory for existing images + info!(target: "w10s_webscraper", "fn {} - Scanning directory for existing images", function_name!()); + let scanned_images: Vec = fileio::read_images(&CONFIG.image_directory); + if scanned_images.len() == 0 { + info!(target: "w10s_webscraper", "fn {} - No images found in directory, stopping scan", function_name!()); + return old_images.to_vec(); + } + let mut indexies_to_remove: Vec = Vec::new(); + let mut pos: usize = 0; + // Determine the json entries that are not present in the directory + for image in &*old_images { + let mut is_present = false; + for scanned_image in &scanned_images { + if image.hash == scanned_image.hash { + is_present = true; + } + } + if !is_present && image.blacklisted == false { + indexies_to_remove.push(pos) + } + pos += 1; + } + // Remove the entries from the json + // Remove in reverse order to avoid index issues + indexies_to_remove.reverse(); + for index in indexies_to_remove { + old_images.remove(index); + } + info!(target: "w10s_webscraper", "fn {} - Purged absent images from database", function_name!()); + + // Add images that are in the directory, but not in the json + for image in &scanned_images { + let mut is_old = false; + for old_image in &*old_images { + if image.hash == old_image.hash { + is_old = true; + break; + } + } + + if !is_old { + old_images.push(image.clone()); + } + } + info!(target: "w10s_webscraper", "fn {} - Added new images to database", function_name!()); + + old_images.to_vec() +} \ No newline at end of file diff --git a/src/web.rs b/src/web.rs new file mode 100644 index 0000000..c987c4c --- /dev/null 
+++ b/src/web.rs @@ -0,0 +1,84 @@ +use crate::ImageBox; +use crate::CONFIG; +pub mod fetch; +pub mod html; +use crate::percentage::Percentage; +use crate::{est_time, named}; +use log::{error, info}; + +#[named] +pub fn get_page_links(url: &str, old_images: &Vec) -> Vec { + info!(target: "w10s_webscraper", "fn {} - Collecting page links", function_name!()); + let html = fetch::fetch_html(url).unwrap_or_else(|error| { + panic!("Problem fetching primary page: {}", error); + }); + let mut page_links: Vec = html::extract_image_page_links(&html, &old_images); + if CONFIG.test == false { + // if debug is false, then we want to get all of the pages + let page_count = html::extract_page_count(html); + let mut percent = Percentage::new(page_count as usize); + info!(target: "w10s_webscraper", "fn {} - Scanning {} pages for links", function_name!(), page_count); + info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(page_count as usize)); + let mut fully_skipped_page_count = 0; + // inclusive upper bound: page_count is the highest page number, so it must be scanned too + for i in 2..=page_count { + let url = format!("{}page/{}/", url, i); + let html = match fetch::fetch_html(&url) { + Ok(html) => html, + Err(error) => { + error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error); + continue; + } + }; + let mut new_links: Vec = + html::extract_image_page_links(&html, &old_images).to_vec(); + + // If we get no new links five pages in a row, then we can skip the rest of the pages + if new_links.len() == 0 { + fully_skipped_page_count += 1; + } + if new_links.len() > 0 { + fully_skipped_page_count = 0; + } + if fully_skipped_page_count > 5 { + info!(target: "w10s_webscraper", "fn {} - No new images found for five pages, stopping", function_name!()); + break; + } + + page_links.append(&mut new_links); + percent.update(function_name!()); + } + } + + page_links +} + +#[named] +pub fn get_image_data(urls: Vec) -> Vec { + let mut percent = Percentage::new(urls.len()); + info!(target: 
"w10s_webscraper", "fn {} - Collecting data on {} images", function_name!(), urls.len()); + info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(urls.len())); + let mut images: Vec = Vec::new(); + for url in urls { + let html = match fetch::fetch_html(&url) { + Ok(html) => html, + Err(error) => { + error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error); + continue; + } + }; + let image_link = html::extract_image_url(&html); + let image_title = html::extract_image_title(&html); + let image_date = html::extract_image_date(&html); + let image_hash = url.split("/").last().unwrap().to_string(); + let image = ImageBox { + url: image_link, + date: image_date, + title: image_title, + hash: image_hash, + blacklisted: false, + }; + images.push(image); + percent.update(function_name!()); + } + images +} diff --git a/src/web/fetch.rs b/src/web/fetch.rs new file mode 100644 index 0000000..25a0d38 --- /dev/null +++ b/src/web/fetch.rs @@ -0,0 +1,74 @@ +use crate::named; +use lazy_static::lazy_static; +use log::{debug, trace}; +use reqwest::Error; +use scraper::Html; +use std::sync::Mutex; +use std::time::Instant; + +/* + do_throttled_request is heavily inspired by https://github.com/gregstoll/rust-scraping, but I've made a lot of changes +*/ + +lazy_static! 
{ + static ref LAST_REQUEST_MUTEX: Mutex> = Mutex::new(None); + static ref REQUEST_DELAY: std::time::Duration = std::time::Duration::from_millis(500); +} + +pub fn do_throttled_request(url: &str) -> Result { + fn delay() { + let mut last_request_mutex = LAST_REQUEST_MUTEX.lock().unwrap(); + let last_request = last_request_mutex.take(); + // sample the current time so we can measure how long ago the last request was + let now = Instant::now(); + + if let Some(last_request) = last_request { + // elapsed time since the previous request; comparing last_request with itself + // was always zero and disabled the throttle entirely + let duration = now.duration_since(last_request); + if duration < *REQUEST_DELAY { + std::thread::sleep(*REQUEST_DELAY - duration); + } + } + } + + // First request + delay(); + let mut resp = reqwest::blocking::get(url); + LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now()); + // Retry 5 times + if resp.is_err() { + for i in 0..5 { + delay(); + resp = reqwest::blocking::get(url); + LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now()); + + if resp.is_ok() { + break; + } + if i == 4 { + return resp; // Return error after 5 tries + } + } + } + resp +} + +#[named] +pub fn fetch_html(url: &str) -> Result { + trace!(target: "w10s_webscraper", "fn {} - Fetching HTML from {}", function_name!(), url); + let resp = match do_throttled_request(url) { + Ok(resp) => resp, + Err(e) => { + debug!(target: "w10s_webscraper", "fn {} - Error fetching HTML from {}", function_name!(), url); + return Err(e); + } + }; + + let html = resp.text().unwrap(); + let html = Html::parse_document(&html); + Ok(html) +} + +pub fn fetch_image_bytes(url: &str) -> Result, reqwest::Error> { + let resp = do_throttled_request(url)?; + let bytes = resp.bytes()?; + Ok(bytes.to_vec()) +} diff --git a/src/web/html.rs b/src/web/html.rs new file mode 100644 index 0000000..8aa080d --- /dev/null +++ b/src/web/html.rs @@ -0,0 +1,143 @@ +use chrono::{DateTime, Local}; +use log::{debug, error, warn}; +use scraper::{Html, Selector}; + +use crate::{named, ImageBox}; + +#[named] +pub fn extract_page_count(html: Html) -> i32 { + let mut page_count: i32 = 0; + // select only the links that 
are page numbers + let selector = Selector::parse(r#"a[class="page-numbers"]"#).unwrap(); + let links = html.select(&selector); + // iterate over the links and find the highest page number + for link in links { + let href = link.value().attr("href").unwrap(); + // get the last element of the link, which is the page number + let raw = href.split("/").collect::>(); + let last = raw.last().unwrap(); + let last = last.parse::().unwrap(); // cast the last element to an i32 + if last > page_count { + page_count = last; + } + } + debug!(target: "w10s_webscraper", "fn {} - Extracted page count: {}", function_name!(), page_count); + page_count +} + +#[named] +pub fn extract_image_page_links(html: &Html, old_images: &Vec) -> Vec { + let mut page_links: Vec = Vec::new(); + let selector = Selector::parse(r#"a[href]"#).unwrap(); + let links = html.select(&selector); + for link in links { + // get the href attribute + let href: String = link.value().attr("href").unwrap().to_string(); + // get the hash in the link + let hash: String = href + .split("/") + .collect::>() + .last() + .unwrap() + .to_string(); + // if the link is an image, and it is not a comment link, and it is not a duplicate, then add it to the list + let image = old_images.iter().find(|&x| x.hash == hash); + if href.contains("/images/") + && href.contains("#respond") == false + && href.contains("#comments") == false + && page_links.contains(&href) == false + && image == None + { + page_links.push(href.to_string()); + } + } + debug!(target: "w10s_webscraper", "fn {} - Extracted {} links: {:?}", function_name!(), page_links.len(), page_links); + page_links +} + +pub fn extract_image_url(html: &Html) -> String { + // construct a CSS selector that will grab all of the image tags + // This selector is not the html snippets themselves, but rather an object that knows how to select them + let selector = Selector::parse("img").unwrap(); + // use the selector to find all img tags in the document + let images = 
html.select(&selector); + // iterate over the elements (references to tags) that the selector found, and assign the correct one to the output variable + let mut output: String = String::new(); + for image in images { + // get the raw src attribute of the image tag + let src = image.value().attr("src").unwrap(); + // output the src attribute if it contains "jpg" and "wp-content/uploads/" and "1024x576" + if src.contains("jpg") && src.contains("wp-content/uploads/") && src.contains("1024x576") { + //println!("{}", src); + // split the src attribute into a vector of strings, using the "-" character as the delimiter + let tempvec = src.split("-").collect::>(); + // create a new string, and push the first two elements of the vector into it, separated by a "-", and add ".jpg" to the end + let mut temp_s = String::new(); + temp_s.push_str(tempvec[0]); + // this keeps the '-' in ".com/wp-content/upl" + temp_s.push_str("-"); + temp_s.push_str(tempvec[1]); + temp_s.push_str(".jpg"); + output = temp_s; + //print!("{}", output) + } + } + output +} + +#[named] +pub fn extract_image_date(html: &Html) -> DateTime { + let selector = Selector::parse(r#"span[class="date"]"#).unwrap(); + let html_dates = html.select(&selector); + let mut dates: Vec = Vec::new(); + for date in html_dates { + let date = date.text().collect::>(); + dates.push(date[0].to_string()) + } + if dates.len() > 1 { + warn!(target: "w10s_webscraper", "{} - More than one date found on page", function_name!()); + } + // date comes out of the html as "2020-01-01", but we need to add the time and timezone to it + // so we can parse it into a DateTime object + let mut datetime: String = dates[0].to_string(); + datetime.push_str(" 12:00:00 -0500"); + let datetime = match DateTime::parse_from_str(&datetime, "%Y-%m-%d %H:%M:%S %z") { + Ok(datetime) => datetime.with_timezone(&Local), + Err(e) => { + error!( + target: "w10s_webscraper", + "{} - Error parsing date, using local now: {}", + function_name!(), + e + ); + 
Local::now() + } + }; + datetime +} + +#[named] +pub fn extract_image_title(html: &Html) -> String { + let selector = Selector::parse(r#"title"#).unwrap(); + let titles = html.select(&selector); + let mut output: Vec = Vec::new(); + for title in titles { + let title = title.text().collect::>(); + output.push(title[0].to_string()) + } + if output.len() > 1 { + warn!( + target: "w10s_webscraper", + "{} - More than one title found. Using the first one ({})", + function_name!(), + output[0] + ); + } + + output[0] + .split(" | ") + .collect::>() + .first() + .unwrap() + .to_string() +}