Initial commit

This commit is contained in:
NSawyer
2024-02-08 12:53:40 -05:00
commit d724975359
21 changed files with 1090 additions and 0 deletions

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto

14
.gitignore vendored Normal file
View File

@@ -0,0 +1,14 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

5
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"rust-analyzer.linkedProjects": [
".\\Cargo.toml"
]
}

35
Cargo.toml Normal file
View File

@@ -0,0 +1,35 @@
[package]
# Cargo's manifest key is the plural `authors` (an array); the singular
# `author` is not a recognized key and is ignored with a warning.
authors = ["Nayan Sawyer"]
name = "w10s_webscraper"
version = "0.1.0"
description = "----\n\nAn example webscraper\nfolder defaults to current directory"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
chrono = { version = "0.4.26", features = ["clock"] }
clap = { version = "4.3.9", features = ["derive"] }
env_logger = "0.10.0"
filetime_creation = "0.1.5"
function_name = "0.3.0"
hex-literal = "0.4.1"
json = "0.12.4"
lazy_static = "1.4.0"
log = "0.4.19"
log4rs = "1.2.0"
md5 = "0.7.0"
reqwest = { version = "0.11.18", features = ["blocking"] }
scraper = "0.16.0"
serde-value = "0.7.0"

# Explicit dev-profile settings (these match Cargo's defaults for `dev`).
[profile.dev]
opt-level = 0
debug = true
debug-assertions = true
overflow-checks = true
lto = false
panic = 'unwind'
incremental = true
codegen-units = 256
rpath = false

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 NSawyer
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

2
README.md Normal file
View File

@@ -0,0 +1,2 @@
# webscraper_clean
An example webscraper written in Rust

7
config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"download": true,
"url": "",
"scan": true,
"image_directory": "images",
"test": true,
"verbosity": 0
}

0
log/my.log Normal file
View File

14
notes.txt Normal file
View File

@@ -0,0 +1,14 @@
arguments
debug
target directory
scan directory
-vertical
File format
------------
name:
hash:
date_added:
blacklist:

108
src/argparse.rs Normal file
View File

@@ -0,0 +1,108 @@
use crate::{json_code, named};
use clap::Parser;
use std::path::PathBuf;
// Command-line arguments, parsed by clap's derive macro.
// (Plain `//` comments are used deliberately: `///` doc comments would be
// picked up by clap as per-flag help text and change --help output.)
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[derive(Debug)]
pub struct Cli {
    // Image folder; parse_args falls back to "images" when absent
    #[arg(short, long, value_name = "FILE")]
    pub folder: Option<PathBuf>,
    // Base URL of the site to scrape
    #[arg(short, long, value_name = "URL")]
    pub url: Option<String>,
    // Reconcile the json database with the files on disk
    #[arg(short, long)]
    pub scan: bool,
    // Turn debugging information on (counted: repeat for more verbosity)
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbosity: u8,
    // Download images not already present in the database
    #[arg(short, long)]
    pub download: bool,
    // Turn testing mode on
    #[arg(short, long)]
    pub test: bool,
    // Write the resolved configuration to config.json and exit
    #[arg(short, long)]
    pub create_config_file: bool,
}
// Resolved runtime configuration, built either from config.json or from the
// command-line arguments (config.json takes precedence — see parse_args).
#[derive(Debug)]
pub struct Config {
    // Console log verbosity: 0 = info, 1 = debug, 2 = trace (see logging.rs)
    pub verbosity: u8,
    // Base URL of the site to scrape
    pub url: String,
    // Whether to reconcile the database with the files on disk
    pub scan: bool,
    // Directory holding the downloaded images and hashes.json
    pub image_directory: PathBuf,
    // Whether to download new images
    pub download: bool,
    // Testing mode: only the first page is scanned for links (see web.rs)
    pub test: bool,
}
#[named]
/// Resolves the runtime configuration.
///
/// If `config.json` exists in the working directory it wins outright: it is
/// parsed and the command line is ignored. Otherwise the CLI arguments are
/// parsed; at least one of `--scan` / `--download` must be given. With
/// `--create-config-file` the resolved config is written out and the process
/// exits.
pub fn parse_args() -> Config {
    // If config file is present, read it and use it instead of the command line
    if let Ok(json) = std::fs::read_to_string("config.json") {
        let json: json::JsonValue = match json::parse(&json) {
            Ok(json) => json,
            Err(e) => {
                println!("fn {} - Error parsing config.json: {}", function_name!(), e);
                std::process::exit(1);
            }
        };
        return json_code::parse_config_json(json);
    }
    // Parse command line arguments
    let cli = Cli::parse();
    if !cli.scan && !cli.download {
        println!(
            "fn {} - You must specify either --scan, --download, or --help",
            function_name!()
        );
        std::process::exit(1);
    }
    let mut config = Config {
        download: cli.download,
        url: cli.url.unwrap_or_default(),
        scan: cli.scan,
        image_directory: cli.folder.unwrap_or_else(|| PathBuf::from("images")),
        test: cli.test,
        // Propagate -v counts; previously this was never copied from the CLI,
        // so the verbosity flag had no effect.
        verbosity: cli.verbosity,
    };
    // If the image directory is the default, create it if it doesn't exist
    if config.image_directory == PathBuf::from("images") && !config.image_directory.exists() {
        std::fs::create_dir(&config.image_directory).unwrap();
    }
    // If create_config_file is true, create the config file and exit
    if cli.create_config_file {
        let json = json_code::make_config_json(config);
        std::fs::write("config.json", json.pretty(2)).unwrap();
        println!("fn {} - Created config file", function_name!());
        std::process::exit(0);
    }
    config
}

36
src/download.rs Normal file
View File

@@ -0,0 +1,36 @@
use crate::{est_time, fileio, named, percentage, web, ImageBox, CONFIG};
use log::{error, info, warn};
#[named]
/// Downloads every image on the site that is not already in `old_images` and
/// writes it into the configured image directory. Returns only the newly
/// downloaded images; the caller merges them with the existing list.
pub fn download_images(old_images: &Vec<ImageBox>) -> Vec<ImageBox> {
    info!(target: "w10s_webscraper", "fn {} - Downloading images", function_name!());
    // Get the page links from the website
    let page_links = web::get_page_links(&CONFIG.url, &old_images);
    // Get the image data from each page of the website, keeping data on only the new images
    let images: Vec<ImageBox> = web::get_image_data(page_links);
    let mut percent = percentage::Percentage::new(images.len() as usize);
    info!(target: "w10s_webscraper", "fn {} - Downloading and writing {} images", function_name!(), images.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(images.len()));
    // Download and write each new image
    for image in &images {
        match fileio::write_image(&image, &CONFIG.image_directory) {
            Ok(_) => {}
            Err(error) => {
                // Failures are logged and skipped so one bad image does not
                // stop the whole run.
                match error.kind() {
                    std::io::ErrorKind::AlreadyExists => {
                        warn!(target: "w10s_webscraper", "fn {} - Image already exists, skipping: {}", function_name!(), image.hash);
                    }
                    std::io::ErrorKind::Other => {
                        // write_image wraps fetch failures as ErrorKind::Other
                        warn!(target: "w10s_webscraper", "fn {} - Error fetching image bytes, skipping: {}", function_name!(), error);
                    }
                    _ => {
                        error!(target: "w10s_webscraper", "fn {} - Error writing image, skipping: {}", function_name!(), error);
                    } // No need "continue", as the image is not written
                }
            }
        };
        percent.update(function_name!());
    }
    images
}

137
src/fileio.rs Normal file
View File

@@ -0,0 +1,137 @@
use chrono::Local;
use filetime_creation::{set_file_ctime, FileTime};
use json::JsonValue;
use log::{debug, error, info, trace};
use md5;
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::Error;
use std::path::Path;
use crate::named;
use crate::percentage::Percentage;
use crate::web::fetch::fetch_image_bytes;
use crate::ImageBox;
#[named]
/// Downloads the bytes for `image` and writes them to
/// `<image_directory>/<title>.jpg`, then back-dates the file's creation time
/// to the image's date.
///
/// # Errors
/// Returns I/O errors from creating/writing the file or setting its creation
/// time; fetch failures are wrapped as `ErrorKind::Other`.
pub fn write_image(image: &ImageBox, image_directory: &Path) -> Result<(), Error> {
    // `with_extension` expects the extension WITHOUT the leading dot; passing
    // ".jpg" produced a double-dotted file name like "title..jpg".
    let image_path = image_directory.join(&image.title).with_extension("jpg");
    // Create the image file (truncates if it already exists)
    let mut out = File::create(&image_path)?;
    // Fetch the image bytes, wrapping transport errors as io::Error so the
    // caller can treat all failures uniformly
    let content = match fetch_image_bytes(&image.url) {
        Ok(content) => content,
        Err(error) => {
            return Err(Error::new(
                std::io::ErrorKind::Other,
                format!("Error fetching image bytes from {}: {}", &image.url, error),
            ));
        }
    };
    // Write the image bytes to the image file
    out.write_all(&content)?;
    // Set the creation date of the file to the date of the image
    let image_time = FileTime::from_unix_time(image.date.timestamp(), 0);
    set_file_ctime(&image_path, image_time)?;
    trace!(target: "w10s_webscraper", "{} - Image {} written", function_name!(), image.title);
    Ok(())
}
#[named]
/// Scans `image_directory` and returns an `ImageBox` for every `.jpg` file,
/// with the hash computed (md5) from the actual file bytes. Non-jpg files and
/// unreadable entries are logged and skipped. Panics if the directory itself
/// cannot be read.
pub fn read_images(image_directory: &Path) -> Vec<ImageBox> {
    // reads the image directory and returns a vector of ImageBox structs with hashes from the actual images
    let mut images: Vec<ImageBox> = Vec::new();
    // Iterate over the files in the image directory
    let files = fs::read_dir(&image_directory)
        .unwrap_or_else(|error| {
            error!(target: "w10s_webscraper", "fn {} - Error reading image directory: {}", function_name!(), error);
            panic!(
                "{} - Error reading image directory: {}",
                function_name!(),
                error
            )
        })
        .collect::<Vec<_>>();
    let mut percent = Percentage::new(files.len());
    info!(target: "w10s_webscraper", "fn {} - Reading {} files", function_name!(), files.len());
    for file in files {
        let file = match file {
            Ok(file) => file,
            Err(error) => {
                error!(target: "w10s_webscraper", "fn {} - Error reading file, skipping: {}", function_name!(), error);
                continue;
            }
        };
        // If the file is a jpg, read the hash from the file and add it to the vector
        let jpg = match file.path().extension() {
            Some(str) => {
                if str == "jpg" {
                    true
                } else {
                    trace!(target: "w10s_webscraper", "fn {} - File is not a jpg, skipping: {}", function_name!(), file.path().display());
                    false
                }
            }
            _ => {
                trace!(target: "w10s_webscraper", "fn {} - File has no extension?, skipping: {}", function_name!(), file.path().display());
                false
            }
        };
        if jpg {
            // Hash the file contents; the hash is the identity used to match
            // on-disk images against database entries
            let hash = md5::compute(std::fs::read(file.path()).unwrap());
            let image_box = ImageBox {
                // "scanned" marks entries discovered on disk rather than downloaded
                url: "scanned".to_string(),
                date: Local::now(),
                title: file.file_name().into_string().unwrap(),
                hash: format!("{:x}", hash),
                blacklisted: false,
            };
            images.push(image_box);
            trace!(
                target: "w10s_webscraper",
                "fn {} - Image {} read",
                function_name!(),
                &file.file_name().into_string().unwrap()
            );
        }
        percent.update(function_name!());
    }
    images
}
#[named]
/// Reads and parses `<image_directory>/hashes.json`.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be opened or read
/// (callers check for `NotFound` on first run), and `ErrorKind::InvalidData`
/// if the contents are not valid JSON.
pub fn read_json(image_directory: &Path) -> Result<JsonValue, Error> {
    let path = image_directory.join("hashes.json");
    trace!(
        target: "w10s_webscraper",
        "fn {} - Reading json file, expect confirmation",
        function_name!()
    );
    let mut file = File::open(path)?;
    trace!(target: "w10s_webscraper", "fn {} - json file read", function_name!());
    let mut buf = String::new();
    file.read_to_string(&mut buf)?;
    // Surface malformed JSON as an io::Error instead of panicking via unwrap,
    // so the caller's error handling path is reached.
    let json = json::parse(&buf)
        .map_err(|e| Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
    debug!(target: "w10s_webscraper", "fn {} - Loaded json file", function_name!());
    Ok(json)
}
#[named]
pub fn write_json(path: &Path, json: JsonValue) {
let json = json.pretty(2);
let path = path.join("hashes.json");
// Create file, and overwrite it if it exists
let mut file = match File::create(path) {
Ok(file) => file,
Err(error) => panic!("{} - Error creating json file: {}", function_name!(), error),
};
match file.write_all(json.as_bytes()) {
Ok(_) => (),
Err(error) => panic!("{} - Error writing json file: {}", function_name!(), error),
};
debug!(target: "w10s_webscraper", "fn {} - Wrote json file", function_name!());
}

41
src/image_data.rs Normal file
View File

@@ -0,0 +1,41 @@
use chrono::{DateTime, Local};
/// Metadata record for a single scraped image.
// Clone was previously a hand-written field-by-field impl; deriving it is
// equivalent and less code. Debug is derived as well for diagnostics.
#[derive(Clone, Debug)]
pub struct ImageBox {
    // Source URL, or "scanned" for images discovered on disk
    pub url: String,
    // Date the image was published / added
    pub date: DateTime<Local>,
    // Title, used as the on-disk file name
    pub title: String,
    // Content hash (hex); this alone defines image identity
    pub hash: String,
    // Blacklisted entries stay in the database but are never re-downloaded
    pub blacklisted: bool,
}

impl std::fmt::Display for ImageBox {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "{{\n\turl: {}\n\tdate: {}\n\ttitle: {}\n\thash: {}\n\tblacklisted: {}\n}}",
            self.url,
            self.date.date_naive(),
            self.title,
            self.hash,
            self.blacklisted
        )
    }
}

// Two images are equal when their hashes match, regardless of URL/title/date.
impl PartialEq for ImageBox {
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}

107
src/json_code.rs Normal file
View File

@@ -0,0 +1,107 @@
use std::path::PathBuf;
use crate::{image_data::ImageBox, named, Config};
use chrono::DateTime;
use json::{array, object, JsonValue};
use log::{debug, trace, warn};
/*
The json code is pretty much all just converting structs to json and back again.
*/
#[named]
/// Serializes the runtime `Config` into the JSON object shape read back by
/// `parse_config_json`. Panics if the image directory path is not valid UTF-8.
pub fn make_config_json(config: Config) -> JsonValue {
    debug!(target: "w10s_webscraper", "{} - Converting config data to json", function_name!());
    object! {
        "download": config.download,
        "url": config.url.to_string(),
        "scan": config.scan,
        "image_directory": config.image_directory.to_str().unwrap(),
        "test": config.test,
        "verbosity": config.verbosity,
    }
}
#[named]
/// Builds a `Config` from a parsed `config.json` object.
///
/// Missing or mistyped keys fall back to the same defaults `parse_args` uses
/// (`false` for the flags, `"images"` for the directory, `0` for verbosity)
/// instead of panicking on `unwrap` — notably, older config files may not
/// carry a "verbosity" key at all.
pub fn parse_config_json(json: JsonValue) -> Config {
    debug!(target: "w10s_webscraper", "{} - Parsing config json", function_name!());
    // Default the directory rather than producing a literal "null" path
    let image_directory = if json["image_directory"].is_null() {
        PathBuf::from("images")
    } else {
        PathBuf::from(json["image_directory"].to_string())
    };
    let config = Config {
        download: json["download"].as_bool().unwrap_or(false),
        url: json["url"].to_string(),
        scan: json["scan"].as_bool().unwrap_or(false),
        image_directory,
        test: json["test"].as_bool().unwrap_or(false),
        verbosity: json["verbosity"].as_u8().unwrap_or(0),
    };
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    config
}
#[named]
/// Converts the image list into the JSON document persisted as `hashes.json`.
pub fn make_image_json(images: Vec<ImageBox>) -> JsonValue {
    trace!(target: "w10s_webscraper", "{} - Converting image data to json", function_name!());
    let mut doc = object! {
        "info": r#"A file with "blacklist" = "true" means that the image entry will remain in the database, but will not be downloaded. This allows you to delete a photo and not download it again. Blacklisted images will keep their entries when the image is absent and you run a file scan."#,
        "images": array![]
    };
    for entry in images {
        // Dates are stored as RFC 2822 and parsed back by parse_image_json
        let record = object! {
            "hash": entry.hash,
            "date_added": entry.date.to_rfc2822(),
            "url": entry.url,
            "title": entry.title,
            "blacklisted": entry.blacklisted,
        };
        doc["images"].push(record).unwrap();
    }
    trace!(target: "w10s_webscraper", "{} - Finished conversion", function_name!());
    doc
}
#[named]
/// Parses the "images" array of a hashes.json document back into `ImageBox`
/// structs. Dates are stored as RFC 2822; an unparseable date is logged and
/// replaced with the unix epoch rather than aborting the load.
pub fn parse_image_json(json: JsonValue) -> Vec<ImageBox> {
    debug!(target: "w10s_webscraper", "{} - Parsing image json", function_name!());
    let mut images: Vec<ImageBox> = Vec::new();
    for image in json["images"].members() {
        let image_box = ImageBox {
            url: image["url"].to_string(),
            date: DateTime::from(
                match DateTime::parse_from_rfc2822(image["date_added"].to_string().as_str()) {
                    Ok(date) => date,
                    Err(error) => {
                        warn!(
                            target: "w10s_webscraper",
                            "{} - Error parsing date, defaulting to unix 0: {}",
                            function_name!(),
                            error
                        );
                        // Epoch fallback keeps one bad entry from killing the run
                        DateTime::parse_from_rfc2822("Thu, 01 Jan 1970 00:00:00 +0000").unwrap()
                    }
                },
            ),
            title: image["title"].to_string(),
            hash: image["hash"].to_string(),
            // NOTE(review): a missing or non-boolean "blacklisted" key panics
            // here — confirm every persisted entry carries it
            blacklisted: image["blacklisted"].as_bool().unwrap(),
        };
        images.push(image_box);
    }
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    images
}

56
src/logging.rs Normal file
View File

@@ -0,0 +1,56 @@
use crate::CONFIG;
use log::LevelFilter;
use log4rs::append::console::ConsoleAppender;
use log4rs::append::rolling_file::policy::compound::roll::fixed_window::FixedWindowRoller;
use log4rs::append::rolling_file::policy::compound::trigger::size::SizeTrigger;
use log4rs::append::rolling_file::policy::compound::CompoundPolicy;
use log4rs::append::rolling_file::RollingFileAppender;
use log4rs::config::{Appender, Config, Logger, Root};
use log4rs::encode::pattern::PatternEncoder;
use log4rs::filter::threshold::ThresholdFilter;
//use crate::CONFIG;
/// Builds and installs the global log4rs configuration.
///
/// The console appender is filtered by CONFIG.verbosity (1 = Debug, 2 = Trace,
/// anything else = Info); the rolling file appender always logs at Trace for
/// the "w10s_webscraper" target, rolling at 50 KiB with up to 50 archives.
pub fn initialize_logging() -> log4rs::Handle {
    // Map the -v count onto a console level filter
    let console_level: LevelFilter = match &CONFIG.verbosity {
        1 => LevelFilter::Debug,
        2 => LevelFilter::Trace,
        _ => LevelFilter::Info,
    };
    // Console output; the threshold filter is attached when the appender is
    // registered below
    let console = ConsoleAppender::builder()
        .encoder(Box::new(PatternEncoder::new("{h({l})}: {m}{n}")))
        .build();
    // Rolling file output: archives named log/my{N}.log, max 50 kept
    let roller = FixedWindowRoller::builder()
        .build("log/my{}.log", 50)
        .unwrap();
    let policy: CompoundPolicy =
        CompoundPolicy::new(Box::new(SizeTrigger::new(50 * 1024)), Box::new(roller));
    let file_appender = RollingFileAppender::builder()
        .encoder(Box::new(PatternEncoder::new(
            "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}",
        )))
        .build("log/my.log", Box::new(policy))
        .unwrap();
    let config = Config::builder()
        .appender(
            Appender::builder()
                .filter(Box::new(ThresholdFilter::new(console_level)))
                .build("stdout", Box::new(console)),
        )
        .appender(Appender::builder().build("file_logger", Box::new(file_appender)))
        .logger(
            Logger::builder()
                // additive = false prevents doubled output through the root logger
                .additive(false)
                .appender("stdout")
                .appender("file_logger")
                .build("w10s_webscraper", LevelFilter::Trace),
        )
        .build(Root::builder().appender("stdout").build(LevelFilter::Warn))
        .unwrap();
    log4rs::init_config(config).unwrap()
}

82
src/main.rs Normal file
View File

@@ -0,0 +1,82 @@
use argparse::Config;
pub use function_name::named;
use json::JsonValue;
use lazy_static::lazy_static;
pub use log::{debug, error, info, trace, warn};
pub use log::{Level, LevelFilter};
mod download;
mod fileio;
mod image_data;
mod logging;
pub mod percentage;
mod scan;
pub use image_data::ImageBox;
pub mod argparse;
mod json_code;
mod web;
use json_code::make_image_json;
lazy_static! {
    // Global configuration, resolved once on first access. config.json, when
    // present, wins over the command line — see argparse::parse_args.
    static ref CONFIG: Config = argparse::parse_args();
}
#[named]
/// Entry point: loads the image database, optionally reconciles it with the
/// files on disk (scan), optionally downloads new images (download), then
/// writes the merged database back out as hashes.json.
fn main() {
    logging::initialize_logging();
    info!(target: "w10s_webscraper", "{} - Starting", function_name!());
    trace!(target: "w10s_webscraper", "{} - Beginning of json loading", function_name!());
    // Load json database of existing images in the image directory
    let json = match fileio::read_json(&CONFIG.image_directory) {
        Ok(json) => json,
        Err(error) => {
            // A missing database is expected on the first run; anything else is fatal
            if error.kind() == std::io::ErrorKind::NotFound {
                warn!(target: "w10s_webscraper", "{} - Json file not found, will create one", function_name!());
                JsonValue::new_object()
            } else {
                error!(target: "w10s_webscraper", "{} - Error reading json file: {}", function_name!(), error);
                panic!("{} - Error reading json file: {}", function_name!(), error);
            }
        }
    };
    // Parse json into a vector of ImageBox structs
    let mut old_images: Vec<ImageBox> = json_code::parse_image_json(json);
    trace!(target: "w10s_webscraper", "{} - End of json loading", function_name!());
    // Create an empty vector of ImageBox structs to hold the new images
    let mut images: Vec<ImageBox> = Vec::new();
    trace!(target: "w10s_webscraper",
        "{} - CONFIG.scan={}, 0: download only, 2: scan only, 1: do both",
        function_name!(),
        CONFIG.scan
    );
    // Determine if we need to download images, scan the image directory, or both
    if CONFIG.scan {
        old_images = scan::scan(&mut old_images);
    }
    if CONFIG.download {
        images = download::download_images(&old_images);
    }
    trace!(target: "w10s_webscraper", "{} - Merging old and new image data", function_name!());
    // Merge the old and new image data (old entries appended after the new ones)
    images.append(&mut old_images);
    trace!(target: "w10s_webscraper", "{} - Writing json", function_name!());
    // Write the new json file
    let json = make_image_json(images);
    trace!(target: "w10s_webscraper", "{} - Writing json to file", function_name!());
    fileio::write_json(&CONFIG.image_directory, json);
    info!(target: "w10s_webscraper", "{} - Finished", function_name!());
}
/// Rough wall-clock estimate for processing `count` items at ~5.1 seconds
/// each, formatted in minutes, switching to hours once the estimate exceeds
/// 120 minutes.
pub fn est_time(count: usize) -> String {
    // 5.1 s per item (empirical request + throttle time), expressed in minutes
    let minutes = (count as f64 * 5.1) / 60.0;
    if minutes > 120.0 {
        format!("{} hours", minutes / 60.0)
    } else {
        format!("{} minutes", minutes)
    }
}

69
src/percentage.rs Normal file
View File

@@ -0,0 +1,69 @@
use function_name::named;
use log::{debug, info};
/// Tracks progress through a fixed-size task and logs percentage milestones.
pub struct Percentage {
    /*
    This struct is used to track the percentage of a task that has been completed.
    It is used to print progress to the console.
    */
    // Next completion fraction (0.0-1.0) at which a progress line is logged
    threshold: f32,
    // How far the threshold advances after each logged milestone
    step_size: f32,
    // Total number of items in the task
    total: usize,
    // Number of items processed so far
    count: usize,
    // Completion fraction exposed via get_percent()
    percent: f32,
}
impl Percentage {
    #[named]
    /// Creates a tracker for `total` items, choosing a log step size suited to
    /// the task size (per-item below 10 items, 5% above 100, 1% above 1000).
    pub fn new(total: usize) -> Percentage {
        let mut step = 0.1;
        let mut threshold = 0.1;
        if total == 0 {
            debug!(
                "fn {} - Percentage::new() called with total = 0",
                function_name!()
            );
        }
        // If there are less than 10 items, set the step size to whatever percentage of the total each item represents
        if total < 10 {
            step = 1.0 / total as f32;
        }
        // If there are greater than 100 items, set the step size and threshold to 5%
        if total > 100 {
            step = 0.05;
            threshold = 0.05;
        }
        // If there are greater than 1000 items, set the step size and threshold to 1%
        if total > 1000 {
            step = 0.01;
            threshold = 0.01;
        }
        Percentage {
            threshold,
            step_size: step,
            total,
            count: 0,
            percent: 0.0,
        }
    }
    /// Latest completion fraction (0.0-1.0) recorded by `update`.
    pub fn get_percent(&self) -> f32 {
        self.percent
    }
    /// Total number of items this tracker was created with.
    pub fn get_total(&self) -> usize {
        self.total
    }
    /// Records one completed item and logs a progress line whenever the
    /// completion fraction crosses the current threshold.
    pub fn update(&mut self, fn_name: &str) {
        if self.total == 0 {
            info!(target: "w10s_webscraper", "fn {} - Percentage done: zero items", fn_name);
        }
        // Update the progress
        self.count += 1;
        // Store the fraction on self so get_percent() reflects real progress;
        // previously this field was never written and always read 0.0.
        self.percent = self.count as f32 / self.total as f32;
        // If the percentage is greater than the threshold, print the percentage
        if self.percent >= self.threshold {
            info!(target: "w10s_webscraper", "fn {} - {:.0}%", fn_name, self.percent * 100.0);
            // Update the threshold
            self.threshold += self.step_size;
        }
    }
}

53
src/scan.rs Normal file
View File

@@ -0,0 +1,53 @@
use crate::{named, CONFIG, ImageBox, fileio};
use log::info;
#[named]
/// Reconciles the JSON database (`old_images`) with the images actually on
/// disk: entries whose file is gone are dropped (unless blacklisted), and
/// on-disk files with no entry are added. Returns the reconciled list.
pub fn scan(old_images: &mut Vec<ImageBox>) -> Vec<ImageBox> {
    // Scan image directory for existing images
    info!(target: "w10s_webscraper", "fn {} - Scanning directory for existing images", function_name!());
    let scanned_images: Vec<ImageBox> = fileio::read_images(&CONFIG.image_directory);
    if scanned_images.is_empty() {
        info!(target: "w10s_webscraper", "fn {} - No images found in directory, stopping scan", function_name!());
        return old_images.to_vec();
    }
    // Drop database entries whose image is no longer on disk, keeping
    // blacklisted entries so they are never re-downloaded. retain replaces
    // the previous collect-indices-then-remove-in-reverse pass (O(n^2)).
    old_images.retain(|image| {
        image.blacklisted
            || scanned_images
                .iter()
                .any(|scanned| scanned.hash == image.hash)
    });
    info!(target: "w10s_webscraper", "fn {} - Purged absent images from database", function_name!());
    // Add images that are in the directory, but not in the json
    for image in &scanned_images {
        let known = old_images.iter().any(|old| old.hash == image.hash);
        if !known {
            old_images.push(image.clone());
        }
    }
    info!(target: "w10s_webscraper", "fn {} - Added new images to database", function_name!());
    old_images.to_vec()
}

84
src/web.rs Normal file
View File

@@ -0,0 +1,84 @@
use crate::ImageBox;
use crate::CONFIG;
pub mod fetch;
pub mod html;
use crate::percentage::Percentage;
use crate::{est_time, named};
use log::{error, info};
#[named]
/// Collects links to individual image pages, starting at `url` and walking
/// the site's numbered pages. Links whose hash is already in `old_images`
/// are skipped; in test mode only the first page is scanned. Panics if the
/// primary page cannot be fetched.
pub fn get_page_links(url: &str, old_images: &Vec<ImageBox>) -> Vec<String> {
    info!(target: "w10s_webscraper", "fn {} - Collecting page links", function_name!());
    let html = fetch::fetch_html(url).unwrap_or_else(|error| {
        panic!("Problem fetching primary page: {}", error);
    });
    let mut page_links: Vec<String> = html::extract_image_page_links(&html, &old_images);
    if !CONFIG.test {
        // Not in test mode: walk every numbered page
        let page_count = html::extract_page_count(html);
        let mut percent = Percentage::new(page_count as usize);
        info!(target: "w10s_webscraper", "fn {} - Scanning {} pages for links", function_name!(), page_count);
        info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(page_count as usize));
        let mut fully_skipped_page_count = 0;
        // Page 1 was already scanned above. The range is inclusive so the
        // final page is visited too (2..page_count silently dropped it).
        for i in 2..=page_count {
            let url = format!("{}page/{}/", url, i);
            let html = match fetch::fetch_html(&url) {
                Ok(html) => html,
                Err(error) => {
                    error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                    continue;
                }
            };
            let mut new_links: Vec<String> =
                html::extract_image_page_links(&html, &old_images).to_vec();
            // Count consecutive pages that yielded nothing new; once more than
            // five in a row, assume we've caught up and stop early
            if new_links.is_empty() {
                fully_skipped_page_count += 1;
            } else {
                fully_skipped_page_count = 0;
            }
            if fully_skipped_page_count > 5 {
                info!(target: "w10s_webscraper", "fn {} - No new images found for five pages, stopping", function_name!());
                break;
            }
            page_links.append(&mut new_links);
            percent.update(function_name!());
        }
    }
    page_links
}
#[named]
/// Visits each image page URL and extracts an `ImageBox` (image link, title,
/// date, and hash) for it; pages that fail to fetch are logged and skipped.
pub fn get_image_data(urls: Vec<String>) -> Vec<ImageBox> {
    let mut progress = Percentage::new(urls.len());
    info!(target: "w10s_webscraper", "fn {} - Collecting data on {} images", function_name!(), urls.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(urls.len()));
    let mut collected: Vec<ImageBox> = Vec::new();
    for url in urls {
        let html = match fetch::fetch_html(&url) {
            Ok(html) => html,
            Err(error) => {
                error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                continue;
            }
        };
        let image_link = html::extract_image_url(&html);
        let image_title = html::extract_image_title(&html);
        let image_date = html::extract_image_date(&html);
        // The page's hash is its URL's final path segment
        let image_hash = url.split("/").last().unwrap().to_string();
        collected.push(ImageBox {
            url: image_link,
            date: image_date,
            title: image_title,
            hash: image_hash,
            blacklisted: false,
        });
        progress.update(function_name!());
    }
    collected
}

74
src/web/fetch.rs Normal file
View File

@@ -0,0 +1,74 @@
use crate::named;
use lazy_static::lazy_static;
use log::{debug, trace};
use reqwest::Error;
use scraper::Html;
use std::sync::Mutex;
use std::time::Instant;
/*
do_throttled_request is heavily inspired by https://github.com/gregstoll/rust-scraping, but I've made a lot of changes
*/
lazy_static! {
    // Timestamp of the most recent HTTP request, shared across callers so
    // throttling applies process-wide
    static ref LAST_REQUEST_MUTEX: Mutex<Option<Instant>> = Mutex::new(None);
    // Minimum spacing enforced between consecutive requests
    static ref REQUEST_DELAY: std::time::Duration = std::time::Duration::from_millis(500);
}
/// Performs a blocking GET of `url`, enforcing at least `REQUEST_DELAY`
/// between consecutive requests and retrying up to five more times on
/// transport failure.
pub fn do_throttled_request(url: &str) -> Result<reqwest::blocking::Response, Error> {
    // Sleep just long enough that REQUEST_DELAY has elapsed since the
    // previous request.
    fn delay() {
        let last_request = LAST_REQUEST_MUTEX.lock().unwrap().take();
        if let Some(last_request) = last_request {
            // Time since the previous request finished. (Comparing the stored
            // timestamp against itself yields zero and never sleeps — the
            // elapsed wall-clock time is what must be measured.)
            let elapsed = last_request.elapsed();
            if elapsed < *REQUEST_DELAY {
                std::thread::sleep(*REQUEST_DELAY - elapsed);
            }
        }
    }
    // First request
    delay();
    let mut resp = reqwest::blocking::get(url);
    LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    // Up to 5 retries on failure; the last result (ok or err) is returned
    for _ in 0..5 {
        if resp.is_ok() {
            break;
        }
        delay();
        resp = reqwest::blocking::get(url);
        LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    }
    resp
}
#[named]
/// Fetches `url` (throttled) and parses the response body as an HTML document.
///
/// # Errors
/// Returns the transport error if the request fails or the body cannot be
/// read (the body read previously panicked via `unwrap`).
pub fn fetch_html(url: &str) -> Result<scraper::Html, Error> {
    trace!(target: "w10s_webscraper", "fn {} - Fetching HTML from {}", function_name!(), url);
    let resp = match do_throttled_request(url) {
        Ok(resp) => resp,
        Err(e) => {
            debug!(target: "w10s_webscraper", "fn {} - Error fetching HTML from {}", function_name!(), url);
            return Err(e);
        }
    };
    // text() returns the same reqwest::Error type, so `?` propagates it to
    // callers instead of aborting the process
    let body = resp.text()?;
    Ok(Html::parse_document(&body))
}
/// Downloads `url` (throttled) and returns the raw response body bytes.
pub fn fetch_image_bytes(url: &str) -> Result<Vec<u8>, reqwest::Error> {
    let body = do_throttled_request(url)?.bytes()?;
    Ok(body.to_vec())
}

143
src/web/html.rs Normal file
View File

@@ -0,0 +1,143 @@
use chrono::{DateTime, Local};
use log::{debug, error, warn};
use scraper::{Html, Selector};
use crate::{named, ImageBox};
#[named]
pub fn extract_page_count(html: Html) -> i32 {
let mut page_count: i32 = 0;
// select only the links that are page numbers
let selector = Selector::parse(r#"a[class="page-numbers"]"#).unwrap();
let links = html.select(&selector);
// iterate over the links and find the highest page number
for link in links {
let href = link.value().attr("href").unwrap();
// get the last element of the link, which is the page number
let raw = href.split("/").collect::<Vec<&str>>();
let last = raw.last().unwrap();
let last = last.parse::<i32>().unwrap(); // cast the last element to an i32
if last > page_count {
page_count = last;
}
}
debug!(target: "w10s_webscraper", "fn {} - Extracted page count: {}", function_name!(), page_count);
page_count
}
#[named]
/// Extracts links to image pages from `html`, skipping comment anchors,
/// duplicates, and images already present in `old_images` (matched by the
/// link's final path segment, which is the image hash).
pub fn extract_image_page_links(html: &Html, old_images: &Vec<ImageBox>) -> Vec<String> {
    let mut page_links: Vec<String> = Vec::new();
    let selector = Selector::parse(r#"a[href]"#).unwrap();
    let links = html.select(&selector);
    for link in links {
        // get the href attribute
        let href: String = link.value().attr("href").unwrap().to_string();
        // the image hash is the last path segment of the link
        let hash: String = href
            .split("/")
            .collect::<Vec<&str>>()
            .last()
            .unwrap()
            .to_string();
        // Keep only image links that are not comment anchors, not already
        // collected this pass, and not already present in the database
        let already_known = old_images.iter().any(|x| x.hash == hash);
        if href.contains("/images/")
            && !href.contains("#respond")
            && !href.contains("#comments")
            && !page_links.contains(&href)
            && !already_known
        {
            page_links.push(href);
        }
    }
    debug!(target: "w10s_webscraper", "fn {} - Extracted {} links: {:?}", function_name!(), page_links.len(), page_links);
    page_links
}
/// Scans every <img> tag and returns the full-size jpg URL derived from the
/// 1024x576 thumbnail's src; returns an empty string when no tag matches.
/// If several tags match, the last one wins.
pub fn extract_image_url(html: &Html) -> String {
    // A selector object that knows how to find every <img> tag
    let selector = Selector::parse("img").unwrap();
    let mut output: String = String::new();
    for image in html.select(&selector) {
        // raw src attribute of this image tag
        let src = image.value().attr("src").unwrap();
        // only jpg thumbnails under wp-content/uploads at the 1024x576 size
        if src.contains("jpg") && src.contains("wp-content/uploads/") && src.contains("1024x576") {
            // The full-size URL is everything before the second '-' (the
            // first '-' belongs to "wp-content"), with ".jpg" appended
            let pieces = src.split("-").collect::<Vec<&str>>();
            output = format!("{}-{}.jpg", pieces[0], pieces[1]);
        }
    }
    output
}
#[named]
/// Extracts the image's date from the page's <span class="date"> element,
/// interpreted as noon at UTC-5 on that day. Falls back to the current local
/// time (with an error log) when no date element is present or the text
/// cannot be parsed.
pub fn extract_image_date(html: &Html) -> DateTime<Local> {
    let selector = Selector::parse(r#"span[class="date"]"#).unwrap();
    let html_dates = html.select(&selector);
    let mut dates: Vec<String> = Vec::new();
    for date in html_dates {
        let date = date.text().collect::<Vec<_>>();
        dates.push(date[0].to_string())
    }
    if dates.len() > 1 {
        warn!(target: "w10s_webscraper", "{} - More than one date found on page", function_name!());
    }
    // Guard against pages with no date at all — indexing dates[0] would panic
    if dates.is_empty() {
        error!(
            target: "w10s_webscraper",
            "{} - No date found on page, using local now",
            function_name!()
        );
        return Local::now();
    }
    // date comes out of the html as "2020-01-01"; append a time and timezone
    // so it can be parsed into a DateTime
    let mut datetime: String = dates[0].to_string();
    datetime.push_str(" 12:00:00 -0500");
    match DateTime::parse_from_str(&datetime, "%Y-%m-%d %H:%M:%S %z") {
        Ok(datetime) => datetime.with_timezone(&Local),
        Err(e) => {
            error!(
                target: "w10s_webscraper",
                "{} - Error parsing date, using local now: {}",
                function_name!(),
                e
            );
            Local::now()
        }
    }
}
#[named]
/// Returns the page's <title> text, truncated at the first " | " separator
/// (dropping everything after it, e.g. a site-name suffix).
pub fn extract_image_title(html: &Html) -> String {
    let selector = Selector::parse(r#"title"#).unwrap();
    let titles = html.select(&selector);
    let mut output: Vec<String> = Vec::new();
    for title in titles {
        let title = title.text().collect::<Vec<_>>();
        output.push(title[0].to_string())
    }
    if output.len() > 1 {
        warn!(
            target: "w10s_webscraper",
            "{} - More than one title found. Using the first one ({})",
            function_name!(),
            output[0]
        );
    }
    // NOTE(review): output[0] panics when the page has no <title> element —
    // confirm every fetched page carries one
    output[0]
        .split(" | ")
        .collect::<Vec<&str>>()
        .first()
        .unwrap()
        .to_string()
}