Initial commit

This commit is contained in:
NSawyer
2024-02-08 12:53:40 -05:00
commit d724975359
21 changed files with 1090 additions and 0 deletions

108
src/argparse.rs Normal file
View File

@@ -0,0 +1,108 @@
use crate::{json_code, named};
use clap::Parser;
use std::path::PathBuf;
/// Command-line interface, parsed by clap.
///
/// Field doc comments double as `--help` descriptions; the previous plain
/// `//` comments never reached the generated help output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Cli {
    /// Directory in which images are stored
    #[arg(short, long, value_name = "FILE")]
    pub folder: Option<PathBuf>,
    /// Base URL of the site to scrape
    #[arg(short, long, value_name = "URL")]
    pub url: Option<String>,
    /// Scan the image directory and reconcile it with the json database
    #[arg(short, long)]
    pub scan: bool,
    /// Turn debugging information on (-v: debug, -vv: trace)
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbosity: u8,
    /// Download new images
    #[arg(short, long)]
    pub download: bool,
    /// Turn testing mode on (only the site's first page is scanned)
    #[arg(short, long)]
    pub test: bool,
    /// Write the resolved settings to config.json and exit
    #[arg(short, long)]
    pub create_config_file: bool,
}
/// Runtime configuration, resolved either from config.json (when present)
/// or from the command line — see `parse_args`.
#[derive(Debug)]
pub struct Config {
// Console log verbosity: 0 = info, 1 = debug, 2 = trace (see logging.rs)
pub verbosity: u8,
// Base URL of the site to scrape
pub url: String,
// Whether to scan the image directory against the json database
pub scan: bool,
// Directory where images are stored (defaults to "images")
pub image_directory: PathBuf,
// Whether to download new images
pub download: bool,
// Testing mode flag; presumably limits work to the first page — see web.rs
pub test: bool,
}
/// Resolve the runtime [`Config`].
///
/// If `config.json` exists in the working directory it is the sole source of
/// configuration (command-line arguments are ignored, not merged). Otherwise
/// the command line is parsed.
///
/// Exits the process when config.json is malformed, when neither --scan nor
/// --download is given, or after writing config.json for --create-config-file.
#[named]
pub fn parse_args() -> Config {
    // A present config file takes full precedence over the command line.
    if let Ok(json) = std::fs::read_to_string("config.json") {
        let json: json::JsonValue = match json::parse(&json) {
            Ok(json) => json,
            Err(e) => {
                println!("fn {} - Error parsing config.json: {}", function_name!(), e);
                std::process::exit(1);
            }
        };
        return json_code::parse_config_json(json);
    }
    // Parse command line arguments
    let cli = Cli::parse();
    // At least one action is required.
    if !cli.scan && !cli.download {
        println!(
            "fn {} - You must specify either --scan, --download, or --help",
            function_name!()
        );
        std::process::exit(1);
    }
    let mut config = Config {
        download: cli.download,
        url: String::new(),
        scan: cli.scan,
        image_directory: PathBuf::from("images"),
        test: cli.test,
        // Fixed: the -v count was parsed but never copied into the config,
        // so command-line verbosity previously had no effect.
        verbosity: cli.verbosity,
    };
    if let Some(path) = cli.folder.as_deref() {
        config.image_directory = path.to_path_buf();
    }
    if let Some(url) = cli.url.as_deref() {
        config.url = url.to_string();
    }
    // If the image directory is the default, create it when it doesn't exist.
    if config.image_directory == PathBuf::from("images") && !config.image_directory.exists() {
        std::fs::create_dir(&config.image_directory)
            .expect("could not create default image directory");
    }
    // --create-config-file: persist the resolved settings and exit.
    if cli.create_config_file {
        let json = json_code::make_config_json(config);
        std::fs::write("config.json", json.pretty(2)).unwrap();
        println!("fn {} - Created config file", function_name!());
        std::process::exit(0);
    }
    config
}

36
src/download.rs Normal file
View File

@@ -0,0 +1,36 @@
use crate::{est_time, fileio, named, percentage, web, ImageBox, CONFIG};
use log::{error, info, warn};
#[named]
pub fn download_images(old_images: &Vec<ImageBox>) -> Vec<ImageBox> {
info!(target: "w10s_webscraper", "fn {} - Downloading images", function_name!());
// Get the page links from the website
let page_links = web::get_page_links(&CONFIG.url, &old_images);
// Get the image data from each page of the website, keeping data on only the new images
let images: Vec<ImageBox> = web::get_image_data(page_links);
let mut percent = percentage::Percentage::new(images.len() as usize);
info!(target: "w10s_webscraper", "fn {} - Downloading and writing {} images", function_name!(), images.len());
info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(images.len()));
// Download and write each new image
for image in &images {
match fileio::write_image(&image, &CONFIG.image_directory) {
Ok(_) => {}
Err(error) => {
match error.kind() {
std::io::ErrorKind::AlreadyExists => {
warn!(target: "w10s_webscraper", "fn {} - Image already exists, skipping: {}", function_name!(), image.hash);
}
std::io::ErrorKind::Other => {
warn!(target: "w10s_webscraper", "fn {} - Error fetching image bytes, skipping: {}", function_name!(), error);
}
_ => {
error!(target: "w10s_webscraper", "fn {} - Error writing image, skipping: {}", function_name!(), error);
} // No need "continue", as the image is not written
}
}
};
percent.update(function_name!());
}
images
}

137
src/fileio.rs Normal file
View File

@@ -0,0 +1,137 @@
use chrono::Local;
use filetime_creation::{set_file_ctime, FileTime};
use json::JsonValue;
use log::{debug, error, info, trace};
use md5;
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::Error;
use std::path::Path;
use crate::named;
use crate::percentage::Percentage;
use crate::web::fetch::fetch_image_bytes;
use crate::ImageBox;
/// Download the bytes for `image` and write them to
/// `<image_directory>/<title>.jpg`, then back-date the file's creation time
/// to the image's date.
///
/// # Errors
/// Returns an error when the bytes cannot be fetched (mapped to
/// `ErrorKind::Other`), when the file cannot be created or written, or when
/// the creation time cannot be set.
#[named]
pub fn write_image(image: &ImageBox, image_directory: &Path) -> Result<(), Error> {
    // Append ".jpg" explicitly. The previous `with_extension(".jpg")` produced
    // a doubled dot ("title..jpg") and clobbered anything after a '.' that was
    // already in the title.
    let image_path = image_directory.join(format!("{}.jpg", image.title));
    // Fetch the bytes *before* creating the file so a failed fetch does not
    // leave an empty file behind.
    let content = match fetch_image_bytes(&image.url) {
        Ok(content) => content,
        Err(error) => {
            return Err(Error::new(
                std::io::ErrorKind::Other,
                format!("Error fetching image bytes from {}: {}", &image.url, error),
            ));
        }
    };
    let mut out = File::create(&image_path)?;
    out.write_all(&content)?;
    // Set the creation date of the file to the date of the image.
    let image_time = FileTime::from_unix_time(image.date.timestamp(), 0);
    set_file_ctime(&image_path, image_time)?;
    trace!(target: "w10s_webscraper", "{} - Image {} written", function_name!(), image.title);
    Ok(())
}
/// Scan `image_directory` and build an [`ImageBox`] for every ".jpg" file,
/// hashing each file's bytes with md5. Non-jpg files are skipped; unreadable
/// directory entries are logged and skipped.
///
/// NOTE(review): the `std::fs::read(...).unwrap()` below will panic if a file
/// disappears between listing and hashing — confirm that's acceptable.
#[named]
pub fn read_images(image_directory: &Path) -> Vec<ImageBox> {
// reads the image directory and returns a vector of ImageBox structs with hashes from the actual images
let mut images: Vec<ImageBox> = Vec::new();
// Iterate over the files in the image directory
// Panics (after logging) if the directory itself cannot be read.
let files = fs::read_dir(&image_directory)
.unwrap_or_else(|error| {
error!(target: "w10s_webscraper", "fn {} - Error reading image directory: {}", function_name!(), error);
panic!(
"{} - Error reading image directory: {}",
function_name!(),
error
)
})
.collect::<Vec<_>>();
let mut percent = Percentage::new(files.len());
info!(target: "w10s_webscraper", "fn {} - Reading {} files", function_name!(), files.len());
for file in files {
let file = match file {
Ok(file) => file,
Err(error) => {
error!(target: "w10s_webscraper", "fn {} - Error reading file, skipping: {}", function_name!(), error);
continue;
}
};
// If the file is a jpg, read the hash from the file and add it to the vector
let jpg = match file.path().extension() {
Some(str) => {
if str == "jpg" {
true
} else {
trace!(target: "w10s_webscraper", "fn {} - File is not a jpg, skipping: {}", function_name!(), file.path().display());
false
}
}
_ => {
trace!(target: "w10s_webscraper", "fn {} - File has no extension?, skipping: {}", function_name!(), file.path().display());
false
}
};
if jpg {
let hash = md5::compute(std::fs::read(file.path()).unwrap());
// "scanned" marks entries discovered on disk rather than downloaded;
// the date recorded is the scan time, not the image's original date.
let image_box = ImageBox {
url: "scanned".to_string(),
date: Local::now(),
title: file.file_name().into_string().unwrap(),
hash: format!("{:x}", hash),
blacklisted: false,
};
images.push(image_box);
trace!(
target: "w10s_webscraper",
"fn {} - Image {} read",
function_name!(),
&file.file_name().into_string().unwrap()
);
}
percent.update(function_name!());
}
images
}
#[named]
pub fn read_json(image_directory: &Path) -> Result<JsonValue, Error> {
let path = image_directory.join("hashes.json");
// Read the json from the file and return it
trace!(
target: "w10s_webscraper",
"fn {} - Reading json file, expect confirmation",
function_name!()
);
let mut file = File::open(path)?;
trace!(target: "w10s_webscraper", "fn {} - json file read", function_name!());
let mut buf = String::new();
file.read_to_string(&mut buf)?;
let json = json::parse(&buf).unwrap();
debug!(target: "w10s_webscraper", "fn {} - Loaded json file", function_name!());
Ok(json)
}
#[named]
pub fn write_json(path: &Path, json: JsonValue) {
let json = json.pretty(2);
let path = path.join("hashes.json");
// Create file, and overwrite it if it exists
let mut file = match File::create(path) {
Ok(file) => file,
Err(error) => panic!("{} - Error creating json file: {}", function_name!(), error),
};
match file.write_all(json.as_bytes()) {
Ok(_) => (),
Err(error) => panic!("{} - Error writing json file: {}", function_name!(), error),
};
debug!(target: "w10s_webscraper", "fn {} - Wrote json file", function_name!());
}

41
src/image_data.rs Normal file
View File

@@ -0,0 +1,41 @@
use chrono::{DateTime, Local};
/// Metadata for a single image, identified by its md5 `hash`.
// The hand-written `Clone` impl was field-by-field identical to the derive,
// so derive it instead.
#[derive(Clone)]
pub struct ImageBox {
    /// Source URL of the image ("scanned" for entries discovered on disk)
    pub url: String,
    /// Date associated with the image (scan time for scanned entries)
    pub date: DateTime<Local>,
    /// Image title; used as the file name on disk
    pub title: String,
    /// md5 hex digest of the image bytes — the identity used by `PartialEq`
    pub hash: String,
    /// Blacklisted entries stay in the database but are not downloaded again
    pub blacklisted: bool,
}

impl std::fmt::Display for ImageBox {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "{{\n\turl: {}\n\tdate: {}\n\ttitle: {}\n\thash: {}\n\tblacklisted: {}\n}}",
            self.url,
            self.date.date_naive(),
            self.title,
            self.hash,
            self.blacklisted
        )
    }
}

/// Equality is by content hash only — deliberately NOT derived, so records
/// with different urls/titles but identical bytes compare equal.
impl PartialEq for ImageBox {
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}

107
src/json_code.rs Normal file
View File

@@ -0,0 +1,107 @@
use std::path::PathBuf;
use crate::{image_data::ImageBox, named, Config};
use chrono::DateTime;
use json::{array, object, JsonValue};
use log::{debug, trace, warn};
/*
The json code is pretty much all just converting structs to json and back again.
*/
/// Serialize a [`Config`] into a flat json object.
///
/// Panics if the image directory path is not valid UTF-8.
#[named]
pub fn make_config_json(config: Config) -> JsonValue {
    debug!(target: "w10s_webscraper", "{} - Converting config data to json", function_name!());
    object! {
        "download": config.download,
        "url": config.url.to_string(),
        "scan": config.scan,
        "image_directory": config.image_directory.to_str().unwrap(),
        "test": config.test,
        "verbosity": config.verbosity,
    }
}
/// Deserialize a [`Config`] from the contents of config.json.
///
/// # Panics
/// Panics with a message naming the offending key when a required value is
/// missing or has the wrong type (previously a bare `unwrap` with no context).
#[named]
pub fn parse_config_json(json: JsonValue) -> Config {
    debug!(target: "w10s_webscraper", "{} - Parsing config json", function_name!());
    let config = Config {
        download: json["download"]
            .as_bool()
            .expect("config.json: \"download\" must be a bool"),
        url: json["url"].to_string(),
        scan: json["scan"]
            .as_bool()
            .expect("config.json: \"scan\" must be a bool"),
        image_directory: PathBuf::from(json["image_directory"].to_string()),
        test: json["test"]
            .as_bool()
            .expect("config.json: \"test\" must be a bool"),
        verbosity: json["verbosity"]
            .as_u8()
            .expect("config.json: \"verbosity\" must be a u8"),
    };
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    config
}
/// Convert image records into the json database layout: an "info" banner
/// plus an "images" array with one object per record.
#[named]
pub fn make_image_json(images: Vec<ImageBox>) -> JsonValue {
    trace!(target: "w10s_webscraper", "{} - Converting image data to json", function_name!());
    let mut json = object! {
        "info": r#"A file with "blacklist" = "true" means that the image entry will remain in the database, but will not be downloaded. This allows you to delete a photo and not download it again. Blacklisted images will keep their entries when the image is absent and you run a file scan."#,
        "images": array![]
    };
    for entry in images {
        json["images"]
            .push(object! {
                "hash": entry.hash,
                "date_added": entry.date.to_rfc2822(),
                "url": entry.url,
                "title": entry.title,
                "blacklisted": entry.blacklisted,
            })
            .unwrap();
    }
    trace!(target: "w10s_webscraper", "{} - Finished conversion", function_name!());
    json
}
/// Deserialize the "images" array of the json database into [`ImageBox`]s.
///
/// A malformed "date_added" falls back to the unix epoch (with a warning);
/// a missing or malformed "blacklisted" now defaults to `false` instead of
/// panicking (entries may predate that key).
#[named]
pub fn parse_image_json(json: JsonValue) -> Vec<ImageBox> {
    debug!(target: "w10s_webscraper", "{} - Parsing image json", function_name!());
    let mut images: Vec<ImageBox> = Vec::new();
    for image in json["images"].members() {
        let image_box = ImageBox {
            url: image["url"].to_string(),
            date: DateTime::from(
                match DateTime::parse_from_rfc2822(image["date_added"].to_string().as_str()) {
                    Ok(date) => date,
                    Err(error) => {
                        warn!(
                            target: "w10s_webscraper",
                            "{} - Error parsing date, defaulting to unix 0: {}",
                            function_name!(),
                            error
                        );
                        DateTime::parse_from_rfc2822("Thu, 01 Jan 1970 00:00:00 +0000").unwrap()
                    }
                },
            ),
            title: image["title"].to_string(),
            hash: image["hash"].to_string(),
            // Fixed: default to false rather than unwrap-panicking when absent.
            blacklisted: image["blacklisted"].as_bool().unwrap_or(false),
        };
        images.push(image_box);
    }
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    images
}

56
src/logging.rs Normal file
View File

@@ -0,0 +1,56 @@
use crate::CONFIG;
use log::LevelFilter;
use log4rs::append::console::ConsoleAppender;
use log4rs::append::rolling_file::policy::compound::roll::fixed_window::FixedWindowRoller;
use log4rs::append::rolling_file::policy::compound::trigger::size::SizeTrigger;
use log4rs::append::rolling_file::policy::compound::CompoundPolicy;
use log4rs::append::rolling_file::RollingFileAppender;
use log4rs::config::{Appender, Config, Logger, Root};
use log4rs::encode::pattern::PatternEncoder;
use log4rs::filter::threshold::ThresholdFilter;
//use crate::CONFIG;
/// Configure log4rs: a console appender filtered by `CONFIG.verbosity` plus a
/// size-rolling file appender under log/, both attached to the
/// "w10s_webscraper" logger. Returns the log4rs handle.
pub fn initialize_logging() -> log4rs::Handle {
    // Map the -v count to a console filter level; the file log always gets Trace.
    let console_level: LevelFilter = match CONFIG.verbosity {
        1 => LevelFilter::Debug,
        2 => LevelFilter::Trace,
        _ => LevelFilter::Info,
    };
    let console = ConsoleAppender::builder()
        .encoder(Box::new(PatternEncoder::new("{h({l})}: {m}{n}")))
        .build(); // This appender is filtered, but only later
    // Keep up to 50 rolled files, rolling when the active log reaches 50 KiB.
    let roller = FixedWindowRoller::builder()
        .build("log/my{}.log", 50)
        .unwrap();
    let roll_policy: CompoundPolicy =
        CompoundPolicy::new(Box::new(SizeTrigger::new(50 * 1024)), Box::new(roller));
    let file_appender = RollingFileAppender::builder()
        .encoder(Box::new(PatternEncoder::new(
            "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}",
        )))
        .build("log/my.log", Box::new(roll_policy))
        .unwrap();
    let config = Config::builder()
        .appender(
            Appender::builder()
                // Verbosity filtering happens here, on the console only.
                .filter(Box::new(ThresholdFilter::new(console_level)))
                .build("stdout", Box::new(console)),
        )
        .appender(Appender::builder().build("file_logger", Box::new(file_appender)))
        .logger(
            Logger::builder()
                // additive(false) avoids duplicated lines via the root's stdout appender.
                .additive(false)
                .appender("stdout")
                .appender("file_logger")
                .build("w10s_webscraper", LevelFilter::Trace),
        )
        .build(Root::builder().appender("stdout").build(LevelFilter::Warn))
        .unwrap();
    log4rs::init_config(config).unwrap()
}

82
src/main.rs Normal file
View File

@@ -0,0 +1,82 @@
use argparse::Config;
pub use function_name::named;
use json::JsonValue;
use lazy_static::lazy_static;
pub use log::{debug, error, info, trace, warn};
pub use log::{Level, LevelFilter};
mod download;
mod fileio;
mod image_data;
mod logging;
pub mod percentage;
mod scan;
pub use image_data::ImageBox;
pub mod argparse;
mod json_code;
mod web;
use json_code::make_image_json;
lazy_static! {
// Global configuration, resolved once on first access
// (reads config.json or the command line — see argparse::parse_args).
static ref CONFIG: Config = argparse::parse_args();
}
/// Entry point: load the json image database, optionally scan the image
/// directory and/or download new images per CONFIG, then write the merged
/// database back to disk.
#[named]
fn main() {
    logging::initialize_logging();
    info!(target: "w10s_webscraper", "{} - Starting", function_name!());
    trace!(target: "w10s_webscraper", "{} - Beginning of json loading", function_name!());
    // Load the json database of existing images; a missing file just means
    // this is the first run and one will be created on exit.
    let json = match fileio::read_json(&CONFIG.image_directory) {
        Ok(json) => json,
        Err(error) => {
            if error.kind() == std::io::ErrorKind::NotFound {
                warn!(target: "w10s_webscraper", "{} - Json file not found, will create one", function_name!());
                JsonValue::new_object()
            } else {
                error!(target: "w10s_webscraper", "{} - Error reading json file: {}", function_name!(), error);
                panic!("{} - Error reading json file: {}", function_name!(), error);
            }
        }
    };
    // Parse json into a vector of ImageBox structs
    let mut old_images: Vec<ImageBox> = json_code::parse_image_json(json);
    trace!(target: "w10s_webscraper", "{} - End of json loading", function_name!());
    // New images downloaded this run (stays empty when only scanning).
    let mut images: Vec<ImageBox> = Vec::new();
    // Fixed stale trace text: scan/download are independent booleans, not a
    // 0/1/2 mode as the old message claimed.
    trace!(target: "w10s_webscraper",
        "{} - CONFIG.scan={}, CONFIG.download={}",
        function_name!(),
        CONFIG.scan,
        CONFIG.download
    );
    // Scan first so a following download skips images already on disk.
    if CONFIG.scan {
        old_images = scan::scan(&mut old_images);
    }
    if CONFIG.download {
        images = download::download_images(&old_images);
    }
    trace!(target: "w10s_webscraper", "{} - Merging old and new image data", function_name!());
    images.append(&mut old_images);
    trace!(target: "w10s_webscraper", "{} - Writing json", function_name!());
    let json = make_image_json(images);
    trace!(target: "w10s_webscraper", "{} - Writing json to file", function_name!());
    fileio::write_json(&CONFIG.image_directory, json);
    info!(target: "w10s_webscraper", "{} - Finished", function_name!());
}
/// Rough human-readable estimate of how long processing `count` items will
/// take, assuming ~5.1 seconds per item (requests are throttled).
///
/// Returns minutes, switching to hours once the estimate exceeds two hours.
/// Values are formatted to one decimal place — the old code printed full
/// float precision (e.g. "2.8333333333333335 hours") and used `60 as f64`.
pub fn est_time(count: usize) -> String {
    let minutes = (count as f64 * 5.1) / 60.0;
    if minutes > 120.0 {
        format!("{:.1} hours", minutes / 60.0)
    } else {
        format!("{:.1} minutes", minutes)
    }
}

69
src/percentage.rs Normal file
View File

@@ -0,0 +1,69 @@
use function_name::named;
use log::{debug, info};
pub struct Percentage {
/*
This struct is used to track the percentage of a task that has been completed.
It is used to print progress to the console.
*/
// Completion fraction at which the next progress line is printed
threshold: f32,
// Amount the threshold advances after each printed line
step_size: f32,
// Total number of items in the task
total: usize,
// Number of items completed so far
count: usize,
// Completion fraction exposed via get_percent (0.0..=1.0)
percent: f32,
}
impl Percentage {
    /// Create a tracker for `total` items, choosing a print step/threshold
    /// appropriate to the task size (finer steps for larger tasks).
    #[named]
    pub fn new(total: usize) -> Percentage {
        let mut step = 0.1;
        let mut threshold = 0.1;
        if total == 0 {
            debug!(
                "fn {} - Percentage::new() called with total = 0",
                function_name!()
            );
        }
        // Fewer than 10 items: one step per item. The `total > 0` guard is new —
        // `1.0 / 0` previously produced an infinite step size.
        if total > 0 && total < 10 {
            step = 1.0 / total as f32;
        }
        // More than 100 items: 5% steps.
        if total > 100 {
            step = 0.05;
            threshold = 0.05;
        }
        // More than 1000 items: 1% steps.
        if total > 1000 {
            step = 0.01;
            threshold = 0.01;
        }
        Percentage {
            threshold,
            step_size: step,
            total,
            count: 0,
            percent: 0.0,
        }
    }
    /// Last completion fraction recorded by `update` (0.0..=1.0).
    pub fn get_percent(&self) -> f32 {
        self.percent
    }
    /// Total number of items this tracker was created with.
    pub fn get_total(&self) -> usize {
        self.total
    }
    /// Record one completed item; print progress when the threshold is crossed.
    pub fn update(&mut self, fn_name: &str) {
        if self.total == 0 {
            info!(target: "w10s_webscraper", "fn {} - Percentage done: zero items", fn_name);
            // New early return: avoid the division by zero below.
            return;
        }
        // Update the progress
        self.count += 1;
        let percent: f32 = self.count as f32 / self.total as f32;
        // Fixed: store the computed fraction so get_percent() reflects actual
        // progress — the field was previously never written after construction.
        self.percent = percent;
        // Print and advance the threshold when it is crossed.
        if percent >= self.threshold {
            info!(target: "w10s_webscraper", "fn {} - {:.0}%", fn_name, percent * 100.0);
            self.threshold += self.step_size;
        }
    }
}

53
src/scan.rs Normal file
View File

@@ -0,0 +1,53 @@
use crate::{named, CONFIG, ImageBox, fileio};
use log::info;
/// Reconcile the json database with the files actually on disk.
///
/// Database entries whose image is missing from the directory are dropped
/// (unless blacklisted), and images found on disk but absent from the
/// database are added. Returns the reconciled list.
#[named]
pub fn scan(old_images: &mut Vec<ImageBox>) -> Vec<ImageBox> {
    info!(target: "w10s_webscraper", "fn {} - Scanning directory for existing images", function_name!());
    let scanned_images: Vec<ImageBox> = fileio::read_images(&CONFIG.image_directory);
    if scanned_images.is_empty() {
        info!(target: "w10s_webscraper", "fn {} - No images found in directory, stopping scan", function_name!());
        return old_images.to_vec();
    }
    // Hash sets turn the old O(n*m) nested scans into O(n + m) lookups, and
    // `retain` replaces the collect-indices-then-remove loop (O(n^2) removals).
    let scanned_hashes: std::collections::HashSet<&str> = scanned_images
        .iter()
        .map(|image| image.hash.as_str())
        .collect();
    // Drop entries whose file is gone, keeping blacklisted ones.
    old_images.retain(|image| image.blacklisted || scanned_hashes.contains(image.hash.as_str()));
    info!(target: "w10s_webscraper", "fn {} - Purged absent images from database", function_name!());
    // Add images present on disk but missing from the database. The set is
    // updated as we push so duplicate hashes within the scan are added once,
    // matching the old behavior.
    let mut known_hashes: std::collections::HashSet<String> = old_images
        .iter()
        .map(|image| image.hash.clone())
        .collect();
    for image in &scanned_images {
        if known_hashes.insert(image.hash.clone()) {
            old_images.push(image.clone());
        }
    }
    info!(target: "w10s_webscraper", "fn {} - Added new images to database", function_name!());
    old_images.to_vec()
}

84
src/web.rs Normal file
View File

@@ -0,0 +1,84 @@
use crate::ImageBox;
use crate::CONFIG;
pub mod fetch;
pub mod html;
use crate::percentage::Percentage;
use crate::{est_time, named};
use log::{error, info};
/// Collect links to individual image pages, starting from the site's front
/// page and walking the numbered pages. Stops early after five consecutive
/// pages yield no new links.
#[named]
pub fn get_page_links(url: &str, old_images: &Vec<ImageBox>) -> Vec<String> {
    info!(target: "w10s_webscraper", "fn {} - Collecting page links", function_name!());
    let html = fetch::fetch_html(url).unwrap_or_else(|error| {
        panic!("Problem fetching primary page: {}", error);
    });
    let mut page_links: Vec<String> = html::extract_image_page_links(&html, &old_images);
    // In test mode only the first page is used.
    if !CONFIG.test {
        let page_count = html::extract_page_count(html);
        let mut percent = Percentage::new(page_count as usize);
        info!(target: "w10s_webscraper", "fn {} - Scanning {} pages for links", function_name!(), page_count);
        info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(page_count as usize));
        let mut pages_without_new_links = 0;
        // Fixed off-by-one: the exclusive range `2..page_count` silently
        // skipped the final page.
        for i in 2..=page_count {
            let url = format!("{}page/{}/", url, i);
            let html = match fetch::fetch_html(&url) {
                Ok(html) => html,
                Err(error) => {
                    error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                    continue;
                }
            };
            let mut new_links: Vec<String> =
                html::extract_image_page_links(&html, &old_images).to_vec();
            // Track consecutive pages that produced no new links.
            if new_links.is_empty() {
                pages_without_new_links += 1;
            } else {
                pages_without_new_links = 0;
            }
            // Fixed: trigger after five empty pages as the message says
            // (the old `> 5` test required six).
            if pages_without_new_links >= 5 {
                info!(target: "w10s_webscraper", "fn {} - No new images found for five pages, stopping", function_name!());
                break;
            }
            page_links.append(&mut new_links);
            percent.update(function_name!());
        }
    }
    page_links
}
/// Fetch each image page and extract the image's url, title, date and hash,
/// returning one record per page that could be fetched. Pages that fail to
/// fetch are logged and skipped.
#[named]
pub fn get_image_data(urls: Vec<String>) -> Vec<ImageBox> {
    let mut progress = Percentage::new(urls.len());
    info!(target: "w10s_webscraper", "fn {} - Collecting data on {} images", function_name!(), urls.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(urls.len()));
    let mut images: Vec<ImageBox> = Vec::new();
    for url in urls {
        let html = match fetch::fetch_html(&url) {
            Ok(html) => html,
            Err(error) => {
                error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                continue;
            }
        };
        let image_url = html::extract_image_url(&html);
        let image_title = html::extract_image_title(&html);
        let image_date = html::extract_image_date(&html);
        // The last path segment of the page url is used as the image's hash.
        let image_hash = url.split('/').last().unwrap().to_string();
        images.push(ImageBox {
            url: image_url,
            date: image_date,
            title: image_title,
            hash: image_hash,
            blacklisted: false,
        });
        progress.update(function_name!());
    }
    images
}

74
src/web/fetch.rs Normal file
View File

@@ -0,0 +1,74 @@
use crate::named;
use lazy_static::lazy_static;
use log::{debug, trace};
use reqwest::Error;
use scraper::Html;
use std::sync::Mutex;
use std::time::Instant;
/*
do_throttled_request is heavily inspired by https://github.com/gregstoll/rust-scraping, but I've made a lot of changes
*/
lazy_static! {
// Timestamp of the most recent outbound request, shared across calls.
static ref LAST_REQUEST_MUTEX: Mutex<Option<Instant>> = Mutex::new(None);
// Minimum spacing enforced between consecutive requests.
static ref REQUEST_DELAY: std::time::Duration = std::time::Duration::from_millis(500);
}
/// Perform a blocking GET request, enforcing at least `REQUEST_DELAY` between
/// consecutive requests and retrying up to five more times on failure.
///
/// Heavily inspired by https://github.com/gregstoll/rust-scraping, with changes.
pub fn do_throttled_request(url: &str) -> Result<reqwest::blocking::Response, Error> {
    // Sleep just long enough that REQUEST_DELAY has elapsed since the last request.
    fn delay() {
        let mut last_request_mutex = LAST_REQUEST_MUTEX.lock().unwrap();
        if let Some(last_request) = last_request_mutex.take() {
            // Fixed: measure elapsed time from *now*. The old code computed
            // `last_request.duration_since(last_request)` — always zero — so
            // it slept the full delay on every request regardless of idle time.
            let elapsed = Instant::now().duration_since(last_request);
            if elapsed < *REQUEST_DELAY {
                std::thread::sleep(*REQUEST_DELAY - elapsed);
            }
        }
    }
    // First attempt.
    delay();
    let mut resp = reqwest::blocking::get(url);
    LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    // Up to five retries; the final result (ok or err) is simply returned,
    // which makes the old `if i == 4 { return resp }` escape hatch unnecessary.
    for _ in 0..5 {
        if resp.is_ok() {
            break;
        }
        delay();
        resp = reqwest::blocking::get(url);
        LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    }
    resp
}
/// Fetch `url` (throttled) and parse the response body as an HTML document.
///
/// # Errors
/// Returns the reqwest error when the request or the body decoding fails;
/// body decode failures were previously an `unwrap` panic even though the
/// function already returns `Result<_, reqwest::Error>`.
#[named]
pub fn fetch_html(url: &str) -> Result<scraper::Html, Error> {
    trace!(target: "w10s_webscraper", "fn {} - Fetching HTML from {}", function_name!(), url);
    let resp = match do_throttled_request(url) {
        Ok(resp) => resp,
        Err(e) => {
            debug!(target: "w10s_webscraper", "fn {} - Error fetching HTML from {}", function_name!(), url);
            return Err(e);
        }
    };
    // Propagate body decode errors instead of panicking.
    let body = resp.text()?;
    Ok(Html::parse_document(&body))
}
/// Download `url` (throttled) and return the raw response bytes.
pub fn fetch_image_bytes(url: &str) -> Result<Vec<u8>, reqwest::Error> {
    Ok(do_throttled_request(url)?.bytes()?.to_vec())
}

143
src/web/html.rs Normal file
View File

@@ -0,0 +1,143 @@
use chrono::{DateTime, Local};
use log::{debug, error, warn};
use scraper::{Html, Selector};
use crate::{named, ImageBox};
/// Find the highest page number among the pagination links, or 0 when none.
///
/// Links whose href is missing or whose last path segment is not numeric
/// (e.g. a trailing slash yielding an empty segment) are now skipped;
/// previously these were `unwrap`s that could panic on unexpected markup.
#[named]
pub fn extract_page_count(html: Html) -> i32 {
    let mut page_count: i32 = 0;
    // Select only the links that are page numbers.
    let selector = Selector::parse(r#"a[class="page-numbers"]"#).unwrap();
    for link in html.select(&selector) {
        let href = match link.value().attr("href") {
            Some(href) => href,
            None => continue,
        };
        // The last path segment of the href is the page number; keep the max.
        if let Some(last) = href.split('/').last() {
            if let Ok(number) = last.parse::<i32>() {
                page_count = page_count.max(number);
            }
        }
    }
    debug!(target: "w10s_webscraper", "fn {} - Extracted page count: {}", function_name!(), page_count);
    page_count
}
/// Collect hrefs that point at image pages, skipping comment anchors,
/// duplicates, and pages whose hash already appears in `old_images`.
#[named]
pub fn extract_image_page_links(html: &Html, old_images: &Vec<ImageBox>) -> Vec<String> {
    let mut page_links: Vec<String> = Vec::new();
    let selector = Selector::parse(r#"a[href]"#).unwrap();
    for link in html.select(&selector) {
        // The `a[href]` selector guarantees the attribute exists.
        let href: String = link.value().attr("href").unwrap().to_string();
        // The hash is the last path segment of the link.
        let hash: String = href.split('/').last().unwrap().to_string();
        // Idiom cleanup: `.any()` instead of find()+`== None`, `!x` instead
        // of `x == false`. Keep image links that are not comment anchors,
        // not duplicates, and not already known.
        let already_known = old_images.iter().any(|image| image.hash == hash);
        if href.contains("/images/")
            && !href.contains("#respond")
            && !href.contains("#comments")
            && !page_links.contains(&href)
            && !already_known
        {
            page_links.push(href);
        }
    }
    debug!(target: "w10s_webscraper", "fn {} - Extracted {} links: {:?}", function_name!(), page_links.len(), page_links);
    page_links
}
/// Derive the full-resolution jpg url from the page's 1024x576 preview
/// <img> tag, or return an empty string when no matching tag is found.
/// When several tags match, the last match wins (as before).
pub fn extract_image_url(html: &Html) -> String {
    // Selector matching every <img> tag in the document.
    let img_selector = Selector::parse("img").unwrap();
    let mut output = String::new();
    for tag in html.select(&img_selector) {
        let src = tag.value().attr("src").unwrap();
        // Only the 1024x576 preview under wp-content/uploads is of interest.
        if src.contains("jpg") && src.contains("wp-content/uploads/") && src.contains("1024x576") {
            // Rebuild the url from the first two '-'-separated parts plus
            // ".jpg"; the retained '-' is the one inside "wp-content".
            let parts = src.split('-').collect::<Vec<&str>>();
            output = format!("{}-{}.jpg", parts[0], parts[1]);
        }
    }
    output
}
/// Extract the date from the page's `<span class="date">` element, normalized
/// to local time at 12:00:00 -0500.
///
/// Falls back to `Local::now()` (with an error log) when no date element is
/// present or the text cannot be parsed — previously a page with no date
/// panicked on the `dates[0]` index.
#[named]
pub fn extract_image_date(html: &Html) -> DateTime<Local> {
    let selector = Selector::parse(r#"span[class="date"]"#).unwrap();
    let mut dates: Vec<String> = Vec::new();
    for date in html.select(&selector) {
        // Take the element's first text node, skipping empty elements.
        if let Some(text) = date.text().next() {
            dates.push(text.to_string())
        }
    }
    if dates.len() > 1 {
        warn!(target: "w10s_webscraper", "{} - More than one date found on page", function_name!());
    }
    let first = match dates.first() {
        Some(first) => first,
        None => {
            error!(
                target: "w10s_webscraper",
                "{} - No date found on page, using local now",
                function_name!()
            );
            return Local::now();
        }
    };
    // The html date is just "YYYY-MM-DD"; add a time and timezone so it can
    // be parsed into a DateTime object.
    let mut datetime: String = first.to_string();
    datetime.push_str(" 12:00:00 -0500");
    match DateTime::parse_from_str(&datetime, "%Y-%m-%d %H:%M:%S %z") {
        Ok(datetime) => datetime.with_timezone(&Local),
        Err(e) => {
            error!(
                target: "w10s_webscraper",
                "{} - Error parsing date, using local now: {}",
                function_name!(),
                e
            );
            Local::now()
        }
    }
}
#[named]
pub fn extract_image_title(html: &Html) -> String {
let selector = Selector::parse(r#"title"#).unwrap();
let titles = html.select(&selector);
let mut output: Vec<String> = Vec::new();
for title in titles {
let title = title.text().collect::<Vec<_>>();
output.push(title[0].to_string())
}
if output.len() > 1 {
warn!(
target: "w10s_webscraper",
"{} - More than one title found. Using the first one ({})",
function_name!(),
output[0]
);
}
output[0]
.split(" | ")
.collect::<Vec<&str>>()
.first()
.unwrap()
.to_string()
}