Initial commit

This commit is contained in:
NSawyer
2024-02-08 12:53:40 -05:00
commit d724975359
21 changed files with 1090 additions and 0 deletions

108
src/argparse.rs Normal file
View File

@@ -0,0 +1,108 @@
use crate::{json_code, named};
use clap::Parser;
use std::path::PathBuf;
/// Command-line interface, parsed by clap.
///
/// Field doc comments double as `--help` descriptions; the previous plain
/// `//` comments never reached the generated help output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Cli {
    /// Directory in which images are stored
    #[arg(short, long, value_name = "FILE")]
    pub folder: Option<PathBuf>,
    /// Base URL of the site to scrape
    #[arg(short, long, value_name = "URL")]
    pub url: Option<String>,
    /// Scan the image directory and reconcile it with the json database
    #[arg(short, long)]
    pub scan: bool,
    /// Turn debugging information on (-v: debug, -vv: trace)
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbosity: u8,
    /// Download new images
    #[arg(short, long)]
    pub download: bool,
    /// Turn testing mode on (only the site's first page is scanned)
    #[arg(short, long)]
    pub test: bool,
    /// Write the resolved settings to config.json and exit
    #[arg(short, long)]
    pub create_config_file: bool,
}
/// Runtime configuration, resolved either from config.json (when present)
/// or from the command line — see `parse_args`.
#[derive(Debug)]
pub struct Config {
// Console log verbosity: 0 = info, 1 = debug, 2 = trace (see logging.rs)
pub verbosity: u8,
// Base URL of the site to scrape
pub url: String,
// Whether to scan the image directory against the json database
pub scan: bool,
// Directory where images are stored (defaults to "images")
pub image_directory: PathBuf,
// Whether to download new images
pub download: bool,
// Testing mode flag; presumably limits work to the first page — see web.rs
pub test: bool,
}
/// Resolve the runtime [`Config`].
///
/// If `config.json` exists in the working directory it is the sole source of
/// configuration (command-line arguments are ignored, not merged). Otherwise
/// the command line is parsed.
///
/// Exits the process when config.json is malformed, when neither --scan nor
/// --download is given, or after writing config.json for --create-config-file.
#[named]
pub fn parse_args() -> Config {
    // A present config file takes full precedence over the command line.
    if let Ok(json) = std::fs::read_to_string("config.json") {
        let json: json::JsonValue = match json::parse(&json) {
            Ok(json) => json,
            Err(e) => {
                println!("fn {} - Error parsing config.json: {}", function_name!(), e);
                std::process::exit(1);
            }
        };
        return json_code::parse_config_json(json);
    }
    // Parse command line arguments
    let cli = Cli::parse();
    // At least one action is required.
    if !cli.scan && !cli.download {
        println!(
            "fn {} - You must specify either --scan, --download, or --help",
            function_name!()
        );
        std::process::exit(1);
    }
    let mut config = Config {
        download: cli.download,
        url: String::new(),
        scan: cli.scan,
        image_directory: PathBuf::from("images"),
        test: cli.test,
        // Fixed: the -v count was parsed but never copied into the config,
        // so command-line verbosity previously had no effect.
        verbosity: cli.verbosity,
    };
    if let Some(path) = cli.folder.as_deref() {
        config.image_directory = path.to_path_buf();
    }
    if let Some(url) = cli.url.as_deref() {
        config.url = url.to_string();
    }
    // If the image directory is the default, create it when it doesn't exist.
    if config.image_directory == PathBuf::from("images") && !config.image_directory.exists() {
        std::fs::create_dir(&config.image_directory)
            .expect("could not create default image directory");
    }
    // --create-config-file: persist the resolved settings and exit.
    if cli.create_config_file {
        let json = json_code::make_config_json(config);
        std::fs::write("config.json", json.pretty(2)).unwrap();
        println!("fn {} - Created config file", function_name!());
        std::process::exit(0);
    }
    config
}

36
src/download.rs Normal file
View File

@@ -0,0 +1,36 @@
use crate::{est_time, fileio, named, percentage, web, ImageBox, CONFIG};
use log::{error, info, warn};
#[named]
pub fn download_images(old_images: &Vec<ImageBox>) -> Vec<ImageBox> {
info!(target: "w10s_webscraper", "fn {} - Downloading images", function_name!());
// Get the page links from the website
let page_links = web::get_page_links(&CONFIG.url, &old_images);
// Get the image data from each page of the website, keeping data on only the new images
let images: Vec<ImageBox> = web::get_image_data(page_links);
let mut percent = percentage::Percentage::new(images.len() as usize);
info!(target: "w10s_webscraper", "fn {} - Downloading and writing {} images", function_name!(), images.len());
info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(images.len()));
// Download and write each new image
for image in &images {
match fileio::write_image(&image, &CONFIG.image_directory) {
Ok(_) => {}
Err(error) => {
match error.kind() {
std::io::ErrorKind::AlreadyExists => {
warn!(target: "w10s_webscraper", "fn {} - Image already exists, skipping: {}", function_name!(), image.hash);
}
std::io::ErrorKind::Other => {
warn!(target: "w10s_webscraper", "fn {} - Error fetching image bytes, skipping: {}", function_name!(), error);
}
_ => {
error!(target: "w10s_webscraper", "fn {} - Error writing image, skipping: {}", function_name!(), error);
} // No need "continue", as the image is not written
}
}
};
percent.update(function_name!());
}
images
}

137
src/fileio.rs Normal file
View File

@@ -0,0 +1,137 @@
use chrono::Local;
use filetime_creation::{set_file_ctime, FileTime};
use json::JsonValue;
use log::{debug, error, info, trace};
use md5;
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::Error;
use std::path::Path;
use crate::named;
use crate::percentage::Percentage;
use crate::web::fetch::fetch_image_bytes;
use crate::ImageBox;
/// Download the bytes for `image` and write them to
/// `<image_directory>/<title>.jpg`, then back-date the file's creation time
/// to the image's date.
///
/// # Errors
/// Returns an error when the bytes cannot be fetched (mapped to
/// `ErrorKind::Other`), when the file cannot be created or written, or when
/// the creation time cannot be set.
#[named]
pub fn write_image(image: &ImageBox, image_directory: &Path) -> Result<(), Error> {
    // Append ".jpg" explicitly. The previous `with_extension(".jpg")` produced
    // a doubled dot ("title..jpg") and clobbered anything after a '.' that was
    // already in the title.
    let image_path = image_directory.join(format!("{}.jpg", image.title));
    // Fetch the bytes *before* creating the file so a failed fetch does not
    // leave an empty file behind.
    let content = match fetch_image_bytes(&image.url) {
        Ok(content) => content,
        Err(error) => {
            return Err(Error::new(
                std::io::ErrorKind::Other,
                format!("Error fetching image bytes from {}: {}", &image.url, error),
            ));
        }
    };
    let mut out = File::create(&image_path)?;
    out.write_all(&content)?;
    // Set the creation date of the file to the date of the image.
    let image_time = FileTime::from_unix_time(image.date.timestamp(), 0);
    set_file_ctime(&image_path, image_time)?;
    trace!(target: "w10s_webscraper", "{} - Image {} written", function_name!(), image.title);
    Ok(())
}
/// Scan `image_directory` and build an [`ImageBox`] for every ".jpg" file,
/// hashing each file's bytes with md5. Non-jpg files are skipped; unreadable
/// directory entries are logged and skipped.
///
/// NOTE(review): the `std::fs::read(...).unwrap()` below will panic if a file
/// disappears between listing and hashing — confirm that's acceptable.
#[named]
pub fn read_images(image_directory: &Path) -> Vec<ImageBox> {
// reads the image directory and returns a vector of ImageBox structs with hashes from the actual images
let mut images: Vec<ImageBox> = Vec::new();
// Iterate over the files in the image directory
// Panics (after logging) if the directory itself cannot be read.
let files = fs::read_dir(&image_directory)
.unwrap_or_else(|error| {
error!(target: "w10s_webscraper", "fn {} - Error reading image directory: {}", function_name!(), error);
panic!(
"{} - Error reading image directory: {}",
function_name!(),
error
)
})
.collect::<Vec<_>>();
let mut percent = Percentage::new(files.len());
info!(target: "w10s_webscraper", "fn {} - Reading {} files", function_name!(), files.len());
for file in files {
let file = match file {
Ok(file) => file,
Err(error) => {
error!(target: "w10s_webscraper", "fn {} - Error reading file, skipping: {}", function_name!(), error);
continue;
}
};
// If the file is a jpg, read the hash from the file and add it to the vector
let jpg = match file.path().extension() {
Some(str) => {
if str == "jpg" {
true
} else {
trace!(target: "w10s_webscraper", "fn {} - File is not a jpg, skipping: {}", function_name!(), file.path().display());
false
}
}
_ => {
trace!(target: "w10s_webscraper", "fn {} - File has no extension?, skipping: {}", function_name!(), file.path().display());
false
}
};
if jpg {
let hash = md5::compute(std::fs::read(file.path()).unwrap());
// "scanned" marks entries discovered on disk rather than downloaded;
// the date recorded is the scan time, not the image's original date.
let image_box = ImageBox {
url: "scanned".to_string(),
date: Local::now(),
title: file.file_name().into_string().unwrap(),
hash: format!("{:x}", hash),
blacklisted: false,
};
images.push(image_box);
trace!(
target: "w10s_webscraper",
"fn {} - Image {} read",
function_name!(),
&file.file_name().into_string().unwrap()
);
}
percent.update(function_name!());
}
images
}
#[named]
pub fn read_json(image_directory: &Path) -> Result<JsonValue, Error> {
let path = image_directory.join("hashes.json");
// Read the json from the file and return it
trace!(
target: "w10s_webscraper",
"fn {} - Reading json file, expect confirmation",
function_name!()
);
let mut file = File::open(path)?;
trace!(target: "w10s_webscraper", "fn {} - json file read", function_name!());
let mut buf = String::new();
file.read_to_string(&mut buf)?;
let json = json::parse(&buf).unwrap();
debug!(target: "w10s_webscraper", "fn {} - Loaded json file", function_name!());
Ok(json)
}
#[named]
pub fn write_json(path: &Path, json: JsonValue) {
let json = json.pretty(2);
let path = path.join("hashes.json");
// Create file, and overwrite it if it exists
let mut file = match File::create(path) {
Ok(file) => file,
Err(error) => panic!("{} - Error creating json file: {}", function_name!(), error),
};
match file.write_all(json.as_bytes()) {
Ok(_) => (),
Err(error) => panic!("{} - Error writing json file: {}", function_name!(), error),
};
debug!(target: "w10s_webscraper", "fn {} - Wrote json file", function_name!());
}

41
src/image_data.rs Normal file
View File

@@ -0,0 +1,41 @@
use chrono::{DateTime, Local};
/// Metadata for a single image, identified by its md5 `hash`.
// The hand-written `Clone` impl was field-by-field identical to the derive,
// so derive it instead.
#[derive(Clone)]
pub struct ImageBox {
    /// Source URL of the image ("scanned" for entries discovered on disk)
    pub url: String,
    /// Date associated with the image (scan time for scanned entries)
    pub date: DateTime<Local>,
    /// Image title; used as the file name on disk
    pub title: String,
    /// md5 hex digest of the image bytes — the identity used by `PartialEq`
    pub hash: String,
    /// Blacklisted entries stay in the database but are not downloaded again
    pub blacklisted: bool,
}

impl std::fmt::Display for ImageBox {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "{{\n\turl: {}\n\tdate: {}\n\ttitle: {}\n\thash: {}\n\tblacklisted: {}\n}}",
            self.url,
            self.date.date_naive(),
            self.title,
            self.hash,
            self.blacklisted
        )
    }
}

/// Equality is by content hash only — deliberately NOT derived, so records
/// with different urls/titles but identical bytes compare equal.
impl PartialEq for ImageBox {
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}

107
src/json_code.rs Normal file
View File

@@ -0,0 +1,107 @@
use std::path::PathBuf;
use crate::{image_data::ImageBox, named, Config};
use chrono::DateTime;
use json::{array, object, JsonValue};
use log::{debug, trace, warn};
/*
The json code is pretty much all just converting structs to json and back again.
*/
/// Serialize a [`Config`] into a flat json object.
///
/// Panics if the image directory path is not valid UTF-8.
#[named]
pub fn make_config_json(config: Config) -> JsonValue {
    debug!(target: "w10s_webscraper", "{} - Converting config data to json", function_name!());
    object! {
        "download": config.download,
        "url": config.url.to_string(),
        "scan": config.scan,
        "image_directory": config.image_directory.to_str().unwrap(),
        "test": config.test,
        "verbosity": config.verbosity,
    }
}
/// Deserialize a [`Config`] from the contents of config.json.
///
/// # Panics
/// Panics with a message naming the offending key when a required value is
/// missing or has the wrong type (previously a bare `unwrap` with no context).
#[named]
pub fn parse_config_json(json: JsonValue) -> Config {
    debug!(target: "w10s_webscraper", "{} - Parsing config json", function_name!());
    let config = Config {
        download: json["download"]
            .as_bool()
            .expect("config.json: \"download\" must be a bool"),
        url: json["url"].to_string(),
        scan: json["scan"]
            .as_bool()
            .expect("config.json: \"scan\" must be a bool"),
        image_directory: PathBuf::from(json["image_directory"].to_string()),
        test: json["test"]
            .as_bool()
            .expect("config.json: \"test\" must be a bool"),
        verbosity: json["verbosity"]
            .as_u8()
            .expect("config.json: \"verbosity\" must be a u8"),
    };
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    config
}
/// Convert image records into the json database layout: an "info" banner
/// plus an "images" array with one object per record.
#[named]
pub fn make_image_json(images: Vec<ImageBox>) -> JsonValue {
    trace!(target: "w10s_webscraper", "{} - Converting image data to json", function_name!());
    let mut json = object! {
        "info": r#"A file with "blacklist" = "true" means that the image entry will remain in the database, but will not be downloaded. This allows you to delete a photo and not download it again. Blacklisted images will keep their entries when the image is absent and you run a file scan."#,
        "images": array![]
    };
    for entry in images {
        json["images"]
            .push(object! {
                "hash": entry.hash,
                "date_added": entry.date.to_rfc2822(),
                "url": entry.url,
                "title": entry.title,
                "blacklisted": entry.blacklisted,
            })
            .unwrap();
    }
    trace!(target: "w10s_webscraper", "{} - Finished conversion", function_name!());
    json
}
/// Deserialize the "images" array of the json database into [`ImageBox`]s.
///
/// A malformed "date_added" falls back to the unix epoch (with a warning);
/// a missing or malformed "blacklisted" now defaults to `false` instead of
/// panicking (entries may predate that key).
#[named]
pub fn parse_image_json(json: JsonValue) -> Vec<ImageBox> {
    debug!(target: "w10s_webscraper", "{} - Parsing image json", function_name!());
    let mut images: Vec<ImageBox> = Vec::new();
    for image in json["images"].members() {
        let image_box = ImageBox {
            url: image["url"].to_string(),
            date: DateTime::from(
                match DateTime::parse_from_rfc2822(image["date_added"].to_string().as_str()) {
                    Ok(date) => date,
                    Err(error) => {
                        warn!(
                            target: "w10s_webscraper",
                            "{} - Error parsing date, defaulting to unix 0: {}",
                            function_name!(),
                            error
                        );
                        DateTime::parse_from_rfc2822("Thu, 01 Jan 1970 00:00:00 +0000").unwrap()
                    }
                },
            ),
            title: image["title"].to_string(),
            hash: image["hash"].to_string(),
            // Fixed: default to false rather than unwrap-panicking when absent.
            blacklisted: image["blacklisted"].as_bool().unwrap_or(false),
        };
        images.push(image_box);
    }
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    images
}

56
src/logging.rs Normal file
View File

@@ -0,0 +1,56 @@
use crate::CONFIG;
use log::LevelFilter;
use log4rs::append::console::ConsoleAppender;
use log4rs::append::rolling_file::policy::compound::roll::fixed_window::FixedWindowRoller;
use log4rs::append::rolling_file::policy::compound::trigger::size::SizeTrigger;
use log4rs::append::rolling_file::policy::compound::CompoundPolicy;
use log4rs::append::rolling_file::RollingFileAppender;
use log4rs::config::{Appender, Config, Logger, Root};
use log4rs::encode::pattern::PatternEncoder;
use log4rs::filter::threshold::ThresholdFilter;
//use crate::CONFIG;
/// Configure log4rs: a console appender filtered by `CONFIG.verbosity` plus a
/// size-rolling file appender under log/, both attached to the
/// "w10s_webscraper" logger. Returns the log4rs handle.
pub fn initialize_logging() -> log4rs::Handle {
    // Map the -v count to a console filter level; the file log always gets Trace.
    let console_level: LevelFilter = match CONFIG.verbosity {
        1 => LevelFilter::Debug,
        2 => LevelFilter::Trace,
        _ => LevelFilter::Info,
    };
    let console = ConsoleAppender::builder()
        .encoder(Box::new(PatternEncoder::new("{h({l})}: {m}{n}")))
        .build(); // This appender is filtered, but only later
    // Keep up to 50 rolled files, rolling when the active log reaches 50 KiB.
    let roller = FixedWindowRoller::builder()
        .build("log/my{}.log", 50)
        .unwrap();
    let roll_policy: CompoundPolicy =
        CompoundPolicy::new(Box::new(SizeTrigger::new(50 * 1024)), Box::new(roller));
    let file_appender = RollingFileAppender::builder()
        .encoder(Box::new(PatternEncoder::new(
            "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}",
        )))
        .build("log/my.log", Box::new(roll_policy))
        .unwrap();
    let config = Config::builder()
        .appender(
            Appender::builder()
                // Verbosity filtering happens here, on the console only.
                .filter(Box::new(ThresholdFilter::new(console_level)))
                .build("stdout", Box::new(console)),
        )
        .appender(Appender::builder().build("file_logger", Box::new(file_appender)))
        .logger(
            Logger::builder()
                // additive(false) avoids duplicated lines via the root's stdout appender.
                .additive(false)
                .appender("stdout")
                .appender("file_logger")
                .build("w10s_webscraper", LevelFilter::Trace),
        )
        .build(Root::builder().appender("stdout").build(LevelFilter::Warn))
        .unwrap();
    log4rs::init_config(config).unwrap()
}

82
src/main.rs Normal file
View File

@@ -0,0 +1,82 @@
use argparse::Config;
pub use function_name::named;
use json::JsonValue;
use lazy_static::lazy_static;
pub use log::{debug, error, info, trace, warn};
pub use log::{Level, LevelFilter};
mod download;
mod fileio;
mod image_data;
mod logging;
pub mod percentage;
mod scan;
pub use image_data::ImageBox;
pub mod argparse;
mod json_code;
mod web;
use json_code::make_image_json;
lazy_static! {
// Global configuration, resolved once on first access
// (reads config.json or the command line — see argparse::parse_args).
static ref CONFIG: Config = argparse::parse_args();
}
/// Entry point: load the json image database, optionally scan the image
/// directory and/or download new images per CONFIG, then write the merged
/// database back to disk.
#[named]
fn main() {
    logging::initialize_logging();
    info!(target: "w10s_webscraper", "{} - Starting", function_name!());
    trace!(target: "w10s_webscraper", "{} - Beginning of json loading", function_name!());
    // Load the json database of existing images; a missing file just means
    // this is the first run and one will be created on exit.
    let json = match fileio::read_json(&CONFIG.image_directory) {
        Ok(json) => json,
        Err(error) => {
            if error.kind() == std::io::ErrorKind::NotFound {
                warn!(target: "w10s_webscraper", "{} - Json file not found, will create one", function_name!());
                JsonValue::new_object()
            } else {
                error!(target: "w10s_webscraper", "{} - Error reading json file: {}", function_name!(), error);
                panic!("{} - Error reading json file: {}", function_name!(), error);
            }
        }
    };
    // Parse json into a vector of ImageBox structs
    let mut old_images: Vec<ImageBox> = json_code::parse_image_json(json);
    trace!(target: "w10s_webscraper", "{} - End of json loading", function_name!());
    // New images downloaded this run (stays empty when only scanning).
    let mut images: Vec<ImageBox> = Vec::new();
    // Fixed stale trace text: scan/download are independent booleans, not a
    // 0/1/2 mode as the old message claimed.
    trace!(target: "w10s_webscraper",
        "{} - CONFIG.scan={}, CONFIG.download={}",
        function_name!(),
        CONFIG.scan,
        CONFIG.download
    );
    // Scan first so a following download skips images already on disk.
    if CONFIG.scan {
        old_images = scan::scan(&mut old_images);
    }
    if CONFIG.download {
        images = download::download_images(&old_images);
    }
    trace!(target: "w10s_webscraper", "{} - Merging old and new image data", function_name!());
    images.append(&mut old_images);
    trace!(target: "w10s_webscraper", "{} - Writing json", function_name!());
    let json = make_image_json(images);
    trace!(target: "w10s_webscraper", "{} - Writing json to file", function_name!());
    fileio::write_json(&CONFIG.image_directory, json);
    info!(target: "w10s_webscraper", "{} - Finished", function_name!());
}
/// Rough human-readable estimate of how long processing `count` items will
/// take, assuming ~5.1 seconds per item (requests are throttled).
///
/// Returns minutes, switching to hours once the estimate exceeds two hours.
/// Values are formatted to one decimal place — the old code printed full
/// float precision (e.g. "2.8333333333333335 hours") and used `60 as f64`.
pub fn est_time(count: usize) -> String {
    let minutes = (count as f64 * 5.1) / 60.0;
    if minutes > 120.0 {
        format!("{:.1} hours", minutes / 60.0)
    } else {
        format!("{:.1} minutes", minutes)
    }
}

69
src/percentage.rs Normal file
View File

@@ -0,0 +1,69 @@
use function_name::named;
use log::{debug, info};
pub struct Percentage {
/*
This struct is used to track the percentage of a task that has been completed.
It is used to print progress to the console.
*/
// Completion fraction at which the next progress line is printed
threshold: f32,
// Amount the threshold advances after each printed line
step_size: f32,
// Total number of items in the task
total: usize,
// Number of items completed so far
count: usize,
// Completion fraction exposed via get_percent (0.0..=1.0)
percent: f32,
}
impl Percentage {
    /// Create a tracker for `total` items, choosing a print step/threshold
    /// appropriate to the task size (finer steps for larger tasks).
    #[named]
    pub fn new(total: usize) -> Percentage {
        let mut step = 0.1;
        let mut threshold = 0.1;
        if total == 0 {
            debug!(
                "fn {} - Percentage::new() called with total = 0",
                function_name!()
            );
        }
        // Fewer than 10 items: one step per item. The `total > 0` guard is new —
        // `1.0 / 0` previously produced an infinite step size.
        if total > 0 && total < 10 {
            step = 1.0 / total as f32;
        }
        // More than 100 items: 5% steps.
        if total > 100 {
            step = 0.05;
            threshold = 0.05;
        }
        // More than 1000 items: 1% steps.
        if total > 1000 {
            step = 0.01;
            threshold = 0.01;
        }
        Percentage {
            threshold,
            step_size: step,
            total,
            count: 0,
            percent: 0.0,
        }
    }
    /// Last completion fraction recorded by `update` (0.0..=1.0).
    pub fn get_percent(&self) -> f32 {
        self.percent
    }
    /// Total number of items this tracker was created with.
    pub fn get_total(&self) -> usize {
        self.total
    }
    /// Record one completed item; print progress when the threshold is crossed.
    pub fn update(&mut self, fn_name: &str) {
        if self.total == 0 {
            info!(target: "w10s_webscraper", "fn {} - Percentage done: zero items", fn_name);
            // New early return: avoid the division by zero below.
            return;
        }
        // Update the progress
        self.count += 1;
        let percent: f32 = self.count as f32 / self.total as f32;
        // Fixed: store the computed fraction so get_percent() reflects actual
        // progress — the field was previously never written after construction.
        self.percent = percent;
        // Print and advance the threshold when it is crossed.
        if percent >= self.threshold {
            info!(target: "w10s_webscraper", "fn {} - {:.0}%", fn_name, percent * 100.0);
            self.threshold += self.step_size;
        }
    }
}

53
src/scan.rs Normal file
View File

@@ -0,0 +1,53 @@
use crate::{named, CONFIG, ImageBox, fileio};
use log::info;
/// Reconcile the json database with the files actually on disk.
///
/// Database entries whose image is missing from the directory are dropped
/// (unless blacklisted), and images found on disk but absent from the
/// database are added. Returns the reconciled list.
#[named]
pub fn scan(old_images: &mut Vec<ImageBox>) -> Vec<ImageBox> {
    info!(target: "w10s_webscraper", "fn {} - Scanning directory for existing images", function_name!());
    let scanned_images: Vec<ImageBox> = fileio::read_images(&CONFIG.image_directory);
    if scanned_images.is_empty() {
        info!(target: "w10s_webscraper", "fn {} - No images found in directory, stopping scan", function_name!());
        return old_images.to_vec();
    }
    // Hash sets turn the old O(n*m) nested scans into O(n + m) lookups, and
    // `retain` replaces the collect-indices-then-remove loop (O(n^2) removals).
    let scanned_hashes: std::collections::HashSet<&str> = scanned_images
        .iter()
        .map(|image| image.hash.as_str())
        .collect();
    // Drop entries whose file is gone, keeping blacklisted ones.
    old_images.retain(|image| image.blacklisted || scanned_hashes.contains(image.hash.as_str()));
    info!(target: "w10s_webscraper", "fn {} - Purged absent images from database", function_name!());
    // Add images present on disk but missing from the database. The set is
    // updated as we push so duplicate hashes within the scan are added once,
    // matching the old behavior.
    let mut known_hashes: std::collections::HashSet<String> = old_images
        .iter()
        .map(|image| image.hash.clone())
        .collect();
    for image in &scanned_images {
        if known_hashes.insert(image.hash.clone()) {
            old_images.push(image.clone());
        }
    }
    info!(target: "w10s_webscraper", "fn {} - Added new images to database", function_name!());
    old_images.to_vec()
}

84
src/web.rs Normal file
View File

@@ -0,0 +1,84 @@
use crate::ImageBox;
use crate::CONFIG;
pub mod fetch;
pub mod html;
use crate::percentage::Percentage;
use crate::{est_time, named};
use log::{error, info};
/// Collect links to individual image pages, starting from the site's front
/// page and walking the numbered pages. Stops early after five consecutive
/// pages yield no new links.
#[named]
pub fn get_page_links(url: &str, old_images: &Vec<ImageBox>) -> Vec<String> {
    info!(target: "w10s_webscraper", "fn {} - Collecting page links", function_name!());
    let html = fetch::fetch_html(url).unwrap_or_else(|error| {
        panic!("Problem fetching primary page: {}", error);
    });
    let mut page_links: Vec<String> = html::extract_image_page_links(&html, &old_images);
    // In test mode only the first page is used.
    if !CONFIG.test {
        let page_count = html::extract_page_count(html);
        let mut percent = Percentage::new(page_count as usize);
        info!(target: "w10s_webscraper", "fn {} - Scanning {} pages for links", function_name!(), page_count);
        info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(page_count as usize));
        let mut pages_without_new_links = 0;
        // Fixed off-by-one: the exclusive range `2..page_count` silently
        // skipped the final page.
        for i in 2..=page_count {
            let url = format!("{}page/{}/", url, i);
            let html = match fetch::fetch_html(&url) {
                Ok(html) => html,
                Err(error) => {
                    error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                    continue;
                }
            };
            let mut new_links: Vec<String> =
                html::extract_image_page_links(&html, &old_images).to_vec();
            // Track consecutive pages that produced no new links.
            if new_links.is_empty() {
                pages_without_new_links += 1;
            } else {
                pages_without_new_links = 0;
            }
            // Fixed: trigger after five empty pages as the message says
            // (the old `> 5` test required six).
            if pages_without_new_links >= 5 {
                info!(target: "w10s_webscraper", "fn {} - No new images found for five pages, stopping", function_name!());
                break;
            }
            page_links.append(&mut new_links);
            percent.update(function_name!());
        }
    }
    page_links
}
/// Fetch each image page and extract the image's url, title, date and hash,
/// returning one record per page that could be fetched. Pages that fail to
/// fetch are logged and skipped.
#[named]
pub fn get_image_data(urls: Vec<String>) -> Vec<ImageBox> {
    let mut progress = Percentage::new(urls.len());
    info!(target: "w10s_webscraper", "fn {} - Collecting data on {} images", function_name!(), urls.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(urls.len()));
    let mut images: Vec<ImageBox> = Vec::new();
    for url in urls {
        let html = match fetch::fetch_html(&url) {
            Ok(html) => html,
            Err(error) => {
                error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                continue;
            }
        };
        let image_url = html::extract_image_url(&html);
        let image_title = html::extract_image_title(&html);
        let image_date = html::extract_image_date(&html);
        // The last path segment of the page url is used as the image's hash.
        let image_hash = url.split('/').last().unwrap().to_string();
        images.push(ImageBox {
            url: image_url,
            date: image_date,
            title: image_title,
            hash: image_hash,
            blacklisted: false,
        });
        progress.update(function_name!());
    }
    images
}

74
src/web/fetch.rs Normal file
View File

@@ -0,0 +1,74 @@
use crate::named;
use lazy_static::lazy_static;
use log::{debug, trace};
use reqwest::Error;
use scraper::Html;
use std::sync::Mutex;
use std::time::Instant;
/*
do_throttled_request is heavily inspired by https://github.com/gregstoll/rust-scraping, but I've made a lot of changes
*/
lazy_static! {
// Timestamp of the most recent outbound request, shared across calls.
static ref LAST_REQUEST_MUTEX: Mutex<Option<Instant>> = Mutex::new(None);
// Minimum spacing enforced between consecutive requests.
static ref REQUEST_DELAY: std::time::Duration = std::time::Duration::from_millis(500);
}
/// Perform a blocking GET request, enforcing at least `REQUEST_DELAY` between
/// consecutive requests and retrying up to five more times on failure.
///
/// Heavily inspired by https://github.com/gregstoll/rust-scraping, with changes.
pub fn do_throttled_request(url: &str) -> Result<reqwest::blocking::Response, Error> {
    // Sleep just long enough that REQUEST_DELAY has elapsed since the last request.
    fn delay() {
        let mut last_request_mutex = LAST_REQUEST_MUTEX.lock().unwrap();
        if let Some(last_request) = last_request_mutex.take() {
            // Fixed: measure elapsed time from *now*. The old code computed
            // `last_request.duration_since(last_request)` — always zero — so
            // it slept the full delay on every request regardless of idle time.
            let elapsed = Instant::now().duration_since(last_request);
            if elapsed < *REQUEST_DELAY {
                std::thread::sleep(*REQUEST_DELAY - elapsed);
            }
        }
    }
    // First attempt.
    delay();
    let mut resp = reqwest::blocking::get(url);
    LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    // Up to five retries; the final result (ok or err) is simply returned,
    // which makes the old `if i == 4 { return resp }` escape hatch unnecessary.
    for _ in 0..5 {
        if resp.is_ok() {
            break;
        }
        delay();
        resp = reqwest::blocking::get(url);
        LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    }
    resp
}
/// Fetch `url` (throttled) and parse the response body as an HTML document.
///
/// # Errors
/// Returns the reqwest error when the request or the body decoding fails;
/// body decode failures were previously an `unwrap` panic even though the
/// function already returns `Result<_, reqwest::Error>`.
#[named]
pub fn fetch_html(url: &str) -> Result<scraper::Html, Error> {
    trace!(target: "w10s_webscraper", "fn {} - Fetching HTML from {}", function_name!(), url);
    let resp = match do_throttled_request(url) {
        Ok(resp) => resp,
        Err(e) => {
            debug!(target: "w10s_webscraper", "fn {} - Error fetching HTML from {}", function_name!(), url);
            return Err(e);
        }
    };
    // Propagate body decode errors instead of panicking.
    let body = resp.text()?;
    Ok(Html::parse_document(&body))
}
/// Download `url` (throttled) and return the raw response bytes.
pub fn fetch_image_bytes(url: &str) -> Result<Vec<u8>, reqwest::Error> {
    Ok(do_throttled_request(url)?.bytes()?.to_vec())
}

143
src/web/html.rs Normal file
View File

@@ -0,0 +1,143 @@
use chrono::{DateTime, Local};
use log::{debug, error, warn};
use scraper::{Html, Selector};
use crate::{named, ImageBox};
/// Find the highest page number among the pagination links, or 0 when none.
///
/// Links whose href is missing or whose last path segment is not numeric
/// (e.g. a trailing slash yielding an empty segment) are now skipped;
/// previously these were `unwrap`s that could panic on unexpected markup.
#[named]
pub fn extract_page_count(html: Html) -> i32 {
    let mut page_count: i32 = 0;
    // Select only the links that are page numbers.
    let selector = Selector::parse(r#"a[class="page-numbers"]"#).unwrap();
    for link in html.select(&selector) {
        let href = match link.value().attr("href") {
            Some(href) => href,
            None => continue,
        };
        // The last path segment of the href is the page number; keep the max.
        if let Some(last) = href.split('/').last() {
            if let Ok(number) = last.parse::<i32>() {
                page_count = page_count.max(number);
            }
        }
    }
    debug!(target: "w10s_webscraper", "fn {} - Extracted page count: {}", function_name!(), page_count);
    page_count
}
/// Collect hrefs that point at image pages, skipping comment anchors,
/// duplicates, and pages whose hash already appears in `old_images`.
#[named]
pub fn extract_image_page_links(html: &Html, old_images: &Vec<ImageBox>) -> Vec<String> {
    let mut page_links: Vec<String> = Vec::new();
    let selector = Selector::parse(r#"a[href]"#).unwrap();
    for link in html.select(&selector) {
        // The `a[href]` selector guarantees the attribute exists.
        let href: String = link.value().attr("href").unwrap().to_string();
        // The hash is the last path segment of the link.
        let hash: String = href.split('/').last().unwrap().to_string();
        // Idiom cleanup: `.any()` instead of find()+`== None`, `!x` instead
        // of `x == false`. Keep image links that are not comment anchors,
        // not duplicates, and not already known.
        let already_known = old_images.iter().any(|image| image.hash == hash);
        if href.contains("/images/")
            && !href.contains("#respond")
            && !href.contains("#comments")
            && !page_links.contains(&href)
            && !already_known
        {
            page_links.push(href);
        }
    }
    debug!(target: "w10s_webscraper", "fn {} - Extracted {} links: {:?}", function_name!(), page_links.len(), page_links);
    page_links
}
/// Derive the full-resolution jpg url from the page's 1024x576 preview
/// <img> tag, or return an empty string when no matching tag is found.
/// When several tags match, the last match wins (as before).
pub fn extract_image_url(html: &Html) -> String {
    // Selector matching every <img> tag in the document.
    let img_selector = Selector::parse("img").unwrap();
    let mut output = String::new();
    for tag in html.select(&img_selector) {
        let src = tag.value().attr("src").unwrap();
        // Only the 1024x576 preview under wp-content/uploads is of interest.
        if src.contains("jpg") && src.contains("wp-content/uploads/") && src.contains("1024x576") {
            // Rebuild the url from the first two '-'-separated parts plus
            // ".jpg"; the retained '-' is the one inside "wp-content".
            let parts = src.split('-').collect::<Vec<&str>>();
            output = format!("{}-{}.jpg", parts[0], parts[1]);
        }
    }
    output
}
/// Extract the date from the page's `<span class="date">` element, normalized
/// to local time at 12:00:00 -0500.
///
/// Falls back to `Local::now()` (with an error log) when no date element is
/// present or the text cannot be parsed — previously a page with no date
/// panicked on the `dates[0]` index.
#[named]
pub fn extract_image_date(html: &Html) -> DateTime<Local> {
    let selector = Selector::parse(r#"span[class="date"]"#).unwrap();
    let mut dates: Vec<String> = Vec::new();
    for date in html.select(&selector) {
        // Take the element's first text node, skipping empty elements.
        if let Some(text) = date.text().next() {
            dates.push(text.to_string())
        }
    }
    if dates.len() > 1 {
        warn!(target: "w10s_webscraper", "{} - More than one date found on page", function_name!());
    }
    let first = match dates.first() {
        Some(first) => first,
        None => {
            error!(
                target: "w10s_webscraper",
                "{} - No date found on page, using local now",
                function_name!()
            );
            return Local::now();
        }
    };
    // The html date is just "YYYY-MM-DD"; add a time and timezone so it can
    // be parsed into a DateTime object.
    let mut datetime: String = first.to_string();
    datetime.push_str(" 12:00:00 -0500");
    match DateTime::parse_from_str(&datetime, "%Y-%m-%d %H:%M:%S %z") {
        Ok(datetime) => datetime.with_timezone(&Local),
        Err(e) => {
            error!(
                target: "w10s_webscraper",
                "{} - Error parsing date, using local now: {}",
                function_name!(),
                e
            );
            Local::now()
        }
    }
}
#[named]
pub fn extract_image_title(html: &Html) -> String {
let selector = Selector::parse(r#"title"#).unwrap();
let titles = html.select(&selector);
let mut output: Vec<String> = Vec::new();
for title in titles {
let title = title.text().collect::<Vec<_>>();
output.push(title[0].to_string())
}
if output.len() > 1 {
warn!(
target: "w10s_webscraper",
"{} - More than one title found. Using the first one ({})",
function_name!(),
output[0]
);
}
output[0]
.split(" | ")
.collect::<Vec<&str>>()
.first()
.unwrap()
.to_string()
}