Initial commit

This commit is contained in:
NSawyer
2024-02-08 12:53:40 -05:00
commit d724975359
21 changed files with 1090 additions and 0 deletions

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto

14
.gitignore vendored Normal file
View File

@@ -0,0 +1,14 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

5
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"rust-analyzer.linkedProjects": [
".\\Cargo.toml"
]
}

35
Cargo.toml Normal file
View File

@@ -0,0 +1,35 @@
[package]
# Cargo's manifest key is the plural `authors` (an array); the singular
# `author` is not a recognized key and is ignored with a warning.
authors = ["Nayan Sawyer"]
name = "w10s_webscraper"
version = "0.1.0"
description = "----\n\nAn example webscraper\nfolder defaults to current directory"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
chrono = { version = "0.4.26", features = ["clock"] }
clap = { version = "4.3.9", features = ["derive"] }
env_logger = "0.10.0"
filetime_creation = "0.1.5"
function_name = "0.3.0"
hex-literal = "0.4.1"
json = "0.12.4"
lazy_static = "1.4.0"
log = "0.4.19"
log4rs = "1.2.0"
md5 = "0.7.0"
reqwest = { version = "0.11.18", features = ["blocking"] }
scraper = "0.16.0"
serde-value = "0.7.0"

# Explicit dev-profile settings (these match Cargo's defaults for `dev`).
[profile.dev]
opt-level = 0
debug = true
debug-assertions = true
overflow-checks = true
lto = false
panic = 'unwind'
incremental = true
codegen-units = 256
rpath = false

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 NSawyer
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

2
README.md Normal file
View File

@@ -0,0 +1,2 @@
# webscraper_clean
An example webscraper written in Rust

7
config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"download": true,
"url": "",
"scan": true,
"image_directory": "images",
"test": true,
"verbosity": 0
}

0
log/my.log Normal file
View File

14
notes.txt Normal file
View File

@@ -0,0 +1,14 @@
arguments
debug
target directory
scan directory
-vertical
File format
------------
name:
hash:
date_added:
blacklist:

108
src/argparse.rs Normal file
View File

@@ -0,0 +1,108 @@
use crate::{json_code, named};
use clap::Parser;
use std::path::PathBuf;
// Command-line arguments, parsed by clap's derive macro.
// (Plain `//` comments are used deliberately: `///` doc comments would be
// picked up by clap as per-flag help text and change --help output.)
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[derive(Debug)]
pub struct Cli {
    // Image folder; parse_args falls back to "images" when absent
    #[arg(short, long, value_name = "FILE")]
    pub folder: Option<PathBuf>,
    // Base URL of the site to scrape
    #[arg(short, long, value_name = "URL")]
    pub url: Option<String>,
    // Reconcile the json database with the files on disk
    #[arg(short, long)]
    pub scan: bool,
    // Turn debugging information on (counted: repeat for more verbosity)
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbosity: u8,
    // Download images not already present in the database
    #[arg(short, long)]
    pub download: bool,
    // Turn testing mode on
    #[arg(short, long)]
    pub test: bool,
    // Write the resolved configuration to config.json and exit
    #[arg(short, long)]
    pub create_config_file: bool,
}
// Resolved runtime configuration, built either from config.json or from the
// command-line arguments (config.json takes precedence — see parse_args).
#[derive(Debug)]
pub struct Config {
    // Console log verbosity: 0 = info, 1 = debug, 2 = trace (see logging.rs)
    pub verbosity: u8,
    // Base URL of the site to scrape
    pub url: String,
    // Whether to reconcile the database with the files on disk
    pub scan: bool,
    // Directory holding the downloaded images and hashes.json
    pub image_directory: PathBuf,
    // Whether to download new images
    pub download: bool,
    // Testing mode: only the first page is scanned for links (see web.rs)
    pub test: bool,
}
#[named]
/// Resolves the runtime configuration.
///
/// If `config.json` exists in the working directory it wins outright: it is
/// parsed and the command line is ignored. Otherwise the CLI arguments are
/// parsed; at least one of `--scan` / `--download` must be given. With
/// `--create-config-file` the resolved config is written out and the process
/// exits.
pub fn parse_args() -> Config {
    // If config file is present, read it and use it instead of the command line
    if let Ok(json) = std::fs::read_to_string("config.json") {
        let json: json::JsonValue = match json::parse(&json) {
            Ok(json) => json,
            Err(e) => {
                println!("fn {} - Error parsing config.json: {}", function_name!(), e);
                std::process::exit(1);
            }
        };
        return json_code::parse_config_json(json);
    }
    // Parse command line arguments
    let cli = Cli::parse();
    if !cli.scan && !cli.download {
        println!(
            "fn {} - You must specify either --scan, --download, or --help",
            function_name!()
        );
        std::process::exit(1);
    }
    let mut config = Config {
        download: cli.download,
        url: cli.url.unwrap_or_default(),
        scan: cli.scan,
        image_directory: cli.folder.unwrap_or_else(|| PathBuf::from("images")),
        test: cli.test,
        // Propagate -v counts; previously this was never copied from the CLI,
        // so the verbosity flag had no effect.
        verbosity: cli.verbosity,
    };
    // If the image directory is the default, create it if it doesn't exist
    if config.image_directory == PathBuf::from("images") && !config.image_directory.exists() {
        std::fs::create_dir(&config.image_directory).unwrap();
    }
    // If create_config_file is true, create the config file and exit
    if cli.create_config_file {
        let json = json_code::make_config_json(config);
        std::fs::write("config.json", json.pretty(2)).unwrap();
        println!("fn {} - Created config file", function_name!());
        std::process::exit(0);
    }
    config
}

36
src/download.rs Normal file
View File

@@ -0,0 +1,36 @@
use crate::{est_time, fileio, named, percentage, web, ImageBox, CONFIG};
use log::{error, info, warn};
#[named]
/// Downloads every image on the site that is not already in `old_images` and
/// writes it into the configured image directory. Returns only the newly
/// downloaded images; the caller merges them with the existing list.
pub fn download_images(old_images: &Vec<ImageBox>) -> Vec<ImageBox> {
    info!(target: "w10s_webscraper", "fn {} - Downloading images", function_name!());
    // Get the page links from the website
    let page_links = web::get_page_links(&CONFIG.url, &old_images);
    // Get the image data from each page of the website, keeping data on only the new images
    let images: Vec<ImageBox> = web::get_image_data(page_links);
    let mut percent = percentage::Percentage::new(images.len() as usize);
    info!(target: "w10s_webscraper", "fn {} - Downloading and writing {} images", function_name!(), images.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(images.len()));
    // Download and write each new image
    for image in &images {
        match fileio::write_image(&image, &CONFIG.image_directory) {
            Ok(_) => {}
            Err(error) => {
                // Failures are logged and skipped so one bad image does not
                // stop the whole run.
                match error.kind() {
                    std::io::ErrorKind::AlreadyExists => {
                        warn!(target: "w10s_webscraper", "fn {} - Image already exists, skipping: {}", function_name!(), image.hash);
                    }
                    std::io::ErrorKind::Other => {
                        // write_image wraps fetch failures as ErrorKind::Other
                        warn!(target: "w10s_webscraper", "fn {} - Error fetching image bytes, skipping: {}", function_name!(), error);
                    }
                    _ => {
                        error!(target: "w10s_webscraper", "fn {} - Error writing image, skipping: {}", function_name!(), error);
                    } // No need "continue", as the image is not written
                }
            }
        };
        percent.update(function_name!());
    }
    images
}

137
src/fileio.rs Normal file
View File

@@ -0,0 +1,137 @@
use chrono::Local;
use filetime_creation::{set_file_ctime, FileTime};
use json::JsonValue;
use log::{debug, error, info, trace};
use md5;
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::Error;
use std::path::Path;
use crate::named;
use crate::percentage::Percentage;
use crate::web::fetch::fetch_image_bytes;
use crate::ImageBox;
#[named]
/// Downloads the bytes for `image` and writes them to
/// `<image_directory>/<title>.jpg`, then back-dates the file's creation time
/// to the image's date.
///
/// # Errors
/// Returns I/O errors from creating/writing the file or setting its creation
/// time; fetch failures are wrapped as `ErrorKind::Other`.
pub fn write_image(image: &ImageBox, image_directory: &Path) -> Result<(), Error> {
    // `with_extension` expects the extension WITHOUT the leading dot; passing
    // ".jpg" produced a double-dotted file name like "title..jpg".
    let image_path = image_directory.join(&image.title).with_extension("jpg");
    // Create the image file (truncates if it already exists)
    let mut out = File::create(&image_path)?;
    // Fetch the image bytes, wrapping transport errors as io::Error so the
    // caller can treat all failures uniformly
    let content = match fetch_image_bytes(&image.url) {
        Ok(content) => content,
        Err(error) => {
            return Err(Error::new(
                std::io::ErrorKind::Other,
                format!("Error fetching image bytes from {}: {}", &image.url, error),
            ));
        }
    };
    // Write the image bytes to the image file
    out.write_all(&content)?;
    // Set the creation date of the file to the date of the image
    let image_time = FileTime::from_unix_time(image.date.timestamp(), 0);
    set_file_ctime(&image_path, image_time)?;
    trace!(target: "w10s_webscraper", "{} - Image {} written", function_name!(), image.title);
    Ok(())
}
#[named]
/// Scans `image_directory` and returns an `ImageBox` for every `.jpg` file,
/// with the hash computed (md5) from the actual file bytes. Non-jpg files and
/// unreadable entries are logged and skipped. Panics if the directory itself
/// cannot be read.
pub fn read_images(image_directory: &Path) -> Vec<ImageBox> {
    // reads the image directory and returns a vector of ImageBox structs with hashes from the actual images
    let mut images: Vec<ImageBox> = Vec::new();
    // Iterate over the files in the image directory
    let files = fs::read_dir(&image_directory)
        .unwrap_or_else(|error| {
            error!(target: "w10s_webscraper", "fn {} - Error reading image directory: {}", function_name!(), error);
            panic!(
                "{} - Error reading image directory: {}",
                function_name!(),
                error
            )
        })
        .collect::<Vec<_>>();
    let mut percent = Percentage::new(files.len());
    info!(target: "w10s_webscraper", "fn {} - Reading {} files", function_name!(), files.len());
    for file in files {
        let file = match file {
            Ok(file) => file,
            Err(error) => {
                error!(target: "w10s_webscraper", "fn {} - Error reading file, skipping: {}", function_name!(), error);
                continue;
            }
        };
        // If the file is a jpg, read the hash from the file and add it to the vector
        let jpg = match file.path().extension() {
            Some(str) => {
                if str == "jpg" {
                    true
                } else {
                    trace!(target: "w10s_webscraper", "fn {} - File is not a jpg, skipping: {}", function_name!(), file.path().display());
                    false
                }
            }
            _ => {
                trace!(target: "w10s_webscraper", "fn {} - File has no extension?, skipping: {}", function_name!(), file.path().display());
                false
            }
        };
        if jpg {
            // Hash the file contents; the hash is the identity used to match
            // on-disk images against database entries
            let hash = md5::compute(std::fs::read(file.path()).unwrap());
            let image_box = ImageBox {
                // "scanned" marks entries discovered on disk rather than downloaded
                url: "scanned".to_string(),
                date: Local::now(),
                title: file.file_name().into_string().unwrap(),
                hash: format!("{:x}", hash),
                blacklisted: false,
            };
            images.push(image_box);
            trace!(
                target: "w10s_webscraper",
                "fn {} - Image {} read",
                function_name!(),
                &file.file_name().into_string().unwrap()
            );
        }
        percent.update(function_name!());
    }
    images
}
#[named]
/// Reads and parses `<image_directory>/hashes.json`.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be opened or read
/// (callers check for `NotFound` on first run), and `ErrorKind::InvalidData`
/// if the contents are not valid JSON.
pub fn read_json(image_directory: &Path) -> Result<JsonValue, Error> {
    let path = image_directory.join("hashes.json");
    trace!(
        target: "w10s_webscraper",
        "fn {} - Reading json file, expect confirmation",
        function_name!()
    );
    let mut file = File::open(path)?;
    trace!(target: "w10s_webscraper", "fn {} - json file read", function_name!());
    let mut buf = String::new();
    file.read_to_string(&mut buf)?;
    // Surface malformed JSON as an io::Error instead of panicking via unwrap,
    // so the caller's error handling path is reached.
    let json = json::parse(&buf)
        .map_err(|e| Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
    debug!(target: "w10s_webscraper", "fn {} - Loaded json file", function_name!());
    Ok(json)
}
#[named]
pub fn write_json(path: &Path, json: JsonValue) {
let json = json.pretty(2);
let path = path.join("hashes.json");
// Create file, and overwrite it if it exists
let mut file = match File::create(path) {
Ok(file) => file,
Err(error) => panic!("{} - Error creating json file: {}", function_name!(), error),
};
match file.write_all(json.as_bytes()) {
Ok(_) => (),
Err(error) => panic!("{} - Error writing json file: {}", function_name!(), error),
};
debug!(target: "w10s_webscraper", "fn {} - Wrote json file", function_name!());
}

41
src/image_data.rs Normal file
View File

@@ -0,0 +1,41 @@
use chrono::{DateTime, Local};
/// Metadata record for a single scraped image.
// Clone was previously a hand-written field-by-field impl; deriving it is
// equivalent and less code. Debug is derived as well for diagnostics.
#[derive(Clone, Debug)]
pub struct ImageBox {
    // Source URL, or "scanned" for images discovered on disk
    pub url: String,
    // Date the image was published / added
    pub date: DateTime<Local>,
    // Title, used as the on-disk file name
    pub title: String,
    // Content hash (hex); this alone defines image identity
    pub hash: String,
    // Blacklisted entries stay in the database but are never re-downloaded
    pub blacklisted: bool,
}

impl std::fmt::Display for ImageBox {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "{{\n\turl: {}\n\tdate: {}\n\ttitle: {}\n\thash: {}\n\tblacklisted: {}\n}}",
            self.url,
            self.date.date_naive(),
            self.title,
            self.hash,
            self.blacklisted
        )
    }
}

// Two images are equal when their hashes match, regardless of URL/title/date.
impl PartialEq for ImageBox {
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}

107
src/json_code.rs Normal file
View File

@@ -0,0 +1,107 @@
use std::path::PathBuf;
use crate::{image_data::ImageBox, named, Config};
use chrono::DateTime;
use json::{array, object, JsonValue};
use log::{debug, trace, warn};
/*
The json code is pretty much all just converting structs to json and back again.
*/
#[named]
/// Serializes the runtime `Config` into the JSON object shape read back by
/// `parse_config_json`. Panics if the image directory path is not valid UTF-8.
pub fn make_config_json(config: Config) -> JsonValue {
    debug!(target: "w10s_webscraper", "{} - Converting config data to json", function_name!());
    object! {
        "download": config.download,
        "url": config.url.to_string(),
        "scan": config.scan,
        "image_directory": config.image_directory.to_str().unwrap(),
        "test": config.test,
        "verbosity": config.verbosity,
    }
}
#[named]
/// Builds a `Config` from a parsed `config.json` object.
///
/// Missing or mistyped keys fall back to the same defaults `parse_args` uses
/// (`false` for the flags, `"images"` for the directory, `0` for verbosity)
/// instead of panicking on `unwrap` — notably, older config files may not
/// carry a "verbosity" key at all.
pub fn parse_config_json(json: JsonValue) -> Config {
    debug!(target: "w10s_webscraper", "{} - Parsing config json", function_name!());
    // Default the directory rather than producing a literal "null" path
    let image_directory = if json["image_directory"].is_null() {
        PathBuf::from("images")
    } else {
        PathBuf::from(json["image_directory"].to_string())
    };
    let config = Config {
        download: json["download"].as_bool().unwrap_or(false),
        url: json["url"].to_string(),
        scan: json["scan"].as_bool().unwrap_or(false),
        image_directory,
        test: json["test"].as_bool().unwrap_or(false),
        verbosity: json["verbosity"].as_u8().unwrap_or(0),
    };
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    config
}
#[named]
/// Converts the image list into the JSON document persisted as `hashes.json`.
pub fn make_image_json(images: Vec<ImageBox>) -> JsonValue {
    trace!(target: "w10s_webscraper", "{} - Converting image data to json", function_name!());
    let mut doc = object! {
        "info": r#"A file with "blacklist" = "true" means that the image entry will remain in the database, but will not be downloaded. This allows you to delete a photo and not download it again. Blacklisted images will keep their entries when the image is absent and you run a file scan."#,
        "images": array![]
    };
    for entry in images {
        // Dates are stored as RFC 2822 and parsed back by parse_image_json
        let record = object! {
            "hash": entry.hash,
            "date_added": entry.date.to_rfc2822(),
            "url": entry.url,
            "title": entry.title,
            "blacklisted": entry.blacklisted,
        };
        doc["images"].push(record).unwrap();
    }
    trace!(target: "w10s_webscraper", "{} - Finished conversion", function_name!());
    doc
}
#[named]
/// Parses the "images" array of a hashes.json document back into `ImageBox`
/// structs. Dates are stored as RFC 2822; an unparseable date is logged and
/// replaced with the unix epoch rather than aborting the load.
pub fn parse_image_json(json: JsonValue) -> Vec<ImageBox> {
    debug!(target: "w10s_webscraper", "{} - Parsing image json", function_name!());
    let mut images: Vec<ImageBox> = Vec::new();
    for image in json["images"].members() {
        let image_box = ImageBox {
            url: image["url"].to_string(),
            date: DateTime::from(
                match DateTime::parse_from_rfc2822(image["date_added"].to_string().as_str()) {
                    Ok(date) => date,
                    Err(error) => {
                        warn!(
                            target: "w10s_webscraper",
                            "{} - Error parsing date, defaulting to unix 0: {}",
                            function_name!(),
                            error
                        );
                        // Epoch fallback keeps one bad entry from killing the run
                        DateTime::parse_from_rfc2822("Thu, 01 Jan 1970 00:00:00 +0000").unwrap()
                    }
                },
            ),
            title: image["title"].to_string(),
            hash: image["hash"].to_string(),
            // NOTE(review): a missing or non-boolean "blacklisted" key panics
            // here — confirm every persisted entry carries it
            blacklisted: image["blacklisted"].as_bool().unwrap(),
        };
        images.push(image_box);
    }
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    images
}

56
src/logging.rs Normal file
View File

@@ -0,0 +1,56 @@
use crate::CONFIG;
use log::LevelFilter;
use log4rs::append::console::ConsoleAppender;
use log4rs::append::rolling_file::policy::compound::roll::fixed_window::FixedWindowRoller;
use log4rs::append::rolling_file::policy::compound::trigger::size::SizeTrigger;
use log4rs::append::rolling_file::policy::compound::CompoundPolicy;
use log4rs::append::rolling_file::RollingFileAppender;
use log4rs::config::{Appender, Config, Logger, Root};
use log4rs::encode::pattern::PatternEncoder;
use log4rs::filter::threshold::ThresholdFilter;
//use crate::CONFIG;
/// Builds and installs the global log4rs configuration.
///
/// The console appender is filtered by CONFIG.verbosity (1 = Debug, 2 = Trace,
/// anything else = Info); the rolling file appender always logs at Trace for
/// the "w10s_webscraper" target, rolling at 50 KiB with up to 50 archives.
pub fn initialize_logging() -> log4rs::Handle {
    // Map the -v count onto a console level filter
    let console_level: LevelFilter = match &CONFIG.verbosity {
        1 => LevelFilter::Debug,
        2 => LevelFilter::Trace,
        _ => LevelFilter::Info,
    };
    // Console output; the threshold filter is attached when the appender is
    // registered below
    let console = ConsoleAppender::builder()
        .encoder(Box::new(PatternEncoder::new("{h({l})}: {m}{n}")))
        .build();
    // Rolling file output: archives named log/my{N}.log, max 50 kept
    let roller = FixedWindowRoller::builder()
        .build("log/my{}.log", 50)
        .unwrap();
    let policy: CompoundPolicy =
        CompoundPolicy::new(Box::new(SizeTrigger::new(50 * 1024)), Box::new(roller));
    let file_appender = RollingFileAppender::builder()
        .encoder(Box::new(PatternEncoder::new(
            "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}",
        )))
        .build("log/my.log", Box::new(policy))
        .unwrap();
    let config = Config::builder()
        .appender(
            Appender::builder()
                .filter(Box::new(ThresholdFilter::new(console_level)))
                .build("stdout", Box::new(console)),
        )
        .appender(Appender::builder().build("file_logger", Box::new(file_appender)))
        .logger(
            Logger::builder()
                // additive = false prevents doubled output through the root logger
                .additive(false)
                .appender("stdout")
                .appender("file_logger")
                .build("w10s_webscraper", LevelFilter::Trace),
        )
        .build(Root::builder().appender("stdout").build(LevelFilter::Warn))
        .unwrap();
    log4rs::init_config(config).unwrap()
}

82
src/main.rs Normal file
View File

@@ -0,0 +1,82 @@
use argparse::Config;
pub use function_name::named;
use json::JsonValue;
use lazy_static::lazy_static;
pub use log::{debug, error, info, trace, warn};
pub use log::{Level, LevelFilter};
mod download;
mod fileio;
mod image_data;
mod logging;
pub mod percentage;
mod scan;
pub use image_data::ImageBox;
pub mod argparse;
mod json_code;
mod web;
use json_code::make_image_json;
lazy_static! {
    // Global configuration, resolved once on first access. config.json, when
    // present, wins over the command line — see argparse::parse_args.
    static ref CONFIG: Config = argparse::parse_args();
}
#[named]
/// Entry point: loads the image database, optionally reconciles it with the
/// files on disk (scan), optionally downloads new images (download), then
/// writes the merged database back out as hashes.json.
fn main() {
    logging::initialize_logging();
    info!(target: "w10s_webscraper", "{} - Starting", function_name!());
    trace!(target: "w10s_webscraper", "{} - Beginning of json loading", function_name!());
    // Load json database of existing images in the image directory
    let json = match fileio::read_json(&CONFIG.image_directory) {
        Ok(json) => json,
        Err(error) => {
            // A missing database is expected on the first run; anything else is fatal
            if error.kind() == std::io::ErrorKind::NotFound {
                warn!(target: "w10s_webscraper", "{} - Json file not found, will create one", function_name!());
                JsonValue::new_object()
            } else {
                error!(target: "w10s_webscraper", "{} - Error reading json file: {}", function_name!(), error);
                panic!("{} - Error reading json file: {}", function_name!(), error);
            }
        }
    };
    // Parse json into a vector of ImageBox structs
    let mut old_images: Vec<ImageBox> = json_code::parse_image_json(json);
    trace!(target: "w10s_webscraper", "{} - End of json loading", function_name!());
    // Create an empty vector of ImageBox structs to hold the new images
    let mut images: Vec<ImageBox> = Vec::new();
    trace!(target: "w10s_webscraper",
        "{} - CONFIG.scan={}, 0: download only, 2: scan only, 1: do both",
        function_name!(),
        CONFIG.scan
    );
    // Determine if we need to download images, scan the image directory, or both
    if CONFIG.scan {
        old_images = scan::scan(&mut old_images);
    }
    if CONFIG.download {
        images = download::download_images(&old_images);
    }
    trace!(target: "w10s_webscraper", "{} - Merging old and new image data", function_name!());
    // Merge the old and new image data (old entries appended after the new ones)
    images.append(&mut old_images);
    trace!(target: "w10s_webscraper", "{} - Writing json", function_name!());
    // Write the new json file
    let json = make_image_json(images);
    trace!(target: "w10s_webscraper", "{} - Writing json to file", function_name!());
    fileio::write_json(&CONFIG.image_directory, json);
    info!(target: "w10s_webscraper", "{} - Finished", function_name!());
}
/// Rough wall-clock estimate for processing `count` items at ~5.1 seconds
/// each, formatted in minutes, switching to hours once the estimate exceeds
/// 120 minutes.
pub fn est_time(count: usize) -> String {
    // 5.1 s per item (empirical request + throttle time), expressed in minutes
    let minutes = (count as f64 * 5.1) / 60.0;
    if minutes > 120.0 {
        format!("{} hours", minutes / 60.0)
    } else {
        format!("{} minutes", minutes)
    }
}

69
src/percentage.rs Normal file
View File

@@ -0,0 +1,69 @@
use function_name::named;
use log::{debug, info};
/// Tracks progress through a fixed-size task and logs percentage milestones.
pub struct Percentage {
    /*
    This struct is used to track the percentage of a task that has been completed.
    It is used to print progress to the console.
    */
    // Next completion fraction (0.0-1.0) at which a progress line is logged
    threshold: f32,
    // How far the threshold advances after each logged milestone
    step_size: f32,
    // Total number of items in the task
    total: usize,
    // Number of items processed so far
    count: usize,
    // Completion fraction exposed via get_percent()
    percent: f32,
}
impl Percentage {
    #[named]
    /// Creates a tracker for `total` items, choosing a log step size suited to
    /// the task size (per-item below 10 items, 5% above 100, 1% above 1000).
    pub fn new(total: usize) -> Percentage {
        let mut step = 0.1;
        let mut threshold = 0.1;
        if total == 0 {
            debug!(
                "fn {} - Percentage::new() called with total = 0",
                function_name!()
            );
        }
        // If there are less than 10 items, set the step size to whatever percentage of the total each item represents
        if total < 10 {
            step = 1.0 / total as f32;
        }
        // If there are greater than 100 items, set the step size and threshold to 5%
        if total > 100 {
            step = 0.05;
            threshold = 0.05;
        }
        // If there are greater than 1000 items, set the step size and threshold to 1%
        if total > 1000 {
            step = 0.01;
            threshold = 0.01;
        }
        Percentage {
            threshold,
            step_size: step,
            total,
            count: 0,
            percent: 0.0,
        }
    }
    /// Latest completion fraction (0.0-1.0) recorded by `update`.
    pub fn get_percent(&self) -> f32 {
        self.percent
    }
    /// Total number of items this tracker was created with.
    pub fn get_total(&self) -> usize {
        self.total
    }
    /// Records one completed item and logs a progress line whenever the
    /// completion fraction crosses the current threshold.
    pub fn update(&mut self, fn_name: &str) {
        if self.total == 0 {
            info!(target: "w10s_webscraper", "fn {} - Percentage done: zero items", fn_name);
        }
        // Update the progress
        self.count += 1;
        // Store the fraction on self so get_percent() reflects real progress;
        // previously this field was never written and always read 0.0.
        self.percent = self.count as f32 / self.total as f32;
        // If the percentage is greater than the threshold, print the percentage
        if self.percent >= self.threshold {
            info!(target: "w10s_webscraper", "fn {} - {:.0}%", fn_name, self.percent * 100.0);
            // Update the threshold
            self.threshold += self.step_size;
        }
    }
}

53
src/scan.rs Normal file
View File

@@ -0,0 +1,53 @@
use crate::{named, CONFIG, ImageBox, fileio};
use log::info;
#[named]
/// Reconciles the JSON database (`old_images`) with the images actually on
/// disk: entries whose file is gone are dropped (unless blacklisted), and
/// on-disk files with no entry are added. Returns the reconciled list.
pub fn scan(old_images: &mut Vec<ImageBox>) -> Vec<ImageBox> {
    // Scan image directory for existing images
    info!(target: "w10s_webscraper", "fn {} - Scanning directory for existing images", function_name!());
    let scanned_images: Vec<ImageBox> = fileio::read_images(&CONFIG.image_directory);
    if scanned_images.is_empty() {
        info!(target: "w10s_webscraper", "fn {} - No images found in directory, stopping scan", function_name!());
        return old_images.to_vec();
    }
    // Drop database entries whose image is no longer on disk, keeping
    // blacklisted entries so they are never re-downloaded. retain replaces
    // the previous collect-indices-then-remove-in-reverse pass (O(n^2)).
    old_images.retain(|image| {
        image.blacklisted
            || scanned_images
                .iter()
                .any(|scanned| scanned.hash == image.hash)
    });
    info!(target: "w10s_webscraper", "fn {} - Purged absent images from database", function_name!());
    // Add images that are in the directory, but not in the json
    for image in &scanned_images {
        let known = old_images.iter().any(|old| old.hash == image.hash);
        if !known {
            old_images.push(image.clone());
        }
    }
    info!(target: "w10s_webscraper", "fn {} - Added new images to database", function_name!());
    old_images.to_vec()
}

84
src/web.rs Normal file
View File

@@ -0,0 +1,84 @@
use crate::ImageBox;
use crate::CONFIG;
pub mod fetch;
pub mod html;
use crate::percentage::Percentage;
use crate::{est_time, named};
use log::{error, info};
#[named]
/// Collects links to individual image pages, starting at `url` and walking
/// the site's numbered pages. Links whose hash is already in `old_images`
/// are skipped; in test mode only the first page is scanned. Panics if the
/// primary page cannot be fetched.
pub fn get_page_links(url: &str, old_images: &Vec<ImageBox>) -> Vec<String> {
    info!(target: "w10s_webscraper", "fn {} - Collecting page links", function_name!());
    let html = fetch::fetch_html(url).unwrap_or_else(|error| {
        panic!("Problem fetching primary page: {}", error);
    });
    let mut page_links: Vec<String> = html::extract_image_page_links(&html, &old_images);
    if !CONFIG.test {
        // Not in test mode: walk every numbered page
        let page_count = html::extract_page_count(html);
        let mut percent = Percentage::new(page_count as usize);
        info!(target: "w10s_webscraper", "fn {} - Scanning {} pages for links", function_name!(), page_count);
        info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(page_count as usize));
        let mut fully_skipped_page_count = 0;
        // Page 1 was already scanned above. The range is inclusive so the
        // final page is visited too (2..page_count silently dropped it).
        for i in 2..=page_count {
            let url = format!("{}page/{}/", url, i);
            let html = match fetch::fetch_html(&url) {
                Ok(html) => html,
                Err(error) => {
                    error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                    continue;
                }
            };
            let mut new_links: Vec<String> =
                html::extract_image_page_links(&html, &old_images).to_vec();
            // Count consecutive pages that yielded nothing new; once more than
            // five in a row, assume we've caught up and stop early
            if new_links.is_empty() {
                fully_skipped_page_count += 1;
            } else {
                fully_skipped_page_count = 0;
            }
            if fully_skipped_page_count > 5 {
                info!(target: "w10s_webscraper", "fn {} - No new images found for five pages, stopping", function_name!());
                break;
            }
            page_links.append(&mut new_links);
            percent.update(function_name!());
        }
    }
    page_links
}
#[named]
/// Visits each image page URL and extracts an `ImageBox` (image link, title,
/// date, and hash) for it; pages that fail to fetch are logged and skipped.
pub fn get_image_data(urls: Vec<String>) -> Vec<ImageBox> {
    let mut progress = Percentage::new(urls.len());
    info!(target: "w10s_webscraper", "fn {} - Collecting data on {} images", function_name!(), urls.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(urls.len()));
    let mut collected: Vec<ImageBox> = Vec::new();
    for url in urls {
        let html = match fetch::fetch_html(&url) {
            Ok(html) => html,
            Err(error) => {
                error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
                continue;
            }
        };
        let image_link = html::extract_image_url(&html);
        let image_title = html::extract_image_title(&html);
        let image_date = html::extract_image_date(&html);
        // The page's hash is its URL's final path segment
        let image_hash = url.split("/").last().unwrap().to_string();
        collected.push(ImageBox {
            url: image_link,
            date: image_date,
            title: image_title,
            hash: image_hash,
            blacklisted: false,
        });
        progress.update(function_name!());
    }
    collected
}

74
src/web/fetch.rs Normal file
View File

@@ -0,0 +1,74 @@
use crate::named;
use lazy_static::lazy_static;
use log::{debug, trace};
use reqwest::Error;
use scraper::Html;
use std::sync::Mutex;
use std::time::Instant;
/*
do_throttled_request is heavily inspired by https://github.com/gregstoll/rust-scraping, but I've made a lot of changes
*/
lazy_static! {
    // Timestamp of the most recent HTTP request, shared across callers so
    // throttling applies process-wide
    static ref LAST_REQUEST_MUTEX: Mutex<Option<Instant>> = Mutex::new(None);
    // Minimum spacing enforced between consecutive requests
    static ref REQUEST_DELAY: std::time::Duration = std::time::Duration::from_millis(500);
}
/// Performs a blocking GET of `url`, enforcing at least `REQUEST_DELAY`
/// between consecutive requests and retrying up to five more times on
/// transport failure.
pub fn do_throttled_request(url: &str) -> Result<reqwest::blocking::Response, Error> {
    // Sleep just long enough that REQUEST_DELAY has elapsed since the
    // previous request.
    fn delay() {
        let last_request = LAST_REQUEST_MUTEX.lock().unwrap().take();
        if let Some(last_request) = last_request {
            // Time since the previous request finished. (Comparing the stored
            // timestamp against itself yields zero and never sleeps — the
            // elapsed wall-clock time is what must be measured.)
            let elapsed = last_request.elapsed();
            if elapsed < *REQUEST_DELAY {
                std::thread::sleep(*REQUEST_DELAY - elapsed);
            }
        }
    }
    // First request
    delay();
    let mut resp = reqwest::blocking::get(url);
    LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    // Up to 5 retries on failure; the last result (ok or err) is returned
    for _ in 0..5 {
        if resp.is_ok() {
            break;
        }
        delay();
        resp = reqwest::blocking::get(url);
        LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
    }
    resp
}
#[named]
/// Fetches `url` (throttled) and parses the response body as an HTML document.
///
/// # Errors
/// Returns the transport error if the request fails or the body cannot be
/// read (the body read previously panicked via `unwrap`).
pub fn fetch_html(url: &str) -> Result<scraper::Html, Error> {
    trace!(target: "w10s_webscraper", "fn {} - Fetching HTML from {}", function_name!(), url);
    let resp = match do_throttled_request(url) {
        Ok(resp) => resp,
        Err(e) => {
            debug!(target: "w10s_webscraper", "fn {} - Error fetching HTML from {}", function_name!(), url);
            return Err(e);
        }
    };
    // text() returns the same reqwest::Error type, so `?` propagates it to
    // callers instead of aborting the process
    let body = resp.text()?;
    Ok(Html::parse_document(&body))
}
/// Downloads `url` (throttled) and returns the raw response body bytes.
pub fn fetch_image_bytes(url: &str) -> Result<Vec<u8>, reqwest::Error> {
    let body = do_throttled_request(url)?.bytes()?;
    Ok(body.to_vec())
}

143
src/web/html.rs Normal file
View File

@@ -0,0 +1,143 @@
use chrono::{DateTime, Local};
use log::{debug, error, warn};
use scraper::{Html, Selector};
use crate::{named, ImageBox};
#[named]
pub fn extract_page_count(html: Html) -> i32 {
let mut page_count: i32 = 0;
// select only the links that are page numbers
let selector = Selector::parse(r#"a[class="page-numbers"]"#).unwrap();
let links = html.select(&selector);
// iterate over the links and find the highest page number
for link in links {
let href = link.value().attr("href").unwrap();
// get the last element of the link, which is the page number
let raw = href.split("/").collect::<Vec<&str>>();
let last = raw.last().unwrap();
let last = last.parse::<i32>().unwrap(); // cast the last element to an i32
if last > page_count {
page_count = last;
}
}
debug!(target: "w10s_webscraper", "fn {} - Extracted page count: {}", function_name!(), page_count);
page_count
}
#[named]
/// Extracts links to image pages from `html`, skipping comment anchors,
/// duplicates, and images already present in `old_images` (matched by the
/// link's final path segment, which is the image hash).
pub fn extract_image_page_links(html: &Html, old_images: &Vec<ImageBox>) -> Vec<String> {
    let mut page_links: Vec<String> = Vec::new();
    let selector = Selector::parse(r#"a[href]"#).unwrap();
    let links = html.select(&selector);
    for link in links {
        // get the href attribute
        let href: String = link.value().attr("href").unwrap().to_string();
        // the image hash is the last path segment of the link
        let hash: String = href
            .split("/")
            .collect::<Vec<&str>>()
            .last()
            .unwrap()
            .to_string();
        // Keep only image links that are not comment anchors, not already
        // collected this pass, and not already present in the database
        let already_known = old_images.iter().any(|x| x.hash == hash);
        if href.contains("/images/")
            && !href.contains("#respond")
            && !href.contains("#comments")
            && !page_links.contains(&href)
            && !already_known
        {
            page_links.push(href);
        }
    }
    debug!(target: "w10s_webscraper", "fn {} - Extracted {} links: {:?}", function_name!(), page_links.len(), page_links);
    page_links
}
/// Scans every <img> tag and returns the full-size jpg URL derived from the
/// 1024x576 thumbnail's src; returns an empty string when no tag matches.
/// If several tags match, the last one wins.
pub fn extract_image_url(html: &Html) -> String {
    // A selector object that knows how to find every <img> tag
    let selector = Selector::parse("img").unwrap();
    let mut output: String = String::new();
    for image in html.select(&selector) {
        // raw src attribute of this image tag
        let src = image.value().attr("src").unwrap();
        // only jpg thumbnails under wp-content/uploads at the 1024x576 size
        if src.contains("jpg") && src.contains("wp-content/uploads/") && src.contains("1024x576") {
            // The full-size URL is everything before the second '-' (the
            // first '-' belongs to "wp-content"), with ".jpg" appended
            let pieces = src.split("-").collect::<Vec<&str>>();
            output = format!("{}-{}.jpg", pieces[0], pieces[1]);
        }
    }
    output
}
#[named]
/// Extracts the image's date from the page's <span class="date"> element,
/// interpreted as noon at UTC-5 on that day. Falls back to the current local
/// time (with an error log) when no date element is present or the text
/// cannot be parsed.
pub fn extract_image_date(html: &Html) -> DateTime<Local> {
    let selector = Selector::parse(r#"span[class="date"]"#).unwrap();
    let html_dates = html.select(&selector);
    let mut dates: Vec<String> = Vec::new();
    for date in html_dates {
        let date = date.text().collect::<Vec<_>>();
        dates.push(date[0].to_string())
    }
    if dates.len() > 1 {
        warn!(target: "w10s_webscraper", "{} - More than one date found on page", function_name!());
    }
    // Guard against pages with no date at all — indexing dates[0] would panic
    if dates.is_empty() {
        error!(
            target: "w10s_webscraper",
            "{} - No date found on page, using local now",
            function_name!()
        );
        return Local::now();
    }
    // date comes out of the html as "2020-01-01"; append a time and timezone
    // so it can be parsed into a DateTime
    let mut datetime: String = dates[0].to_string();
    datetime.push_str(" 12:00:00 -0500");
    match DateTime::parse_from_str(&datetime, "%Y-%m-%d %H:%M:%S %z") {
        Ok(datetime) => datetime.with_timezone(&Local),
        Err(e) => {
            error!(
                target: "w10s_webscraper",
                "{} - Error parsing date, using local now: {}",
                function_name!(),
                e
            );
            Local::now()
        }
    }
}
#[named]
/// Returns the page's <title> text, truncated at the first " | " separator
/// (dropping everything after it, e.g. a site-name suffix).
pub fn extract_image_title(html: &Html) -> String {
    let selector = Selector::parse(r#"title"#).unwrap();
    let titles = html.select(&selector);
    let mut output: Vec<String> = Vec::new();
    for title in titles {
        let title = title.text().collect::<Vec<_>>();
        output.push(title[0].to_string())
    }
    if output.len() > 1 {
        warn!(
            target: "w10s_webscraper",
            "{} - More than one title found. Using the first one ({})",
            function_name!(),
            output[0]
        );
    }
    // NOTE(review): output[0] panics when the page has no <title> element —
    // confirm every fetched page carries one
    output[0]
        .split(" | ")
        .collect::<Vec<&str>>()
        .first()
        .unwrap()
        .to_string()
}