mirror of
https://github.com/opus-tango/webscraper_clean.git
synced 2026-03-20 03:55:20 +00:00
Initial commit
This commit is contained in:
2
.gitattributes
vendored
Normal file
2
.gitattributes
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Auto detect text files and perform LF normalization
|
||||||
|
* text=auto
|
||||||
14
.gitignore
vendored
Normal file
14
.gitignore
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# Generated by Cargo
|
||||||
|
# will have compiled files and executables
|
||||||
|
debug/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||||
|
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||||
|
Cargo.lock
|
||||||
|
|
||||||
|
# These are backup files generated by rustfmt
|
||||||
|
**/*.rs.bk
|
||||||
|
|
||||||
|
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||||
|
*.pdb
|
||||||
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"rust-analyzer.linkedProjects": [
|
||||||
|
".\\Cargo.toml"
|
||||||
|
]
|
||||||
|
}
|
||||||
35
Cargo.toml
Normal file
35
Cargo.toml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
[package]
|
||||||
|
authors = ["Nayan Sawyer"]
|
||||||
|
name = "w10s_webscraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "----\n\nAn example webscraper\nfolder defaults to current directory"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
chrono = { version = "0.4.26", features = ["clock"] }
|
||||||
|
clap = { version = "4.3.9", features = ["derive"] }
|
||||||
|
env_logger = "0.10.0"
|
||||||
|
filetime_creation = "0.1.5"
|
||||||
|
function_name = "0.3.0"
|
||||||
|
hex-literal = "0.4.1"
|
||||||
|
json = "0.12.4"
|
||||||
|
lazy_static = "1.4.0"
|
||||||
|
log = "0.4.19"
|
||||||
|
log4rs = "1.2.0"
|
||||||
|
md5 = "0.7.0"
|
||||||
|
reqwest = { version = "0.11.18", features = ["blocking"] }
|
||||||
|
scraper = "0.16.0"
|
||||||
|
serde-value = "0.7.0"
|
||||||
|
|
||||||
|
[profile.dev]
|
||||||
|
opt-level = 0
|
||||||
|
debug = true
|
||||||
|
debug-assertions = true
|
||||||
|
overflow-checks = true
|
||||||
|
lto = false
|
||||||
|
panic = 'unwind'
|
||||||
|
incremental = true
|
||||||
|
codegen-units = 256
|
||||||
|
rpath = false
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 NSawyer
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
2
README.md
Normal file
2
README.md
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# webscraper_clean
|
||||||
|
An example webscraper written in Rust
|
||||||
7
config.json
Normal file
7
config.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"download": true,
|
||||||
|
"url": "",
|
||||||
|
"scan": true,
|
||||||
|
"image_directory": "images",
|
||||||
|
"test": true,
"verbosity": 0
|
||||||
|
}
|
||||||
0
log/my.log
Normal file
0
log/my.log
Normal file
14
notes.txt
Normal file
14
notes.txt
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
arguments
|
||||||
|
|
||||||
|
debug
|
||||||
|
target directory
|
||||||
|
scan directory
|
||||||
|
-vertical
|
||||||
|
|
||||||
|
|
||||||
|
File format
|
||||||
|
------------
|
||||||
|
name:
|
||||||
|
hash:
|
||||||
|
date_added:
|
||||||
|
blacklist:
|
||||||
108
src/argparse.rs
Normal file
108
src/argparse.rs
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
use crate::{json_code, named};
|
||||||
|
use clap::Parser;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[command(author, version, about, long_about = None)]
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Cli {
|
||||||
|
#[arg(short, long, value_name = "FILE")]
|
||||||
|
pub folder: Option<PathBuf>,
|
||||||
|
|
||||||
|
#[arg(short, long, value_name = "URL")]
|
||||||
|
pub url: Option<String>,
|
||||||
|
|
||||||
|
#[arg(short, long)]
|
||||||
|
pub scan: bool,
|
||||||
|
|
||||||
|
// Turn debugging information on
|
||||||
|
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||||
|
pub verbosity: u8,
|
||||||
|
|
||||||
|
#[arg(short, long)]
|
||||||
|
pub download: bool,
|
||||||
|
|
||||||
|
// Turn testing mode on
|
||||||
|
#[arg(short, long)]
|
||||||
|
pub test: bool,
|
||||||
|
|
||||||
|
#[arg(short, long)]
|
||||||
|
pub create_config_file: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Effective runtime configuration, merged either from `config.json`
/// (when present) or from the command line (see `parse_args`).
#[derive(Debug)]
pub struct Config {
    // 0 = info, 1 = debug, 2 = trace (mapped in logging::initialize_logging)
    pub verbosity: u8,
    // Base URL of the site to scrape; empty string when not supplied
    pub url: String,
    // Whether to reconcile the image directory with the json database
    pub scan: bool,
    // Directory images are written to; defaults to "images"
    pub image_directory: PathBuf,
    // Whether to download new images from the site
    pub download: bool,
    // Testing mode: only the first page is scraped (see web::get_page_links)
    pub test: bool,
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn parse_args() -> Config {
|
||||||
|
// If config file is present, read it and use it to override the command line arguments
|
||||||
|
if let Ok(json) = std::fs::read_to_string("config.json") {
|
||||||
|
let json: json::JsonValue = match json::parse(&json) {
|
||||||
|
Ok(json) => json,
|
||||||
|
Err(e) => {
|
||||||
|
println!("fn {} - Error parsing config.json: {}", function_name!(), e);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let config = json_code::parse_config_json(json);
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
// Parse command line arguments
|
||||||
|
let cli = Cli::parse();
|
||||||
|
let mut config = Config {
|
||||||
|
download: false,
|
||||||
|
url: String::from(""),
|
||||||
|
scan: false,
|
||||||
|
image_directory: PathBuf::from("images"),
|
||||||
|
test: false,
|
||||||
|
verbosity: 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
if cli.scan == false && cli.download == false {
|
||||||
|
println!(
|
||||||
|
"fn {} - You must specify either --scan, --download, or --help",
|
||||||
|
function_name!()
|
||||||
|
);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
if let Some(path) = cli.folder.as_deref() {
|
||||||
|
config.image_directory = path.to_path_buf();
|
||||||
|
}
|
||||||
|
if cli.scan {
|
||||||
|
config.scan = cli.scan;
|
||||||
|
}
|
||||||
|
if let Some(url) = cli.url.as_deref() {
|
||||||
|
config.url = url.to_string();
|
||||||
|
}
|
||||||
|
if cli.test {
|
||||||
|
config.test = cli.test;
|
||||||
|
}
|
||||||
|
|
||||||
|
if cli.download {
|
||||||
|
config.download = cli.download;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the image directory is the default, create it if it doesn't exist
|
||||||
|
if config.image_directory.eq(&PathBuf::from("images")) {
|
||||||
|
if config.image_directory.exists() == false {
|
||||||
|
std::fs::create_dir(&config.image_directory).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If create_config_file is true, create the config file and exit
|
||||||
|
if cli.create_config_file {
|
||||||
|
let json = json_code::make_config_json(config);
|
||||||
|
let json = json.pretty(2);
|
||||||
|
std::fs::write("config.json", json).unwrap();
|
||||||
|
println!("fn {} - Created config file", function_name!());
|
||||||
|
std::process::exit(0);
|
||||||
|
}
|
||||||
|
config
|
||||||
|
}
|
||||||
36
src/download.rs
Normal file
36
src/download.rs
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
use crate::{est_time, fileio, named, percentage, web, ImageBox, CONFIG};
|
||||||
|
use log::{error, info, warn};
|
||||||
|
|
||||||
|
/// Scrapes the configured site, downloads every image not already present
/// in `old_images`, writes each to `CONFIG.image_directory`, and returns
/// the metadata of the newly collected images.
///
/// Individual write failures (duplicate file, fetch error, other I/O) are
/// logged and skipped; note the failed image still remains in the returned
/// vector, since `images` is built before the write loop.
#[named]
pub fn download_images(old_images: &Vec<ImageBox>) -> Vec<ImageBox> {
    info!(target: "w10s_webscraper", "fn {} - Downloading images", function_name!());
    // Get the page links from the website
    let page_links = web::get_page_links(&CONFIG.url, &old_images);
    // Get the image data from each page of the website, keeping data on only the new images
    let images: Vec<ImageBox> = web::get_image_data(page_links);

    let mut percent = percentage::Percentage::new(images.len() as usize);
    info!(target: "w10s_webscraper", "fn {} - Downloading and writing {} images", function_name!(), images.len());
    info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(images.len()));
    // Download and write each new image
    for image in &images {
        match fileio::write_image(&image, &CONFIG.image_directory) {
            Ok(_) => {}
            Err(error) => {
                // write_image reports fetch failures as ErrorKind::Other;
                // everything else comes from the filesystem.
                match error.kind() {
                    std::io::ErrorKind::AlreadyExists => {
                        warn!(target: "w10s_webscraper", "fn {} - Image already exists, skipping: {}", function_name!(), image.hash);
                    }
                    std::io::ErrorKind::Other => {
                        warn!(target: "w10s_webscraper", "fn {} - Error fetching image bytes, skipping: {}", function_name!(), error);
                    }
                    _ => {
                        error!(target: "w10s_webscraper", "fn {} - Error writing image, skipping: {}", function_name!(), error);
                    } // No need "continue", as the image is not written
                }
            }
        };
        percent.update(function_name!());
    }
    images
}
|
||||||
137
src/fileio.rs
Normal file
137
src/fileio.rs
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
use chrono::Local;
|
||||||
|
use filetime_creation::{set_file_ctime, FileTime};
|
||||||
|
use json::JsonValue;
|
||||||
|
use log::{debug, error, info, trace};
|
||||||
|
use md5;
|
||||||
|
use std::fs::{self, File};
|
||||||
|
use std::io::prelude::*;
|
||||||
|
use std::io::Error;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use crate::named;
|
||||||
|
use crate::percentage::Percentage;
|
||||||
|
use crate::web::fetch::fetch_image_bytes;
|
||||||
|
use crate::ImageBox;
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn write_image(image: &ImageBox, image_directory: &Path) -> Result<(), Error> {
|
||||||
|
// Join the image_directory path with the image title and .jpg
|
||||||
|
let image_path = image_directory.join(&image.title).with_extension(".jpg");
|
||||||
|
// Create the image file
|
||||||
|
let mut out = File::create(&image_path)?;
|
||||||
|
// Fetch the image bytes
|
||||||
|
let mut content = match fetch_image_bytes(&image.url) {
|
||||||
|
Ok(content) => content,
|
||||||
|
Err(error) => {
|
||||||
|
return Err(Error::new(
|
||||||
|
std::io::ErrorKind::Other,
|
||||||
|
format!("Error fetching image bytes from {}: {}", &image.url, error),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Write the image bytes to the image file
|
||||||
|
out.write_all(&mut content)?;
|
||||||
|
|
||||||
|
// Next we need to set the creation date of the file to the date of the image
|
||||||
|
let image_time = FileTime::from_unix_time(image.date.timestamp(), 0);
|
||||||
|
set_file_ctime(&image_path, image_time)?;
|
||||||
|
trace!(target: "w10s_webscraper", "{} - Image {} written", function_name!(), image.title);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn read_images(image_directory: &Path) -> Vec<ImageBox> {
|
||||||
|
// reads the image directory and returns a vector of ImageBox structs with hashes from the actual images
|
||||||
|
let mut images: Vec<ImageBox> = Vec::new();
|
||||||
|
// Iterate over the files in the image directory
|
||||||
|
let files = fs::read_dir(&image_directory)
|
||||||
|
.unwrap_or_else(|error| {
|
||||||
|
error!(target: "w10s_webscraper", "fn {} - Error reading image directory: {}", function_name!(), error);
|
||||||
|
panic!(
|
||||||
|
"{} - Error reading image directory: {}",
|
||||||
|
function_name!(),
|
||||||
|
error
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
let mut percent = Percentage::new(files.len());
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Reading {} files", function_name!(), files.len());
|
||||||
|
for file in files {
|
||||||
|
let file = match file {
|
||||||
|
Ok(file) => file,
|
||||||
|
Err(error) => {
|
||||||
|
error!(target: "w10s_webscraper", "fn {} - Error reading file, skipping: {}", function_name!(), error);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// If the file is a jpg, read the hash from the file and add it to the vector
|
||||||
|
let jpg = match file.path().extension() {
|
||||||
|
Some(str) => {
|
||||||
|
if str == "jpg" {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
trace!(target: "w10s_webscraper", "fn {} - File is not a jpg, skipping: {}", function_name!(), file.path().display());
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
trace!(target: "w10s_webscraper", "fn {} - File has no extension?, skipping: {}", function_name!(), file.path().display());
|
||||||
|
false
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if jpg {
|
||||||
|
let hash = md5::compute(std::fs::read(file.path()).unwrap());
|
||||||
|
let image_box = ImageBox {
|
||||||
|
url: "scanned".to_string(),
|
||||||
|
date: Local::now(),
|
||||||
|
title: file.file_name().into_string().unwrap(),
|
||||||
|
hash: format!("{:x}", hash),
|
||||||
|
blacklisted: false,
|
||||||
|
};
|
||||||
|
images.push(image_box);
|
||||||
|
trace!(
|
||||||
|
target: "w10s_webscraper",
|
||||||
|
"fn {} - Image {} read",
|
||||||
|
function_name!(),
|
||||||
|
&file.file_name().into_string().unwrap()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
percent.update(function_name!());
|
||||||
|
}
|
||||||
|
images
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn read_json(image_directory: &Path) -> Result<JsonValue, Error> {
|
||||||
|
let path = image_directory.join("hashes.json");
|
||||||
|
// Read the json from the file and return it
|
||||||
|
trace!(
|
||||||
|
target: "w10s_webscraper",
|
||||||
|
"fn {} - Reading json file, expect confirmation",
|
||||||
|
function_name!()
|
||||||
|
);
|
||||||
|
let mut file = File::open(path)?;
|
||||||
|
trace!(target: "w10s_webscraper", "fn {} - json file read", function_name!());
|
||||||
|
|
||||||
|
let mut buf = String::new();
|
||||||
|
file.read_to_string(&mut buf)?;
|
||||||
|
let json = json::parse(&buf).unwrap();
|
||||||
|
debug!(target: "w10s_webscraper", "fn {} - Loaded json file", function_name!());
|
||||||
|
Ok(json)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn write_json(path: &Path, json: JsonValue) {
|
||||||
|
let json = json.pretty(2);
|
||||||
|
let path = path.join("hashes.json");
|
||||||
|
// Create file, and overwrite it if it exists
|
||||||
|
let mut file = match File::create(path) {
|
||||||
|
Ok(file) => file,
|
||||||
|
Err(error) => panic!("{} - Error creating json file: {}", function_name!(), error),
|
||||||
|
};
|
||||||
|
match file.write_all(json.as_bytes()) {
|
||||||
|
Ok(_) => (),
|
||||||
|
Err(error) => panic!("{} - Error writing json file: {}", function_name!(), error),
|
||||||
|
};
|
||||||
|
debug!(target: "w10s_webscraper", "fn {} - Wrote json file", function_name!());
|
||||||
|
}
|
||||||
41
src/image_data.rs
Normal file
41
src/image_data.rs
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
use chrono::{DateTime, Local};
|
||||||
|
|
||||||
|
/// Metadata for a single scraped image; the md5 `hash` is the identity
/// used for equality and deduplication (see the PartialEq impl).
pub struct ImageBox {
    // Source URL, or "scanned" for entries discovered on disk (fileio::read_images)
    pub url: String,
    // Date the image was added to the database
    pub date: DateTime<Local>,
    // File title; the on-disk name is derived from this plus a jpg extension
    pub title: String,
    // md5 of the image bytes, formatted as lowercase hex
    pub hash: String,
    // Blacklisted entries stay in the database but are never re-downloaded
    pub blacklisted: bool,
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for ImageBox {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"{{\n\turl: {}\n\tdate: {}\n\ttitle: {}\n\thash: {}\n\tblacklisted: {}\n}}",
|
||||||
|
self.url,
|
||||||
|
self.date.date_naive(),
|
||||||
|
self.title,
|
||||||
|
self.hash,
|
||||||
|
self.blacklisted
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Two images are considered equal when their content hashes match; all
// other fields (url, title, date, blacklisted) are ignored.
impl PartialEq for ImageBox {
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}
|
||||||
|
|
||||||
|
impl Clone for ImageBox {
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
ImageBox {
|
||||||
|
url: self.url.clone(),
|
||||||
|
date: self.date.clone(),
|
||||||
|
title: self.title.clone(),
|
||||||
|
hash: self.hash.clone(),
|
||||||
|
blacklisted: self.blacklisted.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
107
src/json_code.rs
Normal file
107
src/json_code.rs
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use crate::{image_data::ImageBox, named, Config};
|
||||||
|
use chrono::DateTime;
|
||||||
|
use json::{array, object, JsonValue};
|
||||||
|
use log::{debug, trace, warn};
|
||||||
|
|
||||||
|
/*
|
||||||
|
The json code is pretty much all just converting structs to json and back again.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn make_config_json(config: Config) -> JsonValue {
|
||||||
|
debug!(target: "w10s_webscraper", "{} - Converting config data to json", function_name!());
|
||||||
|
let download = config.download;
|
||||||
|
let url = &config.url;
|
||||||
|
let scan = config.scan;
|
||||||
|
let image_directory = &config.image_directory;
|
||||||
|
let image_directory = image_directory.to_str().unwrap();
|
||||||
|
let test = config.test;
|
||||||
|
let verbosity = config.verbosity;
|
||||||
|
|
||||||
|
|
||||||
|
let json = object! {
|
||||||
|
"download": download,
|
||||||
|
"url": url.to_string(),
|
||||||
|
"scan": scan,
|
||||||
|
"image_directory": image_directory,
|
||||||
|
"test": test,
|
||||||
|
"verbosity": verbosity,
|
||||||
|
};
|
||||||
|
json
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn parse_config_json(json: JsonValue) -> Config {
|
||||||
|
debug!(target: "w10s_webscraper", "{} - Parsing config json", function_name!());
|
||||||
|
let download = json["download"].as_bool().unwrap();
|
||||||
|
let url = json["url"].to_string();
|
||||||
|
let scan = json["scan"].as_bool().unwrap();
|
||||||
|
let image_directory = json["image_directory"].to_string();
|
||||||
|
let test = json["test"].as_bool().unwrap();
|
||||||
|
let verbosity = json["verbosity"].as_u8().unwrap();
|
||||||
|
|
||||||
|
let config = Config {
|
||||||
|
download,
|
||||||
|
url,
|
||||||
|
scan,
|
||||||
|
image_directory: PathBuf::from(image_directory),
|
||||||
|
test,
|
||||||
|
verbosity,
|
||||||
|
};
|
||||||
|
debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
|
||||||
|
config
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn make_image_json(images: Vec<ImageBox>) -> JsonValue {
|
||||||
|
trace!(target: "w10s_webscraper", "{} - Converting image data to json", function_name!());
|
||||||
|
let mut json = object! {
|
||||||
|
"info": r#"A file with "blacklist" = "true" means that the image entry will remain in the database, but will not be downloaded. This allows you to delete a photo and not download it again. Blacklisted images will keep their entries when the image is absent and you run a file scan."#,
|
||||||
|
"images": array![]
|
||||||
|
};
|
||||||
|
for image in images {
|
||||||
|
let image_json = object! {
|
||||||
|
"hash": image.hash,
|
||||||
|
"date_added": image.date.to_rfc2822(),
|
||||||
|
"url": image.url,
|
||||||
|
"title": image.title,
|
||||||
|
"blacklisted": image.blacklisted,
|
||||||
|
};
|
||||||
|
json["images"].push(image_json).unwrap();
|
||||||
|
}
|
||||||
|
trace!(target: "w10s_webscraper", "{} - Finished conversion", function_name!());
|
||||||
|
json
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Deserializes the "images" array produced by `make_image_json` back into
/// `ImageBox` values. An unparseable "date_added" is logged and replaced
/// with the unix epoch rather than aborting the load.
#[named]
pub fn parse_image_json(json: JsonValue) -> Vec<ImageBox> {
    debug!(target: "w10s_webscraper", "{} - Parsing image json", function_name!());
    let mut images: Vec<ImageBox> = Vec::new();
    for image in json["images"].members() {
        let image_box = ImageBox {
            url: image["url"].to_string(),
            // Dates are stored as rfc2822 strings; fall back to unix 0 on
            // parse failure so one bad entry doesn't lose the database.
            date: DateTime::from(
                match DateTime::parse_from_rfc2822(image["date_added"].to_string().as_str()) {
                    Ok(date) => date,
                    Err(error) => {
                        warn!(
                            target: "w10s_webscraper",
                            "{} - Error parsing date, defaulting to unix 0: {}",
                            function_name!(),
                            error
                        );
                        DateTime::parse_from_rfc2822("Thu, 01 Jan 1970 00:00:00 +0000").unwrap()
                    }
                },
            ),
            title: image["title"].to_string(),
            hash: image["hash"].to_string(),
            // NOTE(review): a missing "blacklisted" key panics here —
            // confirm all persisted entries carry it.
            blacklisted: image["blacklisted"].as_bool().unwrap(),
        };
        images.push(image_box);
    }
    debug!(target: "w10s_webscraper", "{} - Finished parsing json", function_name!());
    images
}
|
||||||
56
src/logging.rs
Normal file
56
src/logging.rs
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
use crate::CONFIG;
|
||||||
|
use log::LevelFilter;
|
||||||
|
use log4rs::append::console::ConsoleAppender;
|
||||||
|
use log4rs::append::rolling_file::policy::compound::roll::fixed_window::FixedWindowRoller;
|
||||||
|
use log4rs::append::rolling_file::policy::compound::trigger::size::SizeTrigger;
|
||||||
|
use log4rs::append::rolling_file::policy::compound::CompoundPolicy;
|
||||||
|
use log4rs::append::rolling_file::RollingFileAppender;
|
||||||
|
use log4rs::config::{Appender, Config, Logger, Root};
|
||||||
|
use log4rs::encode::pattern::PatternEncoder;
|
||||||
|
use log4rs::filter::threshold::ThresholdFilter;
|
||||||
|
//use crate::CONFIG;
|
||||||
|
|
||||||
|
/// Sets up log4rs with two appenders for the "w10s_webscraper" target:
/// a console appender filtered by the configured verbosity, and a rolling
/// file appender (log/my.log, rolled at 50 KiB into up to 50 archives,
/// always recording at Trace). The root logger writes warnings to stdout.
/// Returns the log4rs handle.
pub fn initialize_logging() -> log4rs::Handle {
    // Initialize logging

    // Map the verbosity count to the console level: 0 Info, 1 Debug, 2 Trace.
    let level: LevelFilter = match &CONFIG.verbosity {
        0 => LevelFilter::Info,
        1 => LevelFilter::Debug,
        2 => LevelFilter::Trace,
        _ => LevelFilter::Info,
    };
    let stdout = ConsoleAppender::builder()
        .encoder(Box::new(PatternEncoder::new("{h({l})}: {m}{n}")))
        .build(); // This appender is filtered, but only later

    // Archived logs are named log/my{index}.log; at most 50 are kept.
    let roller = FixedWindowRoller::builder()
        .build("log/my{}.log", 50)
        .unwrap();
    // Roll the active file once it reaches 50 KiB.
    let policy: CompoundPolicy =
        CompoundPolicy::new(Box::new(SizeTrigger::new(50 * 1024)), Box::new(roller));
    let file_logger = RollingFileAppender::builder()
        .encoder(Box::new(PatternEncoder::new(
            "{d(%Y-%m-%d %H:%M:%S)(utc)} - {h({l})}: {m}{n}",
        )))
        .build("log/my.log", Box::new(policy))
        .unwrap();

    let config = Config::builder()
        .appender(
            Appender::builder()
                .filter(Box::new(ThresholdFilter::new(level))) // This is the filter
                .build("stdout", Box::new(stdout)),
        )
        .appender(Appender::builder().build("file_logger", Box::new(file_logger)))
        .logger(
            Logger::builder()
                .additive(false) // If additive is true, you get double output from the stdout appender
                .appender("stdout")
                .appender("file_logger")
                .build("w10s_webscraper", LevelFilter::Trace),
        )
        .build(Root::builder().appender("stdout").build(LevelFilter::Warn))
        .unwrap();
    let handle = log4rs::init_config(config).unwrap();
    handle
}
|
||||||
82
src/main.rs
Normal file
82
src/main.rs
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
use argparse::Config;
|
||||||
|
pub use function_name::named;
|
||||||
|
use json::JsonValue;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
pub use log::{debug, error, info, trace, warn};
|
||||||
|
pub use log::{Level, LevelFilter};
|
||||||
|
|
||||||
|
mod download;
|
||||||
|
mod fileio;
|
||||||
|
mod image_data;
|
||||||
|
mod logging;
|
||||||
|
pub mod percentage;
|
||||||
|
mod scan;
|
||||||
|
pub use image_data::ImageBox;
|
||||||
|
pub mod argparse;
|
||||||
|
mod json_code;
|
||||||
|
mod web;
|
||||||
|
use json_code::make_image_json;
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref CONFIG: Config = argparse::parse_args();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Entry point: loads the json image database, runs the scan and/or
/// download phases selected by CONFIG, merges old and new image metadata,
/// and writes the database back out.
#[named]
fn main() {
    logging::initialize_logging();

    info!(target: "w10s_webscraper", "{} - Starting", function_name!());
    trace!(target: "w10s_webscraper", "{} - Beginning of json loading", function_name!());
    // Load json database of existing images in the image directory; a
    // missing file means first run and starts an empty database, any other
    // read error aborts.
    let json = match fileio::read_json(&CONFIG.image_directory) {
        Ok(json) => json,
        Err(error) => {
            if error.kind() == std::io::ErrorKind::NotFound {
                warn!(target: "w10s_webscraper", "{} - Json file not found, will create one", function_name!());
                JsonValue::new_object()
            } else {
                error!(target: "w10s_webscraper", "{} - Error reading json file: {}", function_name!(), error);
                panic!("{} - Error reading json file: {}", function_name!(), error);
            }
        }
    };
    // Parse json into a vector of ImageBox structs
    let mut old_images: Vec<ImageBox> = json_code::parse_image_json(json);
    trace!(target: "w10s_webscraper", "{} - End of json loading", function_name!());

    // Create an empty vector of ImageBox structs to hold the new images
    let mut images: Vec<ImageBox> = Vec::new();

    // NOTE(review): this message describes scan as 0/1/2, but CONFIG.scan
    // is a bool — the text looks stale; confirm and update.
    trace!(target: "w10s_webscraper",
        "{} - CONFIG.scan={}, 0: download only, 2: scan only, 1: do both",
        function_name!(),
        CONFIG.scan
    );
    // Determine if we need to download images, scan the image directory, or both
    if CONFIG.scan {
        old_images = scan::scan(&mut old_images);
    }
    if CONFIG.download {
        images = download::download_images(&old_images);
    }

    trace!(target: "w10s_webscraper", "{} - Merging old and new image data", function_name!());
    // Merge the old and new image data
    images.append(&mut old_images);
    trace!(target: "w10s_webscraper", "{} - Writing json", function_name!());

    // Write the new json file
    let json = make_image_json(images);
    trace!(target: "w10s_webscraper", "{} - Writing json to file", function_name!());
    fileio::write_json(&CONFIG.image_directory, json);
    info!(target: "w10s_webscraper", "{} - Finished", function_name!());
}
|
||||||
|
|
||||||
|
pub fn est_time(vector: usize) -> String {
|
||||||
|
let est_time = (vector as f64 * 5.1) / 60 as f64;
|
||||||
|
if est_time > 120.0 {
|
||||||
|
format!("{} hours", est_time / 60.0)
|
||||||
|
} else {
|
||||||
|
format!("{} minutes", est_time)
|
||||||
|
}
|
||||||
|
}
|
||||||
69
src/percentage.rs
Normal file
69
src/percentage.rs
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
use function_name::named;
|
||||||
|
use log::{debug, info};
|
||||||
|
pub struct Percentage {
    /*
    This struct is used to track the percentage of a task that has been completed.
    It is used to print progress to the console.
    */
    // Next completion fraction at which a progress line is logged
    threshold: f32,
    // Amount the threshold advances after each report
    step_size: f32,
    // Total number of items being tracked
    total: usize,
    // Items completed so far
    count: usize,
    // Completion fraction. NOTE(review): `update` computes a fraction but
    // never stores it here, so `get_percent` always returns 0.0 — confirm
    // whether the field should be kept in sync.
    percent: f32,
}
|
||||||
|
|
||||||
|
impl Percentage {
|
||||||
|
#[named]
|
||||||
|
pub fn new(total: usize) -> Percentage {
|
||||||
|
let mut step = 0.1;
|
||||||
|
let mut threshold = 0.1;
|
||||||
|
if total == 0 {
|
||||||
|
debug!(
|
||||||
|
"fn {} - Percentage::new() called with total = 0",
|
||||||
|
function_name!()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// If there are less than 10 items, set the step size to whatever percentage of the total each item represents
|
||||||
|
if total < 10 {
|
||||||
|
step = 1.0 / total as f32;
|
||||||
|
}
|
||||||
|
// If there are greater than 100 items, set the step size and threshold to 5%
|
||||||
|
if total > 100 {
|
||||||
|
step = 0.05;
|
||||||
|
threshold = 0.05;
|
||||||
|
}
|
||||||
|
// If there are greater than 1000 items, set the step size and threshold to 1%
|
||||||
|
if total > 1000 {
|
||||||
|
step = 0.01;
|
||||||
|
threshold = 0.01;
|
||||||
|
}
|
||||||
|
Percentage {
|
||||||
|
threshold,
|
||||||
|
step_size: step,
|
||||||
|
total,
|
||||||
|
count: 0,
|
||||||
|
percent: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn get_percent(&self) -> f32 {
|
||||||
|
self.percent
|
||||||
|
}
|
||||||
|
pub fn get_total(&self) -> usize {
|
||||||
|
self.total
|
||||||
|
}
|
||||||
|
pub fn update(&mut self, fn_name: &str) {
|
||||||
|
if self.total == 0 {
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Percentage done: zero items", fn_name);
|
||||||
|
}
|
||||||
|
// Update the progress
|
||||||
|
self.count += 1;
|
||||||
|
// Calculate the percentage
|
||||||
|
let percent: f32 = self.count as f32 / self.total as f32;
|
||||||
|
// If the percentage is greater than the threshold, print the percentage
|
||||||
|
if percent >= self.threshold {
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - {:.0}%", fn_name, percent * 100.0);
|
||||||
|
// Update the threshold
|
||||||
|
self.threshold += self.step_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
53
src/scan.rs
Normal file
53
src/scan.rs
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
use crate::{named, CONFIG, ImageBox, fileio};
|
||||||
|
use log::info;
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn scan(old_images: &mut Vec<ImageBox>) -> Vec<ImageBox> {
|
||||||
|
// Scan image directory for existing images
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Scanning directory for existing images", function_name!());
|
||||||
|
let scanned_images: Vec<ImageBox> = fileio::read_images(&CONFIG.image_directory);
|
||||||
|
if scanned_images.len() == 0 {
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - No images found in directory, stopping scan", function_name!());
|
||||||
|
return old_images.to_vec();
|
||||||
|
}
|
||||||
|
let mut indexies_to_remove: Vec<usize> = Vec::new();
|
||||||
|
let mut pos: usize = 0;
|
||||||
|
// Determine the json entries that are not present in the directory
|
||||||
|
for image in &*old_images {
|
||||||
|
let mut is_present = false;
|
||||||
|
for scanned_image in &scanned_images {
|
||||||
|
if image.hash == scanned_image.hash {
|
||||||
|
is_present = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !is_present && image.blacklisted == false {
|
||||||
|
indexies_to_remove.push(pos)
|
||||||
|
}
|
||||||
|
pos += 1;
|
||||||
|
}
|
||||||
|
// Remove the entries from the json
|
||||||
|
// Remove in reverse order to avoid index issues
|
||||||
|
indexies_to_remove.reverse();
|
||||||
|
for index in indexies_to_remove {
|
||||||
|
old_images.remove(index);
|
||||||
|
}
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Purged absent images from database", function_name!());
|
||||||
|
|
||||||
|
// Add images that are in the directory, but not in the json
|
||||||
|
for image in &scanned_images {
|
||||||
|
let mut is_old = false;
|
||||||
|
for old_image in &*old_images {
|
||||||
|
if image.hash == old_image.hash {
|
||||||
|
is_old = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !is_old {
|
||||||
|
old_images.push(image.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Added new images to database", function_name!());
|
||||||
|
|
||||||
|
old_images.to_vec()
|
||||||
|
}
|
||||||
84
src/web.rs
Normal file
84
src/web.rs
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
use crate::ImageBox;
|
||||||
|
use crate::CONFIG;
|
||||||
|
pub mod fetch;
|
||||||
|
pub mod html;
|
||||||
|
use crate::percentage::Percentage;
|
||||||
|
use crate::{est_time, named};
|
||||||
|
use log::{error, info};
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn get_page_links(url: &str, old_images: &Vec<ImageBox>) -> Vec<String> {
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Collecting page links", function_name!());
|
||||||
|
let html = fetch::fetch_html(url).unwrap_or_else(|error| {
|
||||||
|
panic!("Problem fetching primary page: {}", error);
|
||||||
|
});
|
||||||
|
let mut page_links: Vec<String> = html::extract_image_page_links(&html, &old_images);
|
||||||
|
if CONFIG.test == false {
|
||||||
|
// if debug is false, then we want to get all of the pages
|
||||||
|
let page_count = html::extract_page_count(html);
|
||||||
|
let mut percent = Percentage::new(page_count as usize);
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Scanning {} pages for links", function_name!(), page_count);
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(page_count as usize));
|
||||||
|
let mut fully_skipped_page_count = 0;
|
||||||
|
for i in 2..page_count {
|
||||||
|
let url = format!("{}page/{}/", url, i);
|
||||||
|
let html = match fetch::fetch_html(&url) {
|
||||||
|
Ok(html) => html,
|
||||||
|
Err(error) => {
|
||||||
|
error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let mut new_links: Vec<String> =
|
||||||
|
html::extract_image_page_links(&html, &old_images).to_vec();
|
||||||
|
|
||||||
|
// If we get no new links five pages in a row, then we can skip the rest of the pages
|
||||||
|
if new_links.len() == 0 {
|
||||||
|
fully_skipped_page_count += 1;
|
||||||
|
}
|
||||||
|
if new_links.len() > 0 {
|
||||||
|
fully_skipped_page_count = 0;
|
||||||
|
}
|
||||||
|
if fully_skipped_page_count > 5 {
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - No new images found for five pages, stopping", function_name!());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
page_links.append(&mut new_links);
|
||||||
|
percent.update(function_name!());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
page_links
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn get_image_data(urls: Vec<String>) -> Vec<ImageBox> {
|
||||||
|
let mut percent = Percentage::new(urls.len());
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Collecting data on {} images", function_name!(), urls.len());
|
||||||
|
info!(target: "w10s_webscraper", "fn {} - Estimated time: {}", function_name!(), est_time(urls.len()));
|
||||||
|
let mut images: Vec<ImageBox> = Vec::new();
|
||||||
|
for url in urls {
|
||||||
|
let html = match fetch::fetch_html(&url) {
|
||||||
|
Ok(html) => html,
|
||||||
|
Err(error) => {
|
||||||
|
error!(target: "w10s_webscraper", "fn {} - Problem fetching page {}: {}", function_name!(), url, error);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let image_link = html::extract_image_url(&html);
|
||||||
|
let image_title = html::extract_image_title(&html);
|
||||||
|
let image_date = html::extract_image_date(&html);
|
||||||
|
let image_hash = url.split("/").last().unwrap().to_string();
|
||||||
|
let image = ImageBox {
|
||||||
|
url: image_link,
|
||||||
|
date: image_date,
|
||||||
|
title: image_title,
|
||||||
|
hash: image_hash,
|
||||||
|
blacklisted: false,
|
||||||
|
};
|
||||||
|
images.push(image);
|
||||||
|
percent.update(function_name!());
|
||||||
|
}
|
||||||
|
images
|
||||||
|
}
|
||||||
74
src/web/fetch.rs
Normal file
74
src/web/fetch.rs
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
use crate::named;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use log::{debug, trace};
|
||||||
|
use reqwest::Error;
|
||||||
|
use scraper::Html;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
/*
|
||||||
|
do_throttled_request is heavily inspired by https://github.com/gregstoll/rust-scraping, but I've made a lot of changes
|
||||||
|
*/
|
||||||
|
|
||||||
|
lazy_static! {
    // Timestamp of the most recent HTTP request, shared by all callers of
    // do_throttled_request; None until the first request is made.
    static ref LAST_REQUEST_MUTEX: Mutex<Option<Instant>> = Mutex::new(None);
    // Minimum spacing enforced between consecutive HTTP requests.
    static ref REQUEST_DELAY: std::time::Duration = std::time::Duration::from_millis(500);
}
|
||||||
|
|
||||||
|
pub fn do_throttled_request(url: &str) -> Result<reqwest::blocking::Response, Error> {
|
||||||
|
fn delay() {
|
||||||
|
let mut last_request_mutex = LAST_REQUEST_MUTEX.lock().unwrap();
|
||||||
|
let last_request = last_request_mutex.take();
|
||||||
|
//let now = Instant::now();
|
||||||
|
|
||||||
|
if let Some(last_request) = last_request {
|
||||||
|
let duration = last_request.duration_since(last_request);
|
||||||
|
if duration < *REQUEST_DELAY {
|
||||||
|
std::thread::sleep(*REQUEST_DELAY - duration);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// First request
|
||||||
|
delay();
|
||||||
|
let mut resp = reqwest::blocking::get(url);
|
||||||
|
LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
|
||||||
|
// Retry 5 times
|
||||||
|
if resp.is_err() {
|
||||||
|
for i in 0..5 {
|
||||||
|
delay();
|
||||||
|
resp = reqwest::blocking::get(url);
|
||||||
|
LAST_REQUEST_MUTEX.lock().unwrap().replace(Instant::now());
|
||||||
|
|
||||||
|
if resp.is_ok() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if i == 4 {
|
||||||
|
return resp; // Return error after 5 tries
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
resp
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn fetch_html(url: &str) -> Result<scraper::Html, Error> {
|
||||||
|
trace!(target: "w10s_webscraper", "fn {} - Fetching HTML from {}", function_name!(), url);
|
||||||
|
let resp = match do_throttled_request(url) {
|
||||||
|
Ok(resp) => resp,
|
||||||
|
Err(e) => {
|
||||||
|
debug!(target: "w10s_webscraper", "fn {} - Error fetching HTML from {}", function_name!(), url);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let html = resp.text().unwrap();
|
||||||
|
let html = Html::parse_document(&html);
|
||||||
|
Ok(html)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn fetch_image_bytes(url: &str) -> Result<Vec<u8>, reqwest::Error> {
|
||||||
|
let resp = do_throttled_request(url)?;
|
||||||
|
let bytes = resp.bytes()?;
|
||||||
|
Ok(bytes.to_vec())
|
||||||
|
}
|
||||||
143
src/web/html.rs
Normal file
143
src/web/html.rs
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
use chrono::{DateTime, Local};
|
||||||
|
use log::{debug, error, warn};
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
|
||||||
|
use crate::{named, ImageBox};
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn extract_page_count(html: Html) -> i32 {
|
||||||
|
let mut page_count: i32 = 0;
|
||||||
|
// select only the links that are page numbers
|
||||||
|
let selector = Selector::parse(r#"a[class="page-numbers"]"#).unwrap();
|
||||||
|
let links = html.select(&selector);
|
||||||
|
// iterate over the links and find the highest page number
|
||||||
|
for link in links {
|
||||||
|
let href = link.value().attr("href").unwrap();
|
||||||
|
// get the last element of the link, which is the page number
|
||||||
|
let raw = href.split("/").collect::<Vec<&str>>();
|
||||||
|
let last = raw.last().unwrap();
|
||||||
|
let last = last.parse::<i32>().unwrap(); // cast the last element to an i32
|
||||||
|
if last > page_count {
|
||||||
|
page_count = last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!(target: "w10s_webscraper", "fn {} - Extracted page count: {}", function_name!(), page_count);
|
||||||
|
page_count
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn extract_image_page_links(html: &Html, old_images: &Vec<ImageBox>) -> Vec<String> {
|
||||||
|
let mut page_links: Vec<String> = Vec::new();
|
||||||
|
let selector = Selector::parse(r#"a[href]"#).unwrap();
|
||||||
|
let links = html.select(&selector);
|
||||||
|
for link in links {
|
||||||
|
// get the href attribute
|
||||||
|
let href: String = link.value().attr("href").unwrap().to_string();
|
||||||
|
// get the hash in the link
|
||||||
|
let hash: String = href
|
||||||
|
.split("/")
|
||||||
|
.collect::<Vec<&str>>()
|
||||||
|
.last()
|
||||||
|
.unwrap()
|
||||||
|
.to_string();
|
||||||
|
// if the link is an image, and it is not a comment link, and it is not a duplicate, then add it to the list
|
||||||
|
let image = old_images.iter().find(|&x| x.hash == hash);
|
||||||
|
if href.contains("/images/")
|
||||||
|
&& href.contains("#respond") == false
|
||||||
|
&& href.contains("#comments") == false
|
||||||
|
&& page_links.contains(&href) == false
|
||||||
|
&& image == None
|
||||||
|
{
|
||||||
|
page_links.push(href.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!(target: "w10s_webscraper", "fn {} - Extracted {} links: {:?}", function_name!(), page_links.len(), page_links);
|
||||||
|
page_links
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn extract_image_url(html: &Html) -> String {
|
||||||
|
// construct a CSS selector that will grab all of the image tags
|
||||||
|
// This selector is not the html snippets themselves, but rather an object that knows how to select them
|
||||||
|
let selector = Selector::parse("img").unwrap();
|
||||||
|
// use the selector to find all img tags in the document
|
||||||
|
let images = html.select(&selector);
|
||||||
|
// iterate over the elements (references to tags) that the selector found, and assign the correct one to the output variable
|
||||||
|
let mut output: String = String::new();
|
||||||
|
for image in images {
|
||||||
|
// get the raw src attribute of the image tag
|
||||||
|
let src = image.value().attr("src").unwrap();
|
||||||
|
// output the src attribute if it contains "jpg" and "wp-content/uploads/" and "1024x576"
|
||||||
|
if src.contains("jpg") && src.contains("wp-content/uploads/") && src.contains("1024x576") {
|
||||||
|
//println!("{}", src);
|
||||||
|
// split the src attribute into a vector of strings, using the "-" character as the delimiter
|
||||||
|
let tempvec = src.split("-").collect::<Vec<&str>>();
|
||||||
|
// create a new string, and push the first two elements of the vector into it, separated by a "-", and add ".jpg" to the end
|
||||||
|
let mut temp_s = String::new();
|
||||||
|
temp_s.push_str(tempvec[0]);
|
||||||
|
// this keeps the '-' in ".com/wp-content/upl"
|
||||||
|
temp_s.push_str("-");
|
||||||
|
temp_s.push_str(tempvec[1]);
|
||||||
|
temp_s.push_str(".jpg");
|
||||||
|
output = temp_s;
|
||||||
|
//print!("{}", output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn extract_image_date(html: &Html) -> DateTime<Local> {
|
||||||
|
let selector = Selector::parse(r#"span[class="date"]"#).unwrap();
|
||||||
|
let html_dates = html.select(&selector);
|
||||||
|
let mut dates: Vec<String> = Vec::new();
|
||||||
|
for date in html_dates {
|
||||||
|
let date = date.text().collect::<Vec<_>>();
|
||||||
|
dates.push(date[0].to_string())
|
||||||
|
}
|
||||||
|
if dates.len() > 1 {
|
||||||
|
warn!(target: "w10s_webscraper", "{} - More than one date found on page", function_name!());
|
||||||
|
}
|
||||||
|
// date comes out of the html as "2020-01-01", but we need to add the time and timezone to it
|
||||||
|
// so we can parse it into a DateTime object
|
||||||
|
let mut datetime: String = dates[0].to_string();
|
||||||
|
datetime.push_str(" 12:00:00 -0500");
|
||||||
|
let datetime = match DateTime::parse_from_str(&datetime, "%Y-%m-%d %H:%M:%S %z") {
|
||||||
|
Ok(datetime) => datetime.with_timezone(&Local),
|
||||||
|
Err(e) => {
|
||||||
|
error!(
|
||||||
|
target: "w10s_webscraper",
|
||||||
|
"{} - Error parsing date, using local now: {}",
|
||||||
|
function_name!(),
|
||||||
|
e
|
||||||
|
);
|
||||||
|
Local::now()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
datetime
|
||||||
|
}
|
||||||
|
|
||||||
|
#[named]
|
||||||
|
pub fn extract_image_title(html: &Html) -> String {
|
||||||
|
let selector = Selector::parse(r#"title"#).unwrap();
|
||||||
|
let titles = html.select(&selector);
|
||||||
|
let mut output: Vec<String> = Vec::new();
|
||||||
|
for title in titles {
|
||||||
|
let title = title.text().collect::<Vec<_>>();
|
||||||
|
output.push(title[0].to_string())
|
||||||
|
}
|
||||||
|
if output.len() > 1 {
|
||||||
|
warn!(
|
||||||
|
target: "w10s_webscraper",
|
||||||
|
"{} - More than one title found. Using the first one ({})",
|
||||||
|
function_name!(),
|
||||||
|
output[0]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
output[0]
|
||||||
|
.split(" | ")
|
||||||
|
.collect::<Vec<&str>>()
|
||||||
|
.first()
|
||||||
|
.unwrap()
|
||||||
|
.to_string()
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user