From d4ede50ad549986626e50579111a4a9f1794fb3f Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Nov 2024 22:36:09 +0100 Subject: [PATCH 1/2] Add support for custom headers when checking the initial inputs --- examples/collect_links/collect_links.rs | 3 ++ lychee-bin/src/options.rs | 13 +++++- lychee-lib/src/collector.rs | 28 ++++++++++-- lychee-lib/src/types/input.rs | 59 +++++++++++++++---------- 4 files changed, 74 insertions(+), 29 deletions(-) diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs index 4a86924c56..d3294880d9 100644 --- a/examples/collect_links/collect_links.rs +++ b/examples/collect_links/collect_links.rs @@ -1,3 +1,4 @@ +use http::HeaderMap; use lychee_lib::{Collector, Input, InputSource, Result}; use reqwest::Url; use std::path::PathBuf; @@ -13,11 +14,13 @@ async fn main() -> Result<()> { )), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }, Input { source: InputSource::FsPath(PathBuf::from("fixtures/TEST.md")), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }, ]; diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 60c259d2ea..0095592b6c 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,5 +1,5 @@ use crate::archive::Archive; -use crate::parse::parse_base; +use crate::parse::{parse_base, parse_headers}; use crate::verbosity::Verbosity; use anyhow::{anyhow, Context, Error, Result}; use clap::builder::PossibleValuesParser; @@ -195,9 +195,18 @@ impl LycheeOptions { } else { Some(self.config.exclude_path.clone()) }; + let headers = parse_headers(&self.config.header)?; self.raw_inputs .iter() - .map(|s| Input::new(s, None, self.config.glob_ignore_case, excluded.clone())) + .map(|s| { + Input::new( + s, + None, + self.config.glob_ignore_case, + excluded.clone(), + headers.clone(), + ) + }) .collect::>() .context("Cannot parse inputs from arguments") } diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 955bdd24e7..e55fec22b0 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -136,7 +136,7 @@ impl Collector { mod tests { use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write}; - use http::StatusCode; + use http::{HeaderMap, StatusCode}; use reqwest::Url; use super::*; @@ -173,7 +173,13 @@ mod tests { // Treat as plaintext file (no extension) let file_path = temp_dir.path().join("README"); let _file = File::create(&file_path).unwrap(); - let input = Input::new(&file_path.as_path().display().to_string(), None, true, None)?; + let input = Input::new( + &file_path.as_path().display().to_string(), + None, + true, + None, + HeaderMap::new(), + )?; let contents: Vec<_> = input .get_contents(true, true, true) .collect::>() @@ -186,7 +192,7 @@ mod tests { #[tokio::test] async fn test_url_without_extension_is_html() -> Result<()> { - let input = Input::new("https://example.com/", None, true, None)?; + let input = Input::new("https://example.com/", None, true, None, HeaderMap::new())?; let contents: Vec<_> = input .get_contents(true, true, true) .collect::>() @@ -221,6 +227,7 @@ mod tests { source: InputSource::String(TEST_STRING.to_owned()), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }, Input { source: InputSource::RemoteUrl(Box::new( @@ -230,11 +237,13 @@ mod tests { )), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }, Input { source: InputSource::FsPath(file_path), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }, Input { source: InputSource::FsGlob { @@ -243,6 +252,7 @@ mod tests { }, file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }, ]; @@ -267,7 +277,8 @@ mod tests { let input = Input { source: InputSource::String("This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)".to_string()), file_type_hint: Some(FileType::Markdown), - excluded_paths: None, + excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], Some(base)).await; @@ -294,6 +305,7 @@ mod tests { ), file_type_hint: Some(FileType::Html), excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], Some(base)).await; @@ -323,6 +335,7 @@ mod tests { ), file_type_hint: Some(FileType::Html), excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], Some(base)).await; @@ -349,6 +362,7 @@ mod tests { ), file_type_hint: Some(FileType::Markdown), excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], Some(base)).await; @@ -372,6 +386,7 @@ mod tests { source: InputSource::String(input), file_type_hint: Some(FileType::Html), excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], Some(base)).await; @@ -404,6 +419,7 @@ mod tests { source: InputSource::RemoteUrl(Box::new(server_uri.clone())), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], None).await; @@ -424,6 +440,7 @@ mod tests { ), file_type_hint: None, excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], None).await; @@ -454,6 +471,7 @@ mod tests { )), file_type_hint: Some(FileType::Html), excluded_paths: None, + headers: HeaderMap::new(), }, Input { source: InputSource::RemoteUrl(Box::new( @@ -465,6 +483,7 @@ mod tests { )), file_type_hint: Some(FileType::Html), excluded_paths: None, + headers: HeaderMap::new(), }, ]; @@ -500,6 +519,7 @@ mod tests { ), file_type_hint: Some(FileType::Html), excluded_paths: None, + headers: HeaderMap::new(), }; let links = collect(vec![input], Some(base)).await; diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index c32be7feb8..21812d2f96 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -3,6 +3,7 @@ use crate::{utils, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; +use http::HeaderMap; use ignore::WalkBuilder; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -105,7 +106,7 @@ impl Display for InputSource { } /// Lychee Input with optional file hint for parsing -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct Input { /// Origin of input pub source: InputSource, @@ -113,6 +114,8 @@ pub struct Input { pub file_type_hint: Option, /// Excluded paths that will be skipped when reading content pub excluded_paths: Option>, + /// Custom headers to be used when fetching remote URLs + pub headers: reqwest::header::HeaderMap, } impl Input { @@ -129,6 +132,7 @@ impl Input { file_type_hint: Option, glob_ignore_case: bool, excluded_paths: Option>, + headers: reqwest::header::HeaderMap, ) -> Result { let source = if value == STDIN { InputSource::Stdin @@ -194,9 +198,20 @@ impl Input { source, file_type_hint, excluded_paths, + headers, }) } + /// Convenience constructor with sane defaults + /// + /// # Errors + /// + /// Returns an error if the input does not exist (i.e. invalid path) + /// and the input cannot be parsed as a URL. + pub fn from_value(value: &str) -> Result { + Self::new(value, None, false, None, HeaderMap::new()) + } + /// Retrieve the contents from the input /// /// # Errors @@ -424,6 +439,8 @@ fn is_excluded_path(excluded_paths: &[PathBuf], path: &PathBuf) -> bool { #[cfg(test)] mod tests { + use http::HeaderMap; + use super::*; #[test] @@ -434,14 +451,15 @@ mod tests { assert!(path.exists()); assert!(path.is_relative()); - let input = Input::new(test_file, None, false, None); + let input = Input::new(test_file, None, false, None, HeaderMap::new()); assert!(input.is_ok()); assert!(matches!( input, Ok(Input { source: InputSource::FsPath(PathBuf { .. }), file_type_hint: None, - excluded_paths: None + excluded_paths: None, + headers: _, }) )); } @@ -454,7 +472,7 @@ mod tests { assert!(!path.exists()); assert!(path.is_relative()); - let input = Input::new(test_file, None, false, None); + let input = Input::from_value(test_file); assert!(input.is_err()); assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. })))); } @@ -497,7 +515,7 @@ mod tests { #[test] fn test_url_without_scheme() { - let input = Input::new("example.com", None, false, None); + let input = Input::from_value("example.com"); assert_eq!( input.unwrap().source.to_string(), String::from("http://example.com/") @@ -508,7 +526,7 @@ mod tests { #[cfg(windows)] #[test] fn test_windows_style_filepath_not_existing() { - let input = Input::new("C:\\example\\project\\here", None, false, None); + let input = Input::from_value("C:\\example\\project\\here"); assert!(input.is_err()); let input = input.unwrap_err(); @@ -528,7 +546,7 @@ mod tests { let dir = temp_dir(); let file = NamedTempFile::new_in(dir).unwrap(); let path = file.path(); - let input = Input::new(path.to_str().unwrap(), None, false, None).unwrap(); + let input = Input::from_value(path.to_str().unwrap()).unwrap(); match input.source { InputSource::FsPath(_) => (), @@ -540,33 +558,28 @@ mod tests { fn test_url_scheme_check_succeeding() { // Valid http and https URLs assert!(matches!( - Input::new("http://example.com", None, false, None), + Input::from_value("http://example.com"), Ok(Input { source: InputSource::RemoteUrl(_), .. }) )); assert!(matches!( - Input::new("https://example.com", None, false, None), + Input::from_value("https://example.com"), Ok(Input { source: InputSource::RemoteUrl(_), .. }) )); assert!(matches!( - Input::new( - "http://subdomain.example.com/path?query=value", - None, - false, - None - ), + Input::from_value("http://subdomain.example.com/path?query=value",), Ok(Input { source: InputSource::RemoteUrl(_), .. }) )); assert!(matches!( - Input::new("https://example.com:8080", None, false, None), + Input::from_value("https://example.com:8080"), Ok(Input { source: InputSource::RemoteUrl(_), .. @@ -578,19 +591,19 @@ mod tests { fn test_url_scheme_check_failing() { // Invalid schemes assert!(matches!( - Input::new("ftp://example.com", None, false, None), + Input::from_value("ftp://example.com"), Err(ErrorKind::InvalidFile(_)) )); assert!(matches!( - Input::new("httpx://example.com", None, false, None), + Input::from_value("httpx://example.com"), Err(ErrorKind::InvalidFile(_)) )); assert!(matches!( - Input::new("file:///path/to/file", None, false, None), + Input::from_value("file:///path/to/file"), Err(ErrorKind::InvalidFile(_)) )); assert!(matches!( - Input::new("mailto:user@example.com", None, false, None), + Input::from_value("mailto:user@example.com"), Err(ErrorKind::InvalidFile(_)) )); } @@ -599,11 +612,11 @@ mod tests { fn test_non_url_inputs() { // Non-URL inputs assert!(matches!( - Input::new("./local/path", None, false, None), + Input::from_value("./local/path"), Err(ErrorKind::InvalidFile(_)) )); assert!(matches!( - Input::new("*.md", None, false, None), + Input::from_value("*.md"), Ok(Input { source: InputSource::FsGlob { .. }, .. @@ -611,7 +624,7 @@ mod tests { )); // Assuming the current directory exists assert!(matches!( - Input::new(".", None, false, None), + Input::from_value("."), Ok(Input { source: InputSource::FsPath(_), .. From cd2833bb7de1a617ed0284e8a508d2df71742a9e Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 8 Nov 2024 05:26:00 +0100 Subject: [PATCH 2/2] wip --- Cargo.lock | 11 +++++++++++ lychee-bin/Cargo.toml | 1 + lychee-bin/src/options.rs | 15 +++++++++------ lychee-bin/src/parse.rs | 26 +++++++++++++++++++------- 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9293c9adb7..013f4c01df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1909,6 +1909,16 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-serde" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f056c8559e3757392c8d091e796416e4649d8e49e88b8d76df6c002f05027fd" +dependencies = [ + "http 1.1.0", + "serde", +] + [[package]] name = "httparse" version = "1.8.0" @@ -2543,6 +2553,7 @@ dependencies = [ "futures", "headers", "http 1.1.0", + "http-serde", "humantime", "humantime-serde", "indicatif", diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 0b0ce8709e..6d54723bd4 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -55,6 +55,7 @@ tokio = { version = "1.41.0", features = ["full"] } tokio-stream = "0.1.16" toml = "0.8.19" url = "2.5.2" +http-serde = "2.1.1" [dev-dependencies] assert_cmd = "2.0.16" diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 0095592b6c..845e606a2a 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -5,6 +5,7 @@ use anyhow::{anyhow, Context, Error, Result}; use clap::builder::PossibleValuesParser; use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; +use http::HeaderMap; use lychee_lib::{ Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, @@ -195,7 +196,6 @@ impl LycheeOptions { } else { Some(self.config.exclude_path.clone()) }; - let headers = parse_headers(&self.config.header)?; self.raw_inputs .iter() .map(|s| { @@ -204,7 +204,7 @@ impl LycheeOptions { None, self.config.glob_ignore_case, excluded.clone(), - headers.clone(), + self.config.header.clone(), ) }) .collect::>() @@ -399,10 +399,13 @@ Example: --fallback-extensions html,htm,php,asp,aspx,jsp,cgi" )] pub(crate) fallback_extensions: Vec, - /// Custom request header - #[arg(long)] - #[serde(default)] - pub(crate) header: Vec, + /// Set custom header for requests + #[clap( + long = "header", + value_parser = parse_header, + number_of_values = 1 + )] + pub header: Vec, /// A List of accepted status codes for valid links #[arg( diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index b5a4cca732..699a2d92cb 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -1,5 +1,6 @@ use anyhow::{anyhow, Context, Result}; use headers::{HeaderMap, HeaderName}; +use http::HeaderValue; use lychee_lib::{remap::Remaps, Base}; use std::time::Duration; @@ -20,14 +21,25 @@ pub(crate) const fn parse_duration_secs(secs: usize) -> Duration { Duration::from_secs(secs as u64) } -/// Parse HTTP headers into a `HeaderMap` -pub(crate) fn parse_headers>(headers: &[T]) -> Result { - let mut out = HeaderMap::new(); - for header in headers { - let (key, val) = read_header(header.as_ref())?; - out.insert(HeaderName::from_bytes(key.as_bytes())?, val.parse()?); +/// Parse a header given in a string format into a `HeaderMap` +/// +/// Headers are expected to be in format "key:value". +fn parse_header(header: &str) -> Result { + let header: Vec<&str> = header.split(':').collect(); + if header.len() != 2 { + return Err("Wrong header format (see --help for format)".to_string()); } - Ok(out) + + let (header_name, header_value) = (header[0], header[1]); + + let hn = HeaderName::from_lowercase(header_name.trim().to_lowercase().as_bytes()) + .map_err(|e| e.to_string())?; + + let hv = HeaderValue::from_str(header_value.trim()).map_err(|e| e.to_string())?; + + let mut map = HeaderMap::new(); + map.insert(hn, hv); + Ok(map) } /// Parse URI remaps