diff --git a/NEWS.md b/NEWS.md
index 4b407de..c9ad7a6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # ragnar (development version)
 
+- `ragnar_find_links()` can now parse `sitemap.xml` files. It also gains a
+  `validate` argument, which sends a `HEAD` request to each link and filters
+  out broken links (#83).
+
 # ragnar 0.2.0
 
 * `ragnar_store_create()` gains a new argument: `version`, with default `2`.
diff --git a/R/read-html.R b/R/read-html.R
index db9598c..3a41312 100644
--- a/R/read-html.R
+++ b/R/read-html.R
@@ -351,6 +351,10 @@ ragnar_read_document <- function(
 #'   subset them to return a smaller list. This can be useful for filtering out
 #'   URL's by rules different them `children_only` which only checks the prefix.
 #'
+#' @param validate Default is `FALSE`. If `TRUE`, sends a `HEAD` request for
+#'   each link and removes those that are not accessible. Requests are sent in
+#'   parallel using [httr2::req_perform_parallel()].
+#'
 #' @return A character vector of links on the page.
 #' @export
 #'
@@ -373,7 +377,8 @@ ragnar_find_links <- function(
   children_only = TRUE,
   progress = TRUE,
   ...,
-  url_filter = identity
+  url_filter = identity,
+  validate = FALSE
 ) {
   rlang::check_dots_empty()
 
@@ -385,7 +390,10 @@ ragnar_find_links <- function(
   depth <- as.integer(depth)
 
   prefix <- if (isTRUE(children_only)) {
-    url_normalize_stem(xml_url2(x))
+    url <- xml_url2(x)
+    # Sitemaps are special-cased, so we look at the actual root URL.
+    url <- gsub("sitemap\\.xml$", "", url)
+    url_normalize_stem(url)
   } else if (is.character(children_only)) {
     check_string(children_only)
     children_only
@@ -477,6 +485,24 @@ ragnar_find_links <- function(
   out <- out[nzchar(out)]
   out <- unique(sort(url_filter_fn(out)))
 
+  # Validate that we can access all the URLs
+  if (validate) {
+    resps <- out |>
+      lapply(\(url) url |> httr2::request() |> httr2::req_method("HEAD")) |>
+      httr2::req_perform_parallel(on_error = "continue")
+
+    is_ok <- resps |>
+      map_lgl(\(x) inherits(x, "httr2_response"))
+
+    errors <- map2(out[!is_ok], resps[!is_ok], function(url, err) {
+      list(url = url, err = conditionMessage(err))
+    })
+
+    problems <- c(problems, errors)
+
+    out <- out[is_ok]
+  }
+
   if (length(problems)) {
     cli::cli_warn(
       "Some links could not be followed. Call {.code attr(.Last.value, 'problems')} to see the issues."
@@ -496,9 +522,16 @@ html_find_links <- function(x, absolute = TRUE) {
     x <- read_html2(x)
   }
 
-  links <- x |>
-    xml_find_all(".//a[@href]") |>
-    xml_attr("href", default = "")
+  links <- if (is_sitemap(x)) {
+    x |>
+      xml2::xml_ns_strip() |>
+      xml2::xml_find_all("//urlset/url/loc", flatten = TRUE) |>
+      xml2::xml_text()
+  } else {
+    x |>
+      xml_find_all(".//a[@href]") |>
+      xml_attr("href", default = "")
+  }
 
   # Canonicalize links
   links <- stri_extract_first_regex(links, "^[^#]*") # strip section links
@@ -513,6 +546,13 @@ html_find_links <- function(x, absolute = TRUE) {
   links
 }
 
+is_sitemap <- function(x) {
+  has_sitemap <- x |>
+    xml2::xml_ns_strip() |>
+    xml2::xml_find_first("//sitemapindex | //urlset")
+  length(has_sitemap) > 0
+}
+
 url_host <- function(x, baseurl = NULL) {
   map_chr(x, \(url) {
     # tryCatch to guard against error from, e.g., "mailto:copilot-safety@github.com"
diff --git a/man/ragnar_find_links.Rd b/man/ragnar_find_links.Rd
index 69af68c..d6fb638 100644
--- a/man/ragnar_find_links.Rd
+++ b/man/ragnar_find_links.Rd
@@ -10,7 +10,8 @@ ragnar_find_links(
   children_only = TRUE,
   progress = TRUE,
   ...,
-  url_filter = identity
+  url_filter = identity,
+  validate = FALSE
 )
 }
 \arguments{
@@ -33,6 +34,10 @@ when \code{depth > 0}.}
 \item{url_filter}{A function that takes a character vector of URL's and may
 subset them to return a smaller list. This can be useful for filtering out
 URL's by rules different them \code{children_only} which only checks the prefix.}
+
+\item{validate}{Default is \code{FALSE}. If \code{TRUE}, sends a \code{HEAD} request for
+each link and removes those that are not accessible. Requests are sent in
+parallel using \code{\link[httr2:req_perform_parallel]{httr2::req_perform_parallel()}}.}
 }
 \value{
 A character vector of links on the page.
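
A minimal usage sketch of the new behavior introduced by this change. The sitemap URL below is a placeholder, and the `problems` attribute is the one referenced by the warning emitted by `ragnar_find_links()`:

```r
library(ragnar)

# Read links from a sitemap; with validate = TRUE, a HEAD request is sent to
# each link in parallel (via httr2) and unreachable links are dropped.
# "https://example.com/sitemap.xml" is a hypothetical URL.
links <- ragnar_find_links(
  "https://example.com/sitemap.xml",
  validate = TRUE
)

# Links that could not be followed or validated are reported here.
attr(links, "problems")
```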