
find links improvements #83

Merged: 3 commits, Jul 31, 2025
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,5 +1,9 @@
# ragnar (development version)

- `ragnar_find_links()` can now parse `sitemap.xml` files. It also gains a
  `validate` argument, which sends a `HEAD` request to each link and filters
  out broken links (#83).

# ragnar 0.2.0

* `ragnar_store_create()` gains a new argument: `version`, with default `2`.
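A short sketch of the two new features together (the URL is hypothetical, and `validate = TRUE` requires network access):

    library(ragnar)

    # Crawl a sitemap.xml; the "sitemap.xml" suffix is stripped when computing
    # the children_only prefix, so links are kept if they fall under the site root.
    links <- ragnar_find_links(
      "https://example.com/sitemap.xml",
      validate = TRUE # HEAD-request each link and drop the broken ones
    )

    # Links that failed validation are reported via the 'problems' attribute:
    attr(links, "problems")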
50 changes: 45 additions & 5 deletions R/read-html.R
@@ -351,6 +351,10 @@ ragnar_read_document <- function(
#' subset them to return a smaller list. This can be useful for filtering out
#' URLs by rules other than `children_only`, which only checks the prefix.
#'
#' @param validate Default is `FALSE`. If `TRUE`, a `HEAD` request is sent for each
#' link and links that are not accessible are removed. Requests are sent in parallel
#' using [httr2::req_perform_parallel()].
#'
#' @return A character vector of links on the page.
#' @export
#'
@@ -373,7 +377,8 @@ ragnar_find_links <- function(
  children_only = TRUE,
  progress = TRUE,
  ...,
  url_filter = identity
  url_filter = identity,
  validate = FALSE
) {
  rlang::check_dots_empty()

@@ -385,7 +390,10 @@
  depth <- as.integer(depth)

  prefix <- if (isTRUE(children_only)) {
    url_normalize_stem(xml_url2(x))
    url <- xml_url2(x)
    # sitemaps are special-cased, so we look at the actual root URL
    url <- gsub("sitemap\\.xml$", "", url)
    url_normalize_stem(url)
  } else if (is.character(children_only)) {
    check_string(children_only)
    children_only
@@ -477,6 +485,24 @@
  out <- out[nzchar(out)]
  out <- unique(sort(url_filter_fn(out)))

  # Validate that we can access all the URLs
  if (validate) {
    resps <- out |>
      lapply(\(url) url |> httr2::request() |> httr2::req_method("HEAD")) |>
      httr2::req_perform_parallel(on_error = "continue")

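    # With on_error = "continue", a failed request comes back as an error
    # condition rather than an httr2 response, so the class identifies successes.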
    is_ok <- resps |>
      map_lgl(\(x) inherits(x, "httr2_response"))

    errors <- map2(out[!is_ok], resps[!is_ok], function(url, err) {
      list(url = url, err = conditionMessage(err))
    })

    problems <- c(problems, errors)

    out <- out[is_ok]
  }

  if (length(problems)) {
    cli::cli_warn(
      "Some links could not be followed. Call {.code attr(.Last.value, 'problems')} to see the issues."
@@ -496,9 +522,16 @@ html_find_links <- function(x, absolute = TRUE) {
    x <- read_html2(x)
  }

  links <- x |>
    xml_find_all(".//a[@href]") |>
    xml_attr("href", default = "")
  links <- if (is_sitemap(x)) {
    x |>
      xml2::xml_ns_strip() |>
      xml2::xml_find_all("//urlset/url/loc", flatten = TRUE) |>
      xml2::xml_text()
  } else {
    x |>
      xml_find_all(".//a[@href]") |>
      xml_attr("href", default = "")
  }

  # Canonicalize links
  links <- stri_extract_first_regex(links, "^[^#]*") # strip section links
@@ -513,6 +546,13 @@
  links
}

is_sitemap <- function(x) {
  has_sitemap <- x |>
    xml2::xml_ns_strip() |>
    xml2::xml_find_first("//sitemapindex | //urlset")
  length(has_sitemap) > 0
}

url_host <- function(x, baseurl = NULL) {
  map_chr(x, \(url) {
    # tryCatch to guard against error from, e.g., "mailto:copilot-safety@github.com"
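As a minimal sketch of what the new sitemap branch extracts, assuming only that `xml2` is installed (the `urlset` document below is made up):

    library(xml2)

    sitemap <- read_xml('
      <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/page-1</loc></url>
        <url><loc>https://example.com/page-2</loc></url>
      </urlset>')

    # After stripping namespaces, the same XPath used in html_find_links()
    # returns each <loc> entry:
    sitemap |>
      xml_ns_strip() |>
      xml_find_all("//urlset/url/loc") |>
      xml_text()
    #> [1] "https://example.com/page-1" "https://example.com/page-2"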
7 changes: 6 additions & 1 deletion man/ragnar_find_links.Rd

Some generated files are not rendered by default.