An active page is a page that generates at least one SEO visit over a given period. If a page has at least one visit, it means that it is indexed and that Google doesn't consider it a useless page. The number of active pages is a good indicator of the SEO health of a website.
To make things even more interesting, we will grab Google Search Console data and compare it to the number of pages submitted in the XML sitemap file.
(See the article about grabbing Search Console data.)
# Authenticate against Google Search Console and download, for every day of
# the reporting window, the list of pages that showed up in search results.
library(searchConsoleR)
library(googleAuthR)
library(lubridate)

# Opens the authentication flow for the Search Console API.
scr_auth()

# Load the websites available to the authenticated account ...
sc_websites <- list_websites()
# ... and display the list (interactive viewer).
View(sc_websites)

# Pick the one to analyse.
hostname <- "https://www.rforseo.com/"

# We want data between "now" and roughly two months ago.
# "now" is three days before today.
# NOTE(review): presumably because Search Console data lags a few days - confirm.
now <- lubridate::today() - 3

# Start of the window: the last day of the month two months before `now`.
# rollback() moves a date to the last day of the previous month, so applying
# it twice gives that date directly. The earlier approach (set the month, then
# set the day) never created `beforedate` before assigning into it, and is
# fragile when the day of `now` does not exist in the target month.
beforedate <- lubridate::rollback(lubridate::rollback(now))

# Ask for one row per (date, page) pair.
gsc_all_queries <- search_analytics(
  hostname,
  beforedate,
  now,
  dimensions = c("date", "page"),
  rowLimit = 80000
)
# Build one row per day with two counts: the number of URLs that received at
# least one click, and the number of URLs that received at least one impression.
library(dplyr)

# Number of URLs with clicks, per day.
gsc_all_queries_clicks <- gsc_all_queries %>%
  filter(clicks != 0) %>%
  count(date, name = "clicks")

# Number of URLs with impressions, per day.
gsc_all_queries_impr <- gsc_all_queries %>%
  filter(impressions != 0) %>%
  count(date, name = "impr")

# Merge the two counts on the date. A full join (all = TRUE) keeps days that
# appear in only one of the tables - typically days with impressions but no
# click at all - which a default inner merge would silently drop.
gsc_all_queries_stats <- merge(
  gsc_all_queries_clicks,
  gsc_all_queries_impr,
  by = "date",
  all = TRUE
)

# A day missing from one side simply had zero URLs of that kind.
gsc_all_queries_stats$clicks[is.na(gsc_all_queries_stats$clicks)] <- 0L
gsc_all_queries_stats$impr[is.na(gsc_all_queries_stats$impr)] <- 0L
# Fetch the daily count of URLs found in the XML sitemap (a CSV file published
# on GitHub) and attach it to the Search Console statistics.
xml_url_count_csv <- paste0(
  "https://raw.githubusercontent.com/pixgarden/scrape-automation/",
  "main/data/xml_url_count.csv"
)
urls <- read.csv(url(xml_url_count_csv))

# Rename columns.
# NOTE(review): assumes the CSV has exactly two columns, date then count.
colnames(urls) <- c("date", "urls")

# Transform the date strings into real dates so they match the Search Console
# `date` column.
urls$date <- as.Date(urls$date)

# Merge with the Search Console data. Only days present in both tables are
# kept. The key is spelled out so the join cannot silently change if the two
# tables ever share another column name.
gsc_all_queries_merged <- merge(gsc_all_queries_stats, urls, by = "date")
# Split the sitemap URL count of each day into three mutually exclusive
# buckets: URLs with clicks, URLs with impressions only, and URLs with neither.
# Expects the columns date, clicks, impr, urls (in that order, because the
# final renaming is positional).

# Keep the raw counts aside: both subtractions below must use the ORIGINAL
# number of URLs with impressions. Overwriting `impr` first and then
# subtracting it from `urls` would over-count the "no impression" bucket by
# the number of clicked URLs.
urls_with_clicks <- gsc_all_queries_merged$clicks
urls_with_impr <- gsc_all_queries_merged$impr
urls_in_sitemap <- gsc_all_queries_merged$urls

# URLs with impressions but no click.
gsc_all_queries_merged$impr <- urls_with_impr - urls_with_clicks

# URLs with no impression and no click.
# NOTE(review): this goes negative if more URLs got impressions than are
# listed in the sitemap - confirm whether that can happen for this site.
gsc_all_queries_merged$urls <- urls_in_sitemap - urls_with_impr

# Rename columns (these names become the legend labels of the chart).
colnames(gsc_all_queries_merged) <- c(
  "date",
  "url-with-clics",
  "url-only-impr",
  "url-no-impr"
)
# Reshape the three URL buckets to long format and draw them as a stacked bar
# chart, one bar per day.
library(tidyr)
library(ggplot2)

# One row per (date, bucket): `urls` holds the bucket name, `count` its size.
# Columns 2 to 4 are the three bucket columns.
test <- pivot_longer(
  gsc_all_queries_merged,
  cols = 2:4,
  names_to = "urls",
  values_to = "count"
)

# Optional: explore the data in the esquisse GUI, which can generate the
# ggplot code below. Only launched in an interactive session, because the GUI
# blocks the script until it is closed.
if (interactive() && requireNamespace("esquisse", quietly = TRUE)) {
  esquisse::esquisser(test)
}

# Stacked bars: each row contributes `count` to the bar of its date, and the
# fill colour identifies the bucket.
ggplot(test) +
  aes(x = date, fill = urls, weight = count) +
  geom_bar() +
  scale_fill_hue() +
  theme_minimal()