Track SEO active pages percentage over time

What are active pages, and why would you want to track them?

An active page is a page that generates at least one SEO visit over a given period. If a page gets at least one visit, it means it is indexed and Google doesn't consider it a useless page. The share of active pages is a good indicator of the SEO health of a website.
To make things even more interesting, we will grab Google Search Console data and compare it to the number of pages submitted in the XML sitemap file.
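Here is the metric in a nutshell, as a minimal sketch with made-up numbers (the two counts are what the rest of this walkthrough computes):

# toy numbers, for illustration only
active_pages  <- 180   # pages with at least one SEO visit over the period
sitemap_pages <- 240   # pages submitted in the XML sitemap

# the percentage we want to track over time
round(100 * active_pages / sitemap_pages, 1)
#> [1] 75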

Step 1: Counting active URLs using Search Console data

(see the article about grabbing Search Console data)

library(searchConsoleR)
library(googleAuthR)
library(lubridate)

scr_auth()

# load the list of websites available in this account
sc_websites <- list_websites()

# and display the list
View(sc_websites)

# pick the one we want
hostname <- "https://www.rforseo.com/"

# we want data between now and two months ago;
# "now" is today minus 3 days, because the most recent
# Search Console data isn't available yet
now <- lubridate::today() - 3
beforedate <- now
month(beforedate) <- month(now) - 2
day(beforedate) <- days_in_month(beforedate)

# we ask for data, split by date and page
gsc_all_queries <- search_analytics(hostname,
                                    beforedate, now,
                                    c("date", "page"), rowLimit = 80000)
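A quick sanity check on the pull (my addition, not part of the original walkthrough) catches authentication or date-range problems before we start aggregating:

# one row per date/page combination is expected
nrow(gsc_all_queries)
range(gsc_all_queries$date)   # should span roughly the last two months
head(gsc_all_queries)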
library(dplyr)

# count URLs with at least one click, per day
gsc_all_queries_clicks <- gsc_all_queries %>%
  filter(clicks != 0) %>%
  group_by(date) %>%
  tally()

colnames(gsc_all_queries_clicks) <- c("date","clicks")

# count URLs with at least one impression, per day
gsc_all_queries_impr <- gsc_all_queries %>%
  filter(impressions != 0) %>%
  group_by(date) %>%
  tally()

colnames(gsc_all_queries_impr) <- c("date","impr")

# merge the two counts; "date" is their only shared column
gsc_all_queries_stats <- merge(gsc_all_queries_clicks, gsc_all_queries_impr)
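Before merging in the sitemap counts, it's worth confirming the shape of the result; this check is my addition:

# expect one row per day, with columns date, clicks and impr
str(gsc_all_queries_stats)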
# grab the URL count scraped from the XML sitemap,
# stored as a CSV in the pixgarden/scrape-automation GitHub repo
urls <- read.csv(url("https://raw.githubusercontent.com/pixgarden/scrape-automation/main/data/xml_url_count.csv"))

# rename columns
colnames(urls) <- c("date","urls")

# turn the date strings into real Date objects
urls$date <- as.Date(urls$date)

# merge with the Google Search Console data;
# because the column names match, merge() doesn't need extra arguments
gsc_all_queries_merged <- merge(gsc_all_queries_stats, urls)
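The merged table now has everything needed for the percentage in the title. The original code goes straight to a stacked chart of counts, but if you want the ratio itself, here is a minimal sketch (my addition, counting a URL as active when it has at least one click):

# share of sitemap URLs with at least one click, per day
active_pct <- round(100 * gsc_all_queries_merged$clicks /
                      gsc_all_queries_merged$urls, 1)
head(data.frame(date = gsc_all_queries_merged$date, active_pct))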
# URLs with impressions but no clicks
gsc_all_queries_merged$impr <- gsc_all_queries_merged$impr - gsc_all_queries_merged$clicks

# URLs with no impressions and no clicks
# (subtract both other buckets from the sitemap total)
gsc_all_queries_merged$urls <- gsc_all_queries_merged$urls -
  gsc_all_queries_merged$impr - gsc_all_queries_merged$clicks

# rename columns
colnames(gsc_all_queries_merged) <- c("date", "url-with-clicks", "url-only-impr", "url-no-impr")
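One caveat worth flagging (my addition): Search Console can report impressions for URLs that aren't in the sitemap, which would make the last bucket negative on some days. A quick check:

# TRUE means more URLs got impressions or clicks than the sitemap contains
any(gsc_all_queries_merged$`url-no-impr` < 0)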
library(tidyr)
library(ggplot2)

# reshape from wide to long: one row per date/bucket pair
test <- gather(gsc_all_queries_merged, urls, count, 2:4)

# esquisse opens a drag-and-drop interface and generates
# the ggplot2 code used below
esquisse::esquisser(test)

# stacked bars: one bar per day, split into the three URL buckets
ggplot(test) +
  aes(x = date, fill = urls, weight = count) +
  geom_bar() +
  scale_fill_hue() +
  theme_minimal()
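If you want to keep the chart (for a weekly report, say), ggplot2's ggsave() writes the last plot to disk; the filename and dimensions here are just examples:

# save the most recent plot as a PNG (sizes are in inches)
ggsave("active-pages-over-time.png", width = 9, height = 5)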