# URLs categorization ----
#
# Labels every row of INDEX with a category, stored in INDEX$UrlCat, according
# to which regular expression matches INDEX$Url (case-insensitive).
# INDEX is not created in this snippet; it is assumed to be a data frame with
# a `Url` column -- confirm against the code that builds it.

# default category, kept for URLs that match none of the patterns below
INDEX$UrlCat <- "Not match"

# create category name
category_name <- c("Category", "Dates", "Page Auteur", "Page d'accueil")

# create category regex, must be the same length and in the same order as
# category_name (pattern i labels its matches with category_name[i])
# NOTE(review): backslashes are doubled because "\." and "\/" are not valid
# escapes in an R string. As written, the last pattern requires one arbitrary
# character between "example.com" and "/"; confirm that this is the intended
# home-page pattern.
category_regex <- c("category", "2019", "author", "example\\.com.\\/")

# fail early if the two vectors get out of sync
stopifnot(length(category_name) == length(category_regex))

# categorize: patterns are applied in order, so when a URL matches several
# patterns the last matching one determines its category
for (i in seq_along(category_name)) {
  cat(".") # progress indicator, one dot per pattern
  is_match <- grepl(category_regex[i], INDEX$Url, ignore.case = TRUE)
  INDEX$UrlCat <- ifelse(is_match, category_name[i], INDEX$UrlCat)
}

# debug
View(INDEX)