1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
library(leaflet) library(htmltools) library(plyr) library(dplyr) library(tidyr) cities <- function(gcities, geocode, len = as.integer(.Machine$integer.max), FUN = function(x) TRUE) { if (!is.data.frame(gcities)) { gcities <- read.csv(gcities, header = TRUE, sep = ";", as.is = TRUE) } geocode <- read.csv(geocode, header = TRUE, sep = ";", as.is = TRUE, na.strings = "null") d <- merge(gcities, geocode, 1:3) d <- d[order(-d$Count), ] d <- d[!is.na(d$Longitude) & FUN(d), ] m <- leaflet() %>% addTiles() dh <- head(d, len) nrow <- nrow(dh) if (nrow == 0) { print("No cities to render", quote = FALSE) return(m) } color <- c("#FF3300", "#FF9900", "#0033FF", "#666666") # Country Region City Unknown location dh$nc <- case_when( nzchar(dh$City) ~ paste0(htmlEscape(dh$City), color[3]), nzchar(dh$Region) ~ paste0(htmlEscape(dh$Region), color[2]), nzchar(dh$Country) ~ paste0(htmlEscape(dh$Country), color[1]), TRUE ~ paste0(htmlEscape("<UNKNOWN LOCATION>"), color[4])) dh <- separate(dh, "nc", c("Name", "Color"), -7) m <- addCircleMarkers(m, lng = dh$Longitude, lat = dh$Lattitude, color = dh$Color, radius = 5 * log(dh$Count, 10), popup = paste(dh$Name, ",", dh$Count), label = dh$Name) m <- addLegend(m, "bottomright", colors = c(circle_marker_to_legend_color(color[3]), circle_marker_to_legend_color(color[2]), circle_marker_to_legend_color(color[1])), labels = c("City", "Region", "Country"), opacity = 0.5) print(paste(nrow, "cities rendered"), quote = FALSE) return(m) } cities_df <- function(statcounter_log_csv, cities_spells_filter_awk = NULL, warn_suspicious = TRUE, type = "page view") { df <- read.csv(`if`(is.null(cities_spells_filter_awk), statcounter_log_csv, pipe(paste("awk -f", cities_spells_filter_awk, `if`(warn_suspicious, "-v warn_suspicious=yes", NULL), statcounter_log_csv))), header = TRUE, sep = ",", quote = "\"", as.is = TRUE) if (!is.null(type)) { df <- df[df$Type == type, ] } return(df) } gcities <- function(cs) { d <- plyr::count(cs, c("Country", "Region", "City")) names(d)[4] <- "Count" return(d[order(-d$Count), ]) } circle_marker_to_legend_color <- function(color, marker_opacity = 0.3, stroke_opacity = 0.7, stroke_width = "medium") { c <- col2rgb(color) cv <- paste("rgba(", c[1], ", ", c[2], ", ", c[3], ", ", sep = "") return(paste(cv, marker_opacity, "); border-radius: 50%; border: ", stroke_width, " solid ", cv, stroke_opacity, "); box-sizing: border-box", sep = "")) } |
group_cities -f cities_spells_fix.awk StatCounter-Log.csv > gcities.csv group_cities -g -f cities_spells_fix.awk StatCounter-Log.csv > geocode.csvA sample script cities_spells_fix.awk can also be found in the statcounter-utils. This is a manually crafted database of cities and regions synonyms, various transcriptions, misspellings, and apparent errors met in Statcounter logs: the script collapses all variants of a single location to a single value. Files gcities.csv and geocode.csv have schemes with headers Country;Region;City;Count and Country;Region;City;Longitude;Lattitude respectively. In lines 15–16 the data get merged by the first 3 fields (Country, Region, and City) and ordered by field Count from gcities. Then, in line 17, cities with wrong geocode data (more specifically, when field Longitude from geocode is null) get filtered out, and the custom subsetting function FUN is applied. Later, on line 21, top len cities from the survived after all the previous filters set are picked. Basic leaflet construction takes place in line 19. In lines 29–43 cities (as well as regions if there is no city in the record, and countries if there is neither a city nor a region in the record) are marked by circle markers and annotated with popups containing the name and the count. The size of a circle marker is a logarithmic function of the count, whereas its color depends on whether the location is a city or a region or a country. In lines 45–50 a legend with color circles is added to hint a user why circle markers have different colors. Putting circles on a legend is not a trivial task. Function circle_marker_to_legend_color in lines 82–92 accomplishes this using the fact that Leaflet legend’s parameter colors is hackable by supplying a specially crafted HTML code. Selecting cities from a Statcounter log and grouping them by count is a trivial task for R. In other words, there is no need to pass preliminary crafted file gcities.csv, but instead, it makes sense to create a data frame inside R. This makes also possible to apply yet more sophisticated subsettings to the original data because now we are getting access to all the fields in the log directly from R. But remember that we have to apply the cities spells database cities_spells_fix.awk. This seems to be the only complication for function cities_df defined in lines 58–73. This function reads a Statcounter log and returns the desired data frame with grouped cities. Its obscure parameters warn_suspicious and type correspond to whether the awk script should print on the stderr suspicious replacements, and what type of visits to select from the Statcounter log: the default value “page view” is what a user normally expects. A data frame returned from cities_df can be further subset by a custom function as it contains all original data fields. Function gcities (lines 75–80) collapses the data frame fields to the scheme with headers Country;Region;City;Count compatible with input of function cities, and orders the data by count. Let’s run a few examples in an R shell. For all of them, we have to load script cities.r and collect all page view visits from a Statcounter log StatCounter-Log.csv.
source("cities.r") pv <- cities_df("StatCounter-Log.csv", "cities_spells_fix.awk") pvC < gcities(pv)Now let’s render all collected cities on a world map.
cities(pvC, "geocode.csv")Here is how it looks in my browser. Seems to be cluttered by myriads of circle markers (function cities printed [1] 1920 cities rendered). No problem! The map is interactive (however, not in this blog) and can be zoomed (look at the buttons at the top-left corner). The legend at the bottom-right corner shows why the markers have different colors. Let’s put on a map cities from the Moscow region.
pvMosk <- pv[grepl("^(Moskva|Moscow)", pv$Region), ] pvMoskC <- gcities(pvMosk) cities(pvMoskC, "geocode.csv")In the next examples I won’t show maps any longer to not clutter the article. Render cities from Russian Federation only.
cities(pvC, "geocode.csv", FUN = function(x) grepl("Russia", x$Country))Render top 10 cities all over the world with total visits from 10 to 20.
cities(pvC, "geocode.csv", 10, FUN = function(x) x$Count %in% 10:20)Render all the cities with visits in year 2018.
pv2018 <- pv[grepl("^2018", pv$Date.and.Time), ] cities(gcities(pv2018), "geocode.csv")Function cities looks good to me, except it seems to make sense to create geocode data frame separately and pass it to the function like gcities data frame. Perhaps I will implement this in the future. Other improvements may also include using library leaflet.extras for searching of marked cities. (Update: both improvements were implemented in the statcounter-utils.) Now let’s turn to the bar charts of cities. I said that I used Gnuplot for that. But R is capable of making them as well! The following solution (which is the rest of cities.r) makes use of ggplot2 and plotly. As such, lines
library(ggplot2) library(plotly)must be put on the top of the script.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
gcities.compound <- function(cs) { d <- plyr::count(cs, c("Country", "Region", "City")) d$City <- paste(d$Country, "/", d$Region, "/", d$City) names(d)[4] <- "Count" return(d[order(-d$Count), c("City", "Count")]) } gcountries <- function(cs) { d <- plyr::count(cs, c("Country")) names(d)[2] <- "Count" return(d[order(-d$Count), ]) } cities.plot <- function(cs, title = NULL, tops = NULL, width = NULL) { w0 <- 1200 wf <- if (is.null(width)) 1 else w0 / width mf <- wf * (max(cs$Count) / 10000) cw <- 21 to <- (cw * nchar(cs$Count) + 300) * mf ym <- cs[1, ][["Count"]] + to[1] * 2 nrow <- nrow(cs) p <- ggplot(cs, aes(reorder(cs[[1]], cs$Count), cs$Count)) + scale_x_discrete(limits = rev(cs[[1]])) + scale_y_continuous(expand = c(0, 50 * mf, 0, 300 * mf), limits = c(0, NA)) + coord_flip() + geom_col(fill = "darkseagreen", alpha = 1.0) + geom_text(aes(label = cs$Count, y = cs$Count + to, alpha = 0.75), size = 3.4) + theme(axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank() ) + labs(title = title, x = NULL, y = NULL) if (is.null(tops)) { p <- p + annotate("rect", xmin = 0.1, xmax = 0.9, ymin = 0, ymax = ym, fill = alpha("green", 0.0)) } else { cur <- 0 ac <- 0.1 for (i in 1:length(tops)) { if (is.na(tops[i]) || tops[i] > nrow) { tops[i] <- nrow } p <- p + annotate("rect", xmin = nrow - tops[i] + 0.5, xmax = nrow - cur + 0.5, ymin = 0, ymax = ym, fill = alpha("green", ac), color = alpha("firebrick1", 0.4), size = 0.4, linetype = "solid") + annotate("text", x = nrow - tops[i] + 1, y = ym - 300 * mf, color = "blue", label = tops[i], size = 3.0, alpha = 0.5) cur <- tops[i] ac <- ac / 2 if (tops[i] == nrow) { break } } } # Cairo limits linear canvas sizes to 32767 pixels! height <- min(25 * nrow, 32600) p <- ggplotly(p, height = height, width = width) p <- config(p, toImageButtonOptions = list(filename = `if`(is.null(title), "cities", gsub("[^[:alnum:]_\\-]", "_", title)), height = height, width = `if`(is.null(width), w0, width), scale = 1)) print(paste(nrow, "cities plotted"), quote = FALSE) return(p) } |
pvMoskCc <- gcities.compound(pvMosk) cities.plot(pvMoskCc, paste("Moscow region", format(Sys.time(), "(%F %R)")), c(10, 40, NA), 1200)I cut out a piece of the image (the white lacuna across its lower part) to conform to the limitations on image sizes in this blog.