If not otherwise indicated, you can run each command below in R by creating these objects …
# Example objects used by the commands in this cheat sheet.
# No seed is set, so the sampled values differ from run to run.

# Packages used throughout; `package` is one of them, picked at random.
packages <- c(
  "tidyverse", "Rtsne", "igraph", "tidytext", "sf", "topicmodels",
  "knitr"
)
package <- sample(packages, 1)

# Two numeric vectors: 100 distinct integers each, drawn from 1..1000.
numbers <- sample(1:1000, 100)
other_numbers <- sample(1:1000, 100)

# Two factors of length 100. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
factors <- factor(sample(c("red", "blue"), 100, replace = TRUE))
other_factors <- factor(sample(c("green", "yellow"), 100, replace = TRUE))

# Rectangular containers built from the vectors above.
dataframe <- data.frame(numbers, other_numbers, factors, other_factors)
matrix <- cbind(numbers, other_numbers)

# Network inputs: a random 10 x 10 0/1 adjacency matrix and a 50 x 2 edge
# list over vertex ids 1..10. The calls to matrix() below still reach
# base::matrix: when resolving a function call, R skips objects that are
# not functions, such as the `matrix` object defined above.
adjmatrix <- matrix(rbinom(100, 1, prob = .5), ncol = 10)
edglist <- matrix(sample(1:10, 100, replace = TRUE), ncol = 2)
and installing/loading these packages first …
# Install every package named in `packages` (needs a CRAN mirror and
# network access).
install.packages(packages)
# Attach each package. library() stops with an error when a package is
# missing, whereas require() only returns FALSE with a warning and lets
# later commands fail in less obvious ways. invisible() keeps lapply()'s
# result list from being printed.
invisible(lapply(packages, library, character.only = TRUE))
Launching RStudio
Installing packages
install.packages(packages)
Loading packages
library(package, character.only = TRUE) or
require(package, character.only = TRUE)
Running simple calculations
1 + 2 and
2 - 3 and
3 * 4 and
4 / 5
Creating objects
hello <- "hi" or hello = "hi"
Using functions
function_name(arguments) [don’t run]
package::function_name(arguments) [don’t run]
Loading data
load(file) [don’t run]
read.csv(file) [don’t run]
Making sense of data (vectors, matrices, and data frames)
Identifying type of R object
class(dataframe)
Identifying length of vector
length(numbers)
Identifying number of rows (observations)
nrow(dataframe)
Identifying number of columns
ncol(matrix)
Identifying number of rows and columns
dim(matrix)
Getting the first part of a matrix-like (rectangular) object
head(dataframe)
Getting the column names of a matrix-like (rectangular) object
colnames(dataframe)
Accessing variables inside dataframes
dataframe$numbers
Computing means
mean(numbers)
Creating a ggplot with ggplot2
Using aesthetic mappings
ggplot2::ggplot(dataframe, aes(x = numbers, y = factors))
Choosing and using different geoms
Using geom_density
ggplot(dataframe, aes(numbers)) + ggplot2::geom_density()
Using geom_histogram
ggplot(dataframe, aes(numbers)) + geom_histogram()
Using geom_bar
ggplot(dataframe, aes(x = factors)) + geom_bar()
Using geom_point
ggplot(dataframe, aes(x = numbers, y = other_numbers)) + geom_point()
Using geom_smooth
ggplot(dataframe, aes(x = numbers, y = other_numbers)) + geom_smooth()
Using geom_boxplot
ggplot(dataframe, aes(x = factors, y = numbers)) + geom_boxplot()
Using geom_line
ggplot(dataframe, aes(x = numbers, y = other_numbers)) + geom_line()
Using scales
ggplot(dataframe, aes(x = numbers, y = other_numbers)) + scale_y_continuous()
Using facets
ggplot(dataframe, aes(x = numbers, y = other_numbers)) + facet_grid()
Using pipes (%>%)
dataframe %>% dplyr::mutate(division = numbers / other_numbers)
Manipulating dataframes with dplyr
Filtering observations
dplyr::filter(dataframe, numbers > 50, factors == "red")
Selecting variables
dplyr::select(dataframe, numbers, factors)
Creating new variables
dplyr::mutate(dataframe, division = numbers / other_numbers)
Grouping observations
dplyr::group_by(dataframe, factors)
Summarising variables
dplyr::summarise(dataframe, n = n())
Row-wise operations
dataframe %>% dplyr::rowwise() %>% dplyr::mutate(m = mean(c(numbers, other_numbers)))
Using relational operators
3 == 3 and
TRUE != FALSE and
3 < 4 and
4 > 3
Creating variables with ifelse
ifelse(factors == "red", 1, 0)
Subsetting variables
numbers[numbers > 50]
Subsetting data frames
dataframe[1,] or
dataframe[,1]
Reporting with R Markdown
knitr::kable(dataframe, caption = "My caption")
knitr::kable(dataframe, caption = "My caption")
Performing principal component analysis
prcomp(matrix, scale = TRUE)
Performing t-SNE analysis
Rtsne::Rtsne(matrix, perplexity = 10)
Creating frequency tables
table(factors)
Creating tables of proportions
prop.table(table(factors))
Understanding missing values
is.na(factors) and
is.null(numbers)
Handling missing values
table(factors, exclude = NULL) and
na.omit(factors)
Creating two-way frequency tables
table(factors, other_factors)
Creating two-way tables of proportions
prop.table(table(factors, other_factors))
Computing descriptive statistics
Mean
mean(numbers)
Median
median(numbers)
Standard deviation
sd(numbers)
Variance
var(numbers)
Computing correlations
cor(numbers, other_numbers)
Taking a random sample
sample(numbers, size = 10)
Taking a random sample from a normal distribution
rnorm(10)
Computing probability of normal random variables
pnorm(-1.96)
Using for loops
for(i in 1:10) {print(i)}
Computing absolute value
abs(-10)
Computing 95% confidence interval for sample means
mean(numbers) - 1.96 * sqrt(var(numbers) / length(numbers)) and
mean(numbers) + 1.96 * sqrt(var(numbers) / length(numbers))
Computing 95% confidence interval for difference-in-means estimators
mean(numbers) - mean(other_numbers) - 1.96 * sqrt(var(numbers) / length(numbers) + var(other_numbers) / length(other_numbers)) and
mean(numbers) - mean(other_numbers) + 1.96 * sqrt(var(numbers) / length(numbers) + var(other_numbers) / length(other_numbers))
Hypothesis testing with the difference-in-means estimators
z_obs <- (mean(numbers) - mean(other_numbers)) / sqrt(var(numbers) / length(numbers) + var(other_numbers) / length(other_numbers)) and
2 * pnorm(-abs(z_obs))
Fitting simple linear regression models
fit <- lm(numbers ~ other_numbers, data = dataframe)
Using natural logarithmic transformations
log(numbers) or
lm(log(numbers) ~ other_numbers, data = dataframe)
Hypothesis testing with estimated regression coefficients
summary(fit)$coef
Reporting linear model results
summary(fit)
Fitting multiple linear regression models
lm(numbers ~ other_numbers + factors + other_factors)
Computing confidence interval for predicted outcomes
predict(fit, newdata = data.frame(other_numbers = 100), interval = "confidence")
Fitting logistic regression models
glm(factors ~ numbers + other_numbers, family = 'binomial')
Analysing network data
Creating networks
g <- igraph::graph_from_adjacency_matrix(adjmatrix)
g <- igraph::graph_from_edgelist(edglist)
Visualising networks
plot(g)
Computing network statistics
Size (global)
igraph::vcount(g) and
igraph::ecount(g)
Degree (local)
igraph::degree(g)
Closeness (local)
igraph::closeness(g)
Betweenness (local)
igraph::betweenness(g)
Computing network communities
igraph::cluster_optimal(g)
Analysing spatial data
Loading spatial data
data.sf <- sf::read_sf(file) [don’t run]
Visualising spatial data
leaflet::leaflet(sf) %>% leaflet::addTiles() [don’t run] or
ggplot2::ggplot(sf) + geom_sf() [don’t run]
Manipulating spatial data
sf::st_buffer(sf, dist = 150) [don’t run]
sf::st_intersects(sf) [don’t run]
Getting spatial features from OpenStreetMap (requires the osmdata package, which is not in packages, and an internet connection)
query <- osmdata::opq(bbox = c(13.30,45.89,13.31,45.91))
osm_feature <- osmdata::add_osm_feature(query, key = 'building')
sf <- osmdata::osmdata_sf(osm_feature)
Analysing text data
Tidying text data
Analysing word frequencies
text_df %>% unnest_tokens(word, text) [don’t run]
tidy_df %>% anti_join(stop_words) [don’t run]
tidy_books %>% dplyr::inner_join(tidytext::get_sentiments("bing")) [don’t run]
Analysing relationships between words
Identifying topics in texts
topicmodels::LDA(dtm, k = 10) [don’t run]