# --- Geocode the cities -----------------------------------------------------
# To get the longitude, latitude for the cities in question, you will need to
# install nominatim for Open Street Maps:
# devtools::install_github("hrbrmstr/nominatim")
# You will need an OSM API key which you can get for free from Map Quest.
# NOTE(review): `myosmapikey` must be defined before this block runs -- it is
# not defined anywhere in this file as shown.
calgary.rug.cities = c("Calgary", "Vancouver", "Edmonton", "Yellowknife", "Winnipeg", "Utrecht", "Buenos Aires", "Ramallah", "Doha", "Berlin")
# ISO country codes for the countries in question, one per city above
calgary.rug.countries = c("CA", "CA", "CA", "CA", "CA", "NL", "AR", "PS", "QA", "DE")
# Two columns: column 1 = longitude, column 2 = latitude; one row per city
calgary.rug.solar.df = data.frame(matrix(nrow = length(calgary.rug.cities), ncol = 2))
for(i in 1:length(calgary.rug.cities)) {
  # One geocoding request per city, restricted to that city's country code
  place_holder = nominatim::osm_geocode(query = calgary.rug.cities[i], country_codes = calgary.rug.countries[i], key = myosmapikey)
  calgary.rug.solar.df[i,1] = place_holder$lon
  calgary.rug.solar.df[i,2] = place_holder$lat
  print("One more city")  # crude progress indicator
}

# --- Monthly insolation (GHI) from NASA POWER -------------------------------
# We will need 13 columns of insolation data: one for each month and one for
# the annual value.
calgary.rug.solar.monthly = data.frame(matrix(nrow = nrow(calgary.rug.solar.df), ncol = 13))
for(i in 1:nrow(calgary.rug.solar.monthly)) {
  request_solar_ghi = nasapower::get_power(community = "SSE", pars = "ALLSKY_SFC_SW_DWN", temporal_average = "CLIMATOLOGY", lonlat = c(calgary.rug.solar.df[i,1], calgary.rug.solar.df[i,2]))
  # To see how this next line works you really have to know the structure of
  # the returned nasapower response: the row for the requested parameter is
  # selected, and columns 4:16 hold the monthly values plus the annual one.
  calgary.rug.solar.monthly[i,] = data.frame(request_solar_ghi[which(request_solar_ghi$PARAMETER == "ALLSKY_SFC_SW_DWN"),][,4:16])
}
# Reuse the month/ANN column names from the last response.
# NOTE(review): this relies on `request_solar_ghi` leaking out of the loop.
colnames(calgary.rug.solar.monthly) = colnames(data.frame(request_solar_ghi[,4:16]))
# Check this to see what it is
calgary.rug.solar.monthly
rownames(calgary.rug.solar.monthly) = calgary.rug.cities
# Append the annual column to the lon/lat data frame
calgary.rug.solar.df = cbind(calgary.rug.solar.df, calgary.rug.solar.monthly$ANN)
calgary.rug.solar.df
row.names(calgary.rug.solar.df) = calgary.rug.cities
colnames(calgary.rug.solar.df) = c("LON", "LAT", "Annual GHI")
calgary.rug.solar.df
# The annual value is given as an "average day", so scale it to a yearly total
calgary.rug.solar.df$`Annual GHI` = calgary.rug.solar.df$`Annual GHI`*365
calgary.rug.solar.df

# --- Diffuse and direct-normal radiation (DIFF / DNR) -----------------------
# NOTE(review): `calgary.rug.cities.for.presenting` is not defined anywhere in
# this file as shown -- presumably a presentation copy of calgary.rug.solar.df
# created elsewhere; confirm before running.
calgary.rug.cities.diffuse = vector(length = length(calgary.rug.cities))
for(i in 1:length(calgary.rug.cities.diffuse)) {
  diffuse_request = nasapower::get_power(community = "SSE", pars = "DIFF", temporal_average = "CLIMATOLOGY", lonlat = c(calgary.rug.cities.for.presenting$LON[i], calgary.rug.cities.for.presenting$LAT[i]))
  annual_diffuse = data.frame(diffuse_request)$ANN
  calgary.rug.cities.diffuse[i] = annual_diffuse
}
calgary.rug.cities.dnr = vector(length = length(calgary.rug.cities))
for(i in 1:length(calgary.rug.cities)) {
  dnr_request = nasapower::get_power(community = "SSE", pars = "DNR", temporal_average = "CLIMATOLOGY", lonlat = c(calgary.rug.cities.for.presenting$LON[i], calgary.rug.cities.for.presenting$LAT[i]))
  annual_dnr = data.frame(dnr_request)$ANN
  calgary.rug.cities.dnr[i] = annual_dnr
}
# Treat missing diffuse values as zero before forming the ratio
calgary.rug.cities.diffuse[is.na(calgary.rug.cities.diffuse)] <- 0
calgary.cities.diff_to_dnr = calgary.rug.cities.diffuse/calgary.rug.cities.dnr

# Yellowknife (row 4) is a special case it seems; just not a lot of sunlight in
# some parts of the year, so recompute its ratio from the monthly columns
# (4:16) with NA months ignored.
calgary.rug.yellowknife_diffuse1 = data.frame(nasapower::get_power(community = "SSE", pars = "DIFF", temporal_average = "CLIMATOLOGY", lonlat = c(calgary.rug.cities.for.presenting$LON[4], calgary.rug.cities.for.presenting$LAT[4])))
calgary.rug.yellowknife_diffuse = sum((calgary.rug.yellowknife_diffuse1)[,4:16], na.rm = TRUE)
calgary.rug.yellowknife_dnr1 = data.frame(nasapower::get_power(community = "SSE", pars = "DNR", temporal_average = "CLIMATOLOGY", lonlat = c(calgary.rug.cities.for.presenting$LON[4], calgary.rug.cities.for.presenting$LAT[4])))
calgary.rug.yellowknife_dnr = sum(calgary.rug.yellowknife_dnr1[,4:16], na.rm = TRUE)
calgary.rug.yellowknife_diff_to_dnr = calgary.rug.yellowknife_diffuse/calgary.rug.yellowknife_dnr
calgary.cities.diff_to_dnr[4] = calgary.rug.yellowknife_diff_to_dnr
calgary.rug.cities.for.presenting$DiffDNR = calgary.cities.diff_to_dnr

# --- Map the diffuse-to-DNR ratio -------------------------------------------
# Before going any further, please have a quick check to see that the
# different data frames are reasonable before trying to plot them.
# NOTE(review): `brewer.pal` comes from RColorBrewer, and `world_map` and the
# column `DiffDNR2` are not defined in this file as shown -- the column set
# above is `DiffDNR` (on calgary.rug.cities.for.presenting); confirm which
# data frame and column name are intended.
myPaletter = colorRampPalette(rev(brewer.pal(11, "RdBu")))
scaled_map = scale_color_gradientn(colors = myPaletter(25), limits = c(min(calgary.rug.solar.df$DiffDNR2), max(calgary.rug.solar.df$DiffDNR2)), name = "Diffuse-to-DNR \n %-age")
# The map for diffuse to DNI
ggplot2::ggplot(data = world_map) +
  geom_sf() +
  geom_point(data = calgary.rug.solar.df[1:10,], mapping = aes(x = LON, y = LAT, color = (DiffDNR2)), size = 5) +
  coord_sf(xlim = c(min(calgary.rug.cities.for.presenting$LON)-10, max(calgary.rug.cities.for.presenting$LON) + 10), ylim = c(min(calgary.rug.cities.for.presenting$LAT)-10, max(calgary.rug.cities.for.presenting$LAT)+10), expand = F) +
  scaled_map +
  ggtitle("Diffuse to DNR annually \n selected cities") +
  theme(panel.background = element_rect(fill = "lightblue", color = "blue", size = 0.5))
Visualization graphs: a huge amount of information is being collected as data in the business world, and we need a tool to picture that data so that we can interpret it and make decisions on time.
Data visualization provides a clear idea of what the information means by giving it visual context through maps or graphs.
Visualization allows humans to identify trends, patterns, and anomalies from large datasets.
In this tutorial we are going to explain one of the new packages, ggside, used together with ggplot.
ggside is mainly used for creating customizable side plots with the help of ggplot.
If you want to install the package, you can install it directly from GitHub.
# Install ggside from GitHub (requires devtools), then load the packages used
# in this tutorial.
library(devtools)
devtools::install_github("jtlandis/ggside")
library(ggside)
library(tidyverse)
library(tidyquant)
geom_xside* allows you to place geometries along the x-axis and geom_yside* allows placement along the y-axis.
All of the geom_*side* functions provide a variation on the color aesthetics color/fill.
The variants are named xcolour and xfill or ycolour and yfill for their respective xside or yside geoms.
The following geoms are currently available to use right away from the ggside package.
In this tutorial, we are exploring only GeomDensity and GeomBoxplot.
The geom_xsidedensity() function will create a density plot at the top of the ggplot, parallel to the x-axis. In the same way, the geom_ysidedensity() function will create a density plot parallel to the y-axis.
# Scatter of highway (x) vs city (y) fuel economy, colored by vehicle class,
# with stacked marginal density plots (ggside) along both axes.
p2 <- mpg %>%
  ggplot(aes(hwy, cty, color = class)) +
  geom_point(size = 2, alpha = 0.3) +
  # One overall smoother; color = NULL drops the per-class grouping
  geom_smooth(aes(color = NULL), se=TRUE) +
  # Marginal density along the x-axis (top side panel), stacked by class
  geom_xsidedensity(
    aes(
      y = after_stat(density),
      fill = class
    ),
    alpha = 0.5, size = 1,
    position = "stack"
  ) +
  # Marginal density along the y-axis (right side panel), stacked by class
  geom_ysidedensity(
    aes(
      x = after_stat(density),
      fill = class
    ),
    alpha = 0.5, size = 1,
    position = "stack"
  ) +
  # tidyquant color/fill scales and theme
  scale_color_tq() +
  scale_fill_tq() +
  theme_tq() +
  labs(title = "Fuel Economy by Vehicle Type" ,
       subtitle = "Density Plot",
       x = "Highway", y = "City") +
  # Relative size of the side panels versus the main panel
  theme(
    ggside.panel.scale.x = 0.4,
    ggside.panel.scale.y = 0.4
  )
plot(p2)
# Move the x-side panel to the bottom and the y-side panel to the left
p2 + ggside(x.pos = "bottom", y.pos = "left") +
  labs(title = "FacetNull", subtitle = "Xside placed bottom, Yside placed left")
# With facet_wrap: collapse the x-side panels into one shared panel
p2 + facet_wrap(drv~fl) +
  labs(title = "FacetWrap", subtitle = "Collapsing X side Panels") +
  ggside(collapse = "x")
# With facet_grid (free space/scales): collapse both side-panel directions
p2 + facet_grid(drv~fl, space = "free", scales = "free") +
  labs(title = "FacetGrid", subtitle = "Collapsing All Side Panels") +
  ggside(collapse = "all")
# City vs highway fuel economy with a boxplot side panel along the x-axis,
# faceted by number of cylinders.
mpg %>%
  ggplot(aes(x = cty, y = hwy, color = class)) +
  geom_point() +
  geom_smooth(aes(color = NULL)) +
  # Boxplots of cty per class in the top side panel
  geom_xsideboxplot( alpha = 0.5, size = 1 ) +
  scale_color_tq() +
  scale_fill_tq() +
  theme_tq() +
  facet_grid(cols = vars(cyl), scales = "free_x") +
  labs( title = "Fuel Economy by Engine Size")
Data visualization can help by delivering data in the most efficient way possible. Better visualization helps us make better decisions and predictions.
The post Visualization Graphs-ggside with ggplot appeared first on finnstats.
This is our 101’st blog post here on Learning Machines and we have prepared something very special for you!
Oftentimes the different concepts of data science, namely artificial intelligence (AI), machine learning (ML), and deep learning (DL) are confused… so we asked the most advanced AI in the world, OpenAI GPT-3, to write a guest post for us to provide some clarification on their definitions and how they are related.
We are most delighted to present this very impressive (and only slightly redacted) essay to you – enjoy!
Artificial intelligence (AI), machine learning (ML), and deep learning (DL) are related concepts that are often used interchangeably. They are also three distinct and different concepts. In this blog post, we will define artificial intelligence, machine learning, and deep learning and explain why they are all different and how they are related.
AI is a broad and complex concept that has been around for decades. AI is used to describe a concept or a system that mimics the cognitive functions of the human brain. It can be used to describe a situation where machines can act or behave in a way that mimics human behavior. AI is often used to describe a system that can learn from experience, can use knowledge to perform tasks, to reason, and to make decisions.
There are many different types of AI. For example, there are expert systems, neural networks, and fuzzy logic. In this blog post, we are going to focus on the different types of machine learning. A machine learning model is an AI system that can learn from a dataset and can make predictions or decisions based on the data (see also So, what is AI really?).
Machine learning is a subset of AI and is a method for algorithms to learn from data. It can be used to build models that can predict future behavior based on past experience. Machine learning is used to analyze large datasets and to find patterns in the data. An example of a machine learning model is a spam filter that learns to differentiate between spam and non-spam messages.
There are three different types of machine learning. Each of them is used for a different type of problem.
Supervised learning is the most common type of machine learning. It is used to find patterns in data and is used to predict future behavior based on past experience. In supervised learning, the data is split into two parts, known as the training set and the test set. The training set is used to train the model and the test set is used to evaluate the accuracy of the model.
The goal of supervised learning is to find a relationship between independent variables and dependent variables. The independent variables are the things that we know about the data. For example, the independent variables are the features that are used to describe a customer. The dependent variables are the things we want to know about the data. For example, the dependent variable is the profit made from a specific customer. The relationship between the independent variables and the dependent variables is known as the model.
Supervised learning can be used to predict the profit of a customer based on the features of the customer.
Unsupervised learning is used to find structure in the data. In unsupervised learning, there is no training set. The model is learned from the test set. It can also be used to find groups or clusters in the data or to identify anomalies in the data.
Unsupervised learning can be used to find groups of similar customers.
Reinforcement learning is a type of machine learning that is used to find good actions or decisions based on the data. Reinforcement learning is used to find an optimal action or decision that will maximize the reward. It is used to find the optimal solution to a problem. The optimal solution depends on the reward function.
Reinforcement learning can be used to optimize different types of problems. For example, it can be used to optimize a non-linear function or to find the shortest route in a network (see also Reinforcement Learning: Life is a Maze).
Deep learning is a subset of machine learning that uses artificial neural networks. Artificial neural networks are computational models that are inspired by the architecture of the human brain. They are used to develop algorithms that can learn from data (see also Understanding the Magic of Neural Networks).
Deep learning is used to build models that can classify data or find patterns in the data. Deep learning is used to perform complex tasks such as object recognition, speech recognition, and translation. Deep learning is the most popular type of machine learning.
In this blog post, we explained the difference between artificial intelligence, machine learning, and deep learning.
We also covered the three different types of machine learning (supervised learning, unsupervised learning, and reinforcement learning) and explained how they are related.
It seems almost impossible but this whole post was really written by a very advanced AI and only slightly redacted. You won’t find it anywhere else on the internet, it is unique. I think it is fair to say that we haven’t even begun to understand the full potential of this new technology…
The compIntercepts()
function in FSA
(prior to v0.9.0) was used to statistically compare intercepts for all pairs of groups with the same slope in an indicator/dummy variable regression (I/DVR). However, the excellent emmeans()
function in the emmeans
package is a more general approach that follows principles similar to those of emtrends()
, which I demonstrated in a post yesterday. As such, compIntercepts()
will be removed from the next version (0.9.0) of FSA
.
In this post I demonstrate how to use emmeans()
for the same purpose as compIntercepts()
was used (prior to FSA v0.9.0). Note, however, that the results will not be identical because compIntercepts()
and emmeans()
use different methods to correct for multiple comparisons when comparing pairs of intercepts.
The Mirex
data described here, but filtered to include just three years, will be used in this post.^{1}
The lm()
below fits the I/DVR to determine if the relationship between mirex concentration and weight of the salmon differs by year.
The weight:year
interaction term p-value suggests that the slopes (i.e., relationship between mirex concentration and salmon weight) do NOT differ among the three years. However, the year
term p-value suggests that the intercepts of at least one pair of these parallel lines DO differ.^{2}
A next step is to determine which pair(s) of intercepts differ significantly. In preparation for this next step, I fit a model that does not include the insignificant interaction term.^{3}
compIntercepts()
from FSA
The procedure coded in compIntercepts()
is described in more detail in this supplement to the Introductory Fisheries Analyses with R book. The results of compIntercepts()
applied to the saved lm()
object are assigned to an object below.^{4}
The $comparisons
component in this saved object contains the results from comparing all pairs of intercepts. Each paired comparison is a row in these results with the groups being compared under comparison
, the differences in sample intercepts under diff
, 95% confidence intervals for the difference in intercepts under 95% LCI
and 95% UCI
, and adjusted (for multiple comparisons) p-values for the hypothesis test comparing the intercepts under p.adj
.
For example, these results suggest that the intercepts for 1982 and 1977 ARE statistically different (first row), but the intercepts for 1986 and 1982 are NOT statistically different (last row).
The $smeans
component in this saved object contains the mean value of the response variable predicted at the mean value of the covariate. For example, the results below show the predicted mean mirex concentration at the overall mean salmon weight (i.e., 3.782083 kg).
Because the lines are known to be parallel, differences in intercepts also represent differences in predicted means of the response at all other values of the covariate. compIntercepts()
defaulted to show these means at the mean (i.e., center) of the covariate. This could be adjusted with common.cov=
in compIntercepts()
. For example, the actual intercepts are shown below.
emmeans()
from emmeans
Similar results can be obtained with emmeans()
from emmeans
using the fitted lm()
object (without the interaction term) as the first argument and a specs=
argument with pairwise~
followed by the name of the factor variable from the lm()
model (year
in this case).
The object saved from emmeans()
is then given as the first argument to summary()
, which also requires infer=TRUE
if you would like p-values to be calculated.^{5}
The $contrasts
component in this saved object contains the results for comparing all pairs of predicted means at the overall mean of the covariate. Each paired comparison is a row with the groups compared under contrast
, the difference in predicted means under estimate
, the standard error of the difference in predicted means under SE
, the degrees-of-freedom under df
, a 95% confidence interval for the difference in predicted means under lower.CL
and upper.CL
, and the t test statistic and p-value adjusted for multiple comparisons for testing a difference in predicted means under t.ratio
and p.value
, respectively.
Comparing these results to the $comparison
component from compIntercepts()
shows that the difference in sample intercepts or predicted means are the same, though the signs differ because the subtraction was reversed. The confidence interval values and p-values are slightly different. Again, this is due to emmeans()
and compIntercepts()
using different methods of adjusting for multiple comparisons. These differences did not result in different conclusions in this case, but they could, especially if the p-values are near the rejection criterion.
The $emmeans
component contains results for predicted means for each group with the groups under the name of the factor variable (year
in this example), the predicted means under emmean
, standard errors of the predicted means under SE
, degrees-of-freedom under df
, 95% confidence intervals for the predicted mean under lower.CL
and upper.CL
, and t test statistics and p-values adjusted for multiple comparisons for testing that the predicted mean is not equal to zero under t.ratio
and p.adj
, respectively. While it is not obvious here, these predicted means of the response variable are at the mean of the covariate, as they were for compIntercepts()
.
Here the predicted means match exactly (within rounding) with those in the $means
component of compIntercepts()
.
The means can be predicted at any other “summary” value of the covariate using cov.reduce=
in emmeans()
. For example, the predicted values at the minimum value of the covariate are obtained below.
The following will compute predicted means that represent the actual intercepts.
The emmeans()
function in the emmeans
package provides a more general solution to comparing multiple intercepts (or predicted means on parallel lines) than what was used in compIntercepts()
in the FSA
package (prior to v0.9.0). Thus, compIntercepts()
will be removed from FSA with v0.9.0. You should use emmeans()
instead.
The emmeans
package has extensive vignettes that further explain its use. Their “Basics” vignette is very useful.
Note that this change to FSA
does not affect anything in the published Introductory Fisheries Analyses with R book. However, the specific analysis in this supplement to the book will no longer work as described. The use of compIntercepts()
there will need to be replaced with emmeans()
.
In a previous post I demonstrated how to use emtrends()
from the emmeans
package to replace compSlopes()
, which will also be removed from the next version of FSA
.
I filtered the data to only three years to reduce the amount of output below to make it easier to follow the main concepts.
The weight
term p-value suggests that there is a significant relationship between mirex concentration and salmon weight, regardless of which year is considered.
Note the use of the additive ‘+’ in this model formula rather than the multiplicative ‘*’.
compIntercepts()
had a print()
function for nicely printing the results. However, here we will look at each component separately to ease comparison with the emtrends()
results.
As a data scientist, or as a leader of a data science team, you know the power and flexibility that open source data science delivers. However, if your team works within a typical enterprise, you compete for budget and executive mindshare with a wide variety of other analytic tools, including ...
The post Code-First Data Science for the Enterprise first appeared on R-bloggers.]]>As a data scientist, or as a leader of a data science team, you know the power and flexibility that open source data science delivers. However, if your team works within a typical enterprise, you compete for budget and executive mindshare with a wide variety of other analytic tools, including self-service BI and point-and-click data science tools. Navigating this landscape, and convincing others in your organization of the value of open source data science, can be difficult. In this blog post, we draw on our recent webinar on this topic to give you some talking points to use with your colleagues when tackling this challenge.
However, it is important to keep in mind that “code-first” does not mean “code only.” While code is often the right choice, most organizations need multiple tools, to ensure you have the right tool for the task at hand.
There are multiple ways to approach any given analytic problem. At their core, various data science and BI tools share many aspects. They all provide a way of drawing on data from multiple data sources, and to explore, visualize and understand that data in open-ended ways. Many tools support some way of creating applications and dashboards that can be shared with others to improve their decision-making.
Since these very different approaches can end up delivering applications and dashboards that may (at first glance) appear very similar, the strengths and nuances of the different approaches can be obscured to decision makers, especially to executive budget holders—which leads to the potential competition between the groups.
However, when taking a codeless approach, it can be difficult to achieve some critical analytic best practices, and to answer some very common and important questions:
sales-data 2020-12 final FINAL Apr 21 NR (4).xlsx
really the most recent version of the analysis?At best, wrestling with questions like these will distract an analytics team, burning precious time that could be spent on new, valuable analyses. At worst, stakeholders end up with inconsistent or even incorrect answers because the analysis is wrong, not the correct version, or not reproducible. This can fundamentally undermine the credibility of the analytics team. Either way, the potential impact of the team for supporting decision makers is greatly reduced.
RStudio’s mission is to create free and open-source software for data science, because we fundamentally believe that this enhances the production and consumption of knowledge, and facilitates collaboration and reproducible research.
At the core of this mission is a focus on a code-first approach. Data scientists grapple every day with novel, complex, often vaguely-defined problems with potential value to their organization. Before the solution can be automated, someone needs to figure out how to solve it. These sorts of problems are most easily approached with code.
With Code, the answer is always yes!
Code is:
Codeless Problem | Code-First Solution |
---|---|
Difficulty tracking changes and auditing work |
Code, coupled with version control systems like git, to track what changed, when, by whom, and why. Code can be logged when run for auditing and monitoring. |
No single source of truth |
Centralized tools to create a single source of truth for data, dashboards, and models. Version control to track multiple versions of code separately without creating conflicts. |
Difficult to extend and reproduce work |
Code enables reproducibility by explicitly recording every step taken. Open-source code can be deployed on many platforms, and is not dependent on proprietary tools. Code can be copied, pasted, and modified to address novel problems as circumstances change. |
Black box constraints on how you analyze your data and present your insights |
Access and combine all your data, and analyze and present it exactly as you need to, in the form of tailored dashboards and reports. Pull in new methods and build on other open source work without waiting for proprietary features to be added by vendors. |
A summary of how a code-first approach helps tackle codeless challenges
When discussing the benefits of a code-first approach within your organization, you may hear some common objections:
If you’d like to learn more about the advantages of code-first data science, and see some real examples in action, watch the free, on-demand webinar Why Your Enterprise Needs Code-First Data Science. Or, you can set up a meeting directly with our Customer Success team, to get your questions answered and learn how RStudio can help you get the most out of your data science.
We introduce the R package socialroulette, which is a lightweight package for handling the recurrent problem of assigning individuals into groups of a fixed size. In order to minimize reunions, we provide an algorithm to solve the task as an instance of the maximally diverse grouping problem.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. The R-markdown source code of this blog is available under a GNU General Public License (GPL v3) license from GitHub.
Groupings are relevant when breakout rooms, mystery lunch partitions or student peer review groups are made. My last blog post Long time, no see: Virtual Lunch Roulette considered the probability of being assigned into the same group as someone, you already were in the same group with the last time (aka. a re-union) when using simple random sampling to assign the groups. This probability proved to be surprisingly high.
In this post we develop the subject further by introducing the R package socialroulette
, which aims at being helpful in making better group assignments based on considering the problem as a solution of the maximally diverse grouping problem (MDGP) known from operations research.
The aim is to partition \(n\) participants into groups of size at least \(m\). We shall denote the resulting groupings a partition of the set of \(n\) participants. If \(m\) is not a divisor of \(n\) then some of the groups have to contain more than \(m\) participants. As an example consider the case that 5 individuals are to be divided into groups of size at least 2. We shall adopt the convention, that group size shall be as close to \(m\) as possible and the group sizes should be as equal as possible. In the specific toy example this means that we will need 2 groups, one with 3 participants and one with 2 participants.
When assigning such groups repeatedly, e.g. on a weekly basis, an additional criterion can be to form the groups, such that as few participants as possible are assigned into groups with individuals they were already in the same group with previously. This can, e.g., be done by rejection sampling. However, as situations can occur where such re-unions are unavoidable, it can instead be natural to formulate a metric to maximize, which quantifies penalties due to such re-unions. As an example: If one has to decide between assigning two individuals to the same group who met 7 days or 14 days ago, it might be preferable to choose the pair which met 14 days ago. Such optimization is known as solving the maximally diverse grouping problem (see appendix).
We illustrate the package by considering a use-case where a class of students repeatedly, for every lecture, has to be assigned into breakout rooms. First, we install and load the socialroulette
package available from GitHub.
# Install the socialroulette package from GitHub (one-time; needs devtools),
# then load it.
# NOTE(review): the line-collapse in the source put the library() call behind
# the leading '#', commenting out executable code; restored here.
# devtools::install_github("hoehleatsu/socialroulette")
library(socialroulette)
Say the class consists of 100 students, which for a weekly virtual lab exercise class need to be divided into groups of at least 4. Since for various reasons not all students show up for each class, the sampling frame of individuals to be partitioned each week changes accordingly. Still, we would like to make the partitioning of the current week s.t. students get as many new acquaintances as possible.
We first create a history of the previous 4 weeks’ groupings as well as the frame of participants for the current lecture.
# Class of 100 students with 4 previous lectures.
# NOTE(review): the line-collapse in the source put all of this code behind
# the leading '#'; the executable statements are restored here.
students <- tibble::tibble(id=sprintf("id%.3d@student.su.se", 1:100))
partition_dates <- seq(as.Date("2021-03-31"), length.out=4, by="1 week")

# Simulate changing participation each week for the last 4 weeks
# (roughly 70% attendance per week)
frames <- map_df(
  partition_dates,
  ~ students %>% slice_sample(n = rbinom(1,nrow(.), prob=0.7)) %>% mutate(date=.x))

# Generate some past partitions using simple random sampling
past_partitions <- frames %>%
  group_split(date) %>%
  map(~rsocialroulette(current_frame=.x, m=4, algorithm="srs")) %>%
  setNames(partition_dates)
## Partitioning 72 individuals into groups of at least 4 (no past partitions).
## Created 18 groups of sizes 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4.
## Partitioning 74 individuals into groups of at least 4 (no past partitions).
## Created 18 groups of sizes 5 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4.
## Partitioning 71 individuals into groups of at least 4 (no past partitions).
## Created 17 groups of sizes 5 5 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4.
## Partitioning 69 individuals into groups of at least 4 (no past partitions).
## Created 17 groups of sizes 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4.
We pretend that each of the above previous partitions has been saved as a .csv file. For example like:
# Simulate the storage of each partition as a .csv file to disk,
# with 3 columns: date, id1 and id2, i.e. all within-group pairs.
# NOTE(review): code restored from the '#'-collapsed source line.
temp_dir <- tempdir() # adjust path to your setting if needed
socialroulette::partitions_to_pairs( past_partitions ) %>%
  group_split(date) %>%
  walk(~ write_csv(x=.x, file=file.path(temp_dir, stringr::str_c("socialroulette-", .$date[1], ".csv"))))
We thus read the partitions from disk and convert from the stored pair-format (i.e. a data.frame
listing each pair being in the same group as id1
, id2
together with the corresponding date
of the partition) back to the list-format (i.e. a list of character vectors, where each vector denotes a group and the vector contains the ids of all members of that group). This can be done as follows:
# Read all previously stored partition files back in again (pair format).
# NOTE(review): code restored from the '#'-collapsed source line.
pairs <- map_df(list.files(path=temp_dir, pattern="socialroulette.*", full.names=TRUE), ~read_csv(file=.x))
As a next step we sample the students who are in the next class to group.
# Sample the students attending the next class (~70% attendance) and date the
# frame one average inter-lecture interval after the last stored partition.
current_frame <- students %>%
  slice_sample(n = rbinom(1,nrow(.), prob=0.7)) %>%
  mutate(date=max(partition_dates) + diff(partition_dates) %>% mean())
Our goal is now to partition the 71 students in current_frame
. For each of the \({71 \choose 2 }= 2485\) possible pairs of students in that class, we determine how long ago it has been, since they were in the same group the last time. This can be done using the internal package function partitions_to_distance
:
# For each pair of students in the current frame, compute the number of days
# since they last shared a group (derived from the past partitions).
dist <- socialroulette::partitions_to_distance(current_frame, past_partitions)
dist %>% head()
## # A tibble: 6 x 4
## id1                 id2                 date       dist
## 1 id078@student.su.se id100@student.su.se 2021-04-28 35
## 2 id078@student.su.se id092@student.su.se 2021-04-28 35
## 3 id078@student.su.se id098@student.su.se 2021-04-28 35
## 4 id078@student.su.se id093@student.su.se 2021-04-28 35
## 5 id078@student.su.se id083@student.su.se 2021-04-28 35
## 6 id078@student.su.se id095@student.su.se 2021-04-28 35
Since the past partitions contain the last 4 weeks, students who have not been in a group so far are assigned a value of one time difference more than the last possible meeting opportunity. In the specific example this corresponds to \(7\cdot (4 + 1) = 35\) days. It thus seems natural to choose the groups, s.t. they do not contain pairs, which have already met in the past. However, for given size of the population, group sizes and meeting histories, this might not be possible to achieve altogether, so a more flexible criterion is to maximize the sum of distance since the last meet over all pairs in the same group of the partition.
# Make the partition using an MDGP solver (maximizes the summed days-since-
# last-meeting over all pairs placed in the same group).
# NOTE(review): code restored from the '#'-collapsed source line.
partition <- rsocialroulette(current_frame, past_partitions, m=4, algorithm="mdgp")
The partition can be visualized using the igraph
package:
Of course the partition can also be stored to file as before, in order to include it in the set of past partitions when doing the partitioning next week:
# Store this week's partition in pair format, named by its date, so it can be
# included in the set of past partitions next week. The magrittr dot in
# `.$date[1]` refers to the piped pair data frame.
list(partition) %>%
  setNames(current_frame %>% slice(1) %>% pull(date)) %>%
  socialroulette::partitions_to_pairs() %>%
  write_csv(file=file.path(temp_dir, stringr::str_c("socialroulette-", .$date[1],".csv")))
or can be stored in a Zoom compatible breakout room specification format:
# Export the partition in Zoom's breakout-room import format:
# two columns, room (room001, room002, ...) and email.
partition %>%
  socialroulette::partition_to_frame() %>%
  rename(email=id) %>%
  mutate(room = sprintf("room%.03d",group)) %>%
  select(room, email) %>%
  write_csv(file=file.path(temp_dir, stringr::str_c("zoom-breakoutrooms.csv")))
We provide functionality to create better groupings than those obtained by simple random sampling. However, as mathematically convincing as the MDGP solution might be, reunions can have their charm.
We thank Xiangjing Lai and Jin-Kao Hao for contributing their C++ source code of the MDGP solver in Lai and Hao (2016) under a GPL-3 license for the socialroulette package.
In this appendix we provide a few more mathematical details about the maximally diverse grouping problem, which is about partitioning \(n\) individuals into groups of size at least \(m\) while maximizing a sum of utility values computed by summing the utility \(u(i,j)\) over all individuals \(i\), \(j\) partitioned into the same group.
More formally, let \(d_{ij}\) denote the number of time units (typically days) ago, that individual \(i\) and \(j\) were in the same group. Note: This distance is derived by looking at the previous partitions. It is a matter of definition what this value should be, if \(i\) and \(j\) have not previously been in the same group. Let \(G=n\> \text{div}\> m\) denote the resulting number of groups where \(\text{div}\) denotes integer division. For a given partition let \(x_{ig}\) be an indicator variable, which is 1, if \(i\) is assigned into group \(g\) and zero otherwise.
A solver of the maximally diverse grouping problem now tries to maximize \[ \sum_{g=1}^G \sum_{i=1}^n \sum_{j=i+1}^n d_{ij} x_{ig} x_{jg}, \] subject to the conditions \[ \begin{align*} &\sum_{g=1}^{G} x_{ig} = 1,&& i=1,\ldots,n, \\ &\sum_{i=1}^n x_{ig} = n_g,&& g=1,\ldots,G, \\ &x_{ig} \in \{0,1\}, && i=1,\ldots,n;\quad g=1,\ldots,G, \\ \end{align*} \] where \(n_g\) is the size of group \(g\), i.e. \(\sum_{g=1}^G n_g = n\). We shall adopt the convention that group sizes are determined by assigning the labels \(1,\ldots,G\) to the participants by starting from the first to the last, e.g., as \(((\text{index} - 1) \> \text{mod}\> m) + 1\) and then count how often each label occurs. This means, e.g., that for \(n=7\) and \(m=4\) we get \(n_g=7\> \text{div}\> 4=1\) group with 7 members.
The socialroulette
package uses as backend a C++ implementation of MDGP solver of Lai and Hao (2016).
Lai, Xiangjing, and Jin-Kao Hao. 2016. “Iterated Maxima Search for the Maximally Diverse Grouping Problem.” European Journal of Operational Research 254 (3): 780–800. https://doi.org/https://doi.org/10.1016/j.ejor.2016.05.018.
Hello, so a year and a half ago I created a new metric for measuring F1 drivers’ performance based around their performance in the race and their expected performance in the race see blog here
F1 Drivers Rated
Since then my laptop BSOD’ed and me being useless I never committed the code to GitHub and therefore it was lost. I was facing rewriting it but I knew there were weaknesses in the model. The main weakness is that it took no account of the quality of the car. The metric was based on position but I think that’s unfair because technically a driver would score the same if they were expected to finish 15th and finished 13th as if they were expected to finish 3rd and the driver won. It’s definitely more valuable to actually win the race than to finish 13th and it should be valued so.
As with all previous formula 1 I have analysed I get my data from the ergast API linked below
So the easiest one to correct is the positions. My idea for correcting this is using a point system. In reality the current Formula 1 points system only provides points for the first 10 finishers. This would leave a lot of drivers with expected points of 0 which I don’t think would add anything to the model. Therefore I have created my own point system for the purposes of the model. I want to make sure I have a points system that will cover most grid sizes so I used the data to look at grid sizes over the years to understand how many positions the point system needs to cover. This model will hopefully be usable across many years of the series.
grid <- all_data %>% group_by(raceId, date) %>% summarise(Gridtot = n()) %>% ungroup() %>% arrange(date) %>% mutate(raceno = 1:n()) ggplot(grid, aes(x = raceno, y = Gridtot)) + geom_line(col = "#32a852") + labs(x = "Race No", y = "Grid Size", title = "F1 Grid Size Through Time") + theme(panel.background = element_blank())
Clearly in the early part of the series there is a lot of volatility in the amount of drivers and teams lining up for a race. In the modern era though the grid size is pretty consistent at between 20 – 24 entries and therefore I’m going to set the models point range up to p24.
Finishing Position | Model Points |
1 | 175 |
2 | 155 |
3 | 138 |
4 | 122 |
5 | 107 |
6 | 93 |
7 | 80 |
8 | 68 |
9 | 57 |
10 | 47 |
11 | 38 |
12 | 30 |
13 | 23 |
14 | 17 |
15 | 12 |
16 | 8 |
17 | 5 |
18 | 3 |
19 | 2 |
20 | 0.8 |
21 | 0.4 |
22 | 0.2 |
23 | 0.1 |
24 | 0 |
Above is the point system used for the model. It values the higher positions more than the lower positions and these are the points I will use to calculate the expected points from.
The next part of the improvement was adding some kind of metric to measure the strength of the car that is being used. This should further isolate the driver performance from the car performance so we can be more precise on the performance of the drivers. There are numerous metrics that can be used to measure performance but i think there are two main ones.
I’m going to look at correlation of both metrics and which ever has the most correlation with the amount of points a team scores is the one that will be used in the model.
###creating a data frame with all the data from the ERGAST api all_data2 <- results %>% left_join(races, by = "raceId") %>% left_join(constructors, by = "constructorId") %>% left_join(drivers, by = "driverId") %>% left_join(status, by = "statusId") %>% left_join(grid, by = "raceId") %>% left_join(factor, by = c("Gridtot", "grid")) all_data2$position <- as.numeric(as.character(all_data2$position)) ### using various dplyr functions to calculate the mean starting position by season for each car maxavpos <- all_data2 %>% filter(grid > 0) %>% group_by(year, raceId, constructorRef) %>% summarise(max = min(grid)) %>% ungroup() %>% group_by(year, constructorRef) %>% summarise(meanp = mean(max, na.rm = T)) ## plotting against the total points
qualbest <- qual %>% pivot_longer(cols = 7:9, names_to = "qaul", values_to = "times") %>% separate(times, into = c("m", "s","ms"), sep = ":") qualbest$m <- as.numeric(as.character(qualbest$m)) qualbest$s <- as.numeric(as.character(qualbest$s)) qualbest2_m <- qualbest %>% mutate(qualt = m * 60 + s) %>% group_by(raceId, driverId) %>% slice_min(qualt) %>% ungroup() %>% left_join(races, by = "raceId") %>% left_join(constructors, by = "constructorId") %>% group_by(raceId) %>% slice_min(qualt) %>% select(raceId, qualt) colnames(qualbest2_m)[2] <- "pole" qualbest2<- qualbest %>% mutate(qualt = m * 60 + s) %>% left_join(races, by = "raceId") %>% left_join(constructors, by = "constructorId") %>% left_join(qualbest2_m, by = "raceId") %>% group_by(raceId, name.y) %>% slice_min(qualt) %>% mutate(qualdel = qualt / pole -1 ) %>% select(raceId, name.y, qualt, pole, qualdel) %>% left_join(races, by = "raceId") %>% group_by(year, name.y) %>% summarise(mean_del = mean(qualdel))
The other option is to use the average difference to pole position. The trend between the two features is pretty similar and it looks like one is not more predictive than the other. Therefore I’m going to use both features as I think fundamentally they tell different stories
I used the tidy models group of packages to fit a model. The number of points is explained by the difference to pole, the number of retirees, the starting position on the grid and average starting position.
#creating the training and testing data split <- initial_split(all_data3, prop = 0.75) train <- training(split) testing <- testing(split) #going to asses the model across multilple sets of data. The boostrap function creates folds to fit many models and therefore be able to tune the mode f1_folds <- bootstraps(train, strata = points.y) f1_folds ## the recipe points against all the feautres mentioned ranger_recipe <- recipe(formula = points.y ~ ., data = train) ## spec for the model - creating a random forest and first tuning hyperperameters mtry and min_n ranger_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% set_mode("regression") %>% set_engine("ranger") ranger_workflow <- workflow() %>% add_recipe(ranger_recipe) %>% add_model(ranger_spec) ## training a grid of models to test which is the best value for the hyperparameters set.seed(8577) doParallel::registerDoParallel() ranger_tune <- tune_grid(ranger_workflow, resamples = f1_folds, grid = 11 ) #autoplot to see the result autoplot(ranger_tune)
Clearly the best model is mtry around 2 and min node size to around 30. Once that is selected I run the final model on the train data and then apply it to the current season performance. Below is a summary of how each driver is comparing to expected for the year so far after the Spanish grand prix.
Clearly Norris has performed the best compared to what he would be expected to perform. This is a level that not many drivers perform at so I expect to see a reversion to the mean as the season goes on. Maybe this metric can be used to predict how two drivers up against each other in the same team will get on. One for a future blog. Thanks for reading full code here
https://github.com/alexthom2/F1DriverAnalysis/blob/master/xpos.Rmd
We are pleased to share that the next free and online LondonR meetup will take place on Wednesday 26th May from 4.25 pm (BST).
We will be joined by three presenters who will share their work in R stats, and we will also be hosting LondonR on a new platform!
To join us, register for a place here.
The post LondonR May 2021 appeared first on Mango Solutions.
Methods in Quantitative Statistical Analysis, What is hypothesis testing?. Hypothesis testing is an act in statistics whereby an analyst tests an assumption regarding a population parameter.
Major four principles involved in a statistical test.
In statistics, most testing methods come under either the parametric hypothesis or the nonparametric hypothesis framework.
To understand more on NULL hypothesis click here
In a parametric hypothesis, A statistical hypothesis is an assertion about a parameter of a population.
For example, the life of an electric bulb is 3000 hours ie, µ=3000 hours. Two bulb manufacturing processes produce bulbs of the same average life etc. ie, µ1=µ2.
According to R A Fisher, any hypothesis tested for its possible rejection is called a Null Hypothesis.
An alternative hypothesis is an assertion about the population parameter or parameters that provides an alternative to the null hypothesis within the range of pertinent values of the parameters, and it is denoted as H_{1}.
Let us understand what are the different types of methods of testing under parametric hypothesis.
The names of Methods in Quantitative Statistical Analysis are
Principal Component analysis (PCA) in R
In the likelihood ratio test, we consider the likelihood functions under H_{0} and under the entire parametric space.
\(\Lambda(x) = \dfrac{\sup_{\theta \in \Theta_0} L(x \mid \theta)}{\sup_{\theta \in \Theta} L(x \mid \theta)}\), i.e. the likelihood maximized under \(H_0\) divided by the likelihood maximized over the entire parameter space.
The value of Λ(x) lies between 0 and 1.
Student t is the deviation of the estimated mean from its population mean expressed in terms of standard error.
We can test H_{0}:µ= µ_{0} vs H_{1}: µ≠ µ_{0},
H_{0}:µ= µ_{0} vs H_{1}: µ> µ_{0}
or µ< µ_{0}, the test statistic remains the same.
The only difference in the test procedure is that the critical value of t is only on one side.
In R you can calculate the same based on the below syntax.
t.test(x, y, alternative = c("two.sided", "less", "greater"), mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
To read more on t-test click here
A test of H0: µ= µ0 of a normal population N(µ, σ2) against an alternative hypothesis H_{1} can be tested by normal deviate test provide the population variance σ2 is known or the sample drawn is large say >30.
When the sample size is large enough, it is supposed that the sample variance is almost equal to the population variance. Usually, it is denoted as Z.
Z= (x̅- µ_{0})/( σ/sqrt(n))
In R you can used below mentioned code for Z test
z.test(x, y = NULL, alternative = "two.sided", mu = 0, sigma.x = NULL, sigma.y = NULL, conf.level = 0.95)
What is the minimum number of experimental units required in study?
A Chi-square test is performed which measures the discrepancy between the observed frequencies and theoretically determined frequencies from the assumed distribution for the same event.
If the discrepancy is not large, it is considered that our assumption about the distribution of the variables is correct, otherwise not.
chisq.test(x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)), rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
once you created the data frame as a table structure simply use following code
data("mtcars") table(mtcars$carb, mtcars$cyl) chisq.test(mtcars$carb, mtcars$cyl)
If the cell frequencies are less than 5 you can try for fisher’s exact test.
Analysis of variance is a device to split the total variance of an experiment or trial into component variances responsible for contributing towards total variance.
Analysis of variance utilizes the F test.
Each component variance is tested against error variance.
F is the ratio between sample variances and within-sample variance.
F=Between sample variances/Within sample variances
Suppose that we need to decide between two hypotheses H_{0} and H_{1}. In the Bayesian setting, we assume that we know prior probabilities of H_{0} and H_{1}.
That is, we know P(H_{0})=p0 and P(H_{1})=p1, where p0+p1=1.
In the case of tests based on fixed sample size, It is generally not possible to determine the optimum sample sizes so that no extra observations are recorded except those necessitated to reach a decision.
Sometimes the sample is too small to arrive at a proper decision. To overcome this problem, in 1947 Wald introduced the sequential probability ratio test.
In this test procedure, a decision about H0 is taken after each successive observation.
That is in SPRT, sample size n is a random variable. So investigator can come up with three decision criteria namely,
Reject H_{0}, Accept H_{0}, or Continue sampling. The process continues until a decision is taken either to reject or to accept H_{0}.
If you want to read more on different type of Nonparametric methods click here
The post Methods in Quantitative Statistical Analysis appeared first on finnstats.
The compSlopes()
function in FSA
(prior to v0.9.0) was used to statistically compare slopes for all pairs of groups in an indicator/dummy variable regression (I/DVR). However, the excellent emtrends()
function in the emmmeans
package is a more general and strongly principled function for this purpose. As such, compSlopes()
will be removed from the next version (0.9.0) of FSA
.
In this post I demonstrate how to use emtrends()
for the same purpose as compSlopes()
was used (prior to FSA v0.9.0). Note, however, that the results will not be identical because compSlopes()
and emtrends()
use different methods to correct for multiple comparisons when comparing pairs of slopes.
The Mirex
data described here, but filtered to include just three years, will be used in this post.^{1}
The lm()
below fits the I/DVR to determine if the relationship between mirex concentration and weight of the salmon differs by year.
The weight:year
interaction term p-value suggests that the slopes (i.e., relationship between mirex concentration and salmon weight) differs among some pair(s) of the three years.
A next step is to determine which pair(s) of slopes differ significantly.
compSlopes()
from FSA
The procedure coded in compSlopes()
is described in more detail in this supplement to the Introductory Fisheries Analyses with R book. The results of compSlopes()
applied to the saved lm()
object are assigned to an object below.^{2}
The $comparisons
component in this saved object contains the results from comparing all pairs of slopes. Each paired comparison is a row in these results with the groups being compared under comparison
, the differences in sample slopes under diff
, 95% confidence intervals for the difference in slopes under 95% LCI
and 95% UCI
, and unadjusted and adjusted (for multiple comparisons) p-values for the hypothesis test comparing the slopes under p.unadj
and p.adj
, respectively.
For example, these results suggest that the slopes for 1996 and 1992 ARE statistically different (first row), but the slopes for 1999 and 1996 are NOT statistically different (last row).
The $slopes
component in this saved object contains results specific to each slope. The groups are under level
, sample slopes under slopes
, 95% confidence intervals for the slopes under 95% LCI
and 95% UCI
, and unadjusted and adjusted p-values for the test if the slope is different from 0 under p.unadj
and p.adj
, respectively.
For example, the slope for 1992 (last row) appears to be significantly different than 0 and may be between 0.01679 and 0.03628.
emtrends()
from emmeans
Similar results can be obtained with emtrends()
from emmeans
using the fitted lm()
object as the first argument, a specs=
argument with pairwise~
followed by the name of the factor variable from the lm()
model (year
in this case), and var=
followed by the name of the covariate from the lm()
model (weight
in this case), which must be in quotes.
The object saved from emtrends()
is then given as the first argument to summary()
, which also requires infer=TRUE
if you would like p-values to be calculated.^{3}
The $contrasts
component in this saved object contains the results for comparing all pairs of slopes. Each paired comparison is a row with the groups compared under contrasts
, the difference in sample slopes under diff
, the standard error of the difference in sample slopes under SE
, the degrees-of-freedom under df
, a 95% confidence interval for the difference in slopes under lower.CL
and upper.CL
, and the t test statistic and p-value adjusted for multiple comparisons for testing a difference in slopes under t.ratio
and p.value
, respectively.
Comparing these results to the $comparison
component from compSlopes()
shows that the sample slopes are the same, but that the confidence interval values and p-values are slightly different. Again, this is due to emtrends()
and compSlopes()
using different methods of adjusting for multiple comparisons. These differences did not result in different conclusions in this case, but they could, especially if the p-values are near the rejection criterion.
The $emtrends
component contains results for each slope with the groups under the name of the factor variable (year
in this example), the sample slopes under xxx.trend
(where xxx
is replaced with the name of the covariate variable, weight
in this example), standard errors of the sample slopes under SE
, degrees-of-freedom under df
, 95% confidence intervals for the slope under lower.CL
and upper.CL
, and t test statistics and p-values adjusted for multiple comparisons for testing that the slope is not equal to zero under t.ratio
and p.adj
, respectively.
Here the results match exactly with those in the $slopes
component of compSlopes()
.
The emtrends()
function in the emmeans
package provides a more general solution to comparing multiple slopes than what was used in compSlopes()
in the FSA
package (prior to v0.9.0). Thus, compSlopes()
will be removed from FSA with v0.9.0. You should use emtrends()
instead.
The emmeans
package has extensive vignettes that further explain its use. For this use case see this discussion. Their “Basics” vignette is also useful.
Note that this change to FSA
does not affect anything in the published Introductory Fisheries Analyses with R book. However, the specific analysis in this supplement to the book will no longer work as described. The use of compSlopes()
there will need to be replaced with emtrends()
.
In the next post I will demonstrate how to use emmeans()
from the emmeans
package to replace compIntercepts()
, which will also be removed from the next version of FSA
.
Here are the links to get set up. 👇
Get the Code
YouTube Tutorial
patchwork
The ggplot2 composer that makes it...
This article is part of R-Tips Weekly, a weekly video tutorial that shows you step-by-step how to do common R coding tasks.
Here are the links to get set up.
What we’re making today combines 3 plots into 1 illustrative storyboard.
The ggplot2 package is great for single plots, but what about creating a storyboard for illustrating ideas, making persuasive arguments, and storytelling?
Now you can make publication-ready storyboards using patchwork, the composer of ggplot2. Patchwork makes it ridiculously simple to combine separate ggplots into the same graphic, arranging plots into a grid and adding figures, labels, and annotations.
The patchwork
package aims to make it easy to combine ggplots by:
Here’s the simplified patchwork syntax. That’s all we need to create a two-column layout with the right-column containing two rows is in the simple syntax. Let’s break it down.
Patchwork has a very simple syntax where we can create layouts super easily. Here’s the general syntax that combines:
This is the basic layout for the Texas Real-Estate Storyboard shown in the chart above.
The most important component to patchwork is that you need to be good at developing ggplot2
plots if you want to make your storyboard look great. For that, I’ll defer to our Ultimate R Cheat Sheet to help you get up to speed.
patchwork
is great for combining plots. But, you’ll still need to learn how to wrangle data with dplyr and visualize data with ggplot2. For those topics, I’ll use the Ultimate R Cheat Sheet to refer to dplyr
and ggplot2
code in my workflow.
Download the Ultimate R Cheat Sheet. Then Click the “CS” next to “ggplot2” opens the Data Visualization with Dplyr Cheat Sheet.
Now you’re ready to quickly reference ggplot2
functions.
Onto the tutorial.
The libraries we’ll need today are patchwork, ggridges, ggrepel, maps, tidyverse, and lubridate. All packages are available on CRAN and can be installed with install.packages()
.
The dataset is the txhousing data that comes with ggplot2.
We’ll start by making individual plots that are components of our final patchwork storyboard.
First, we make a time series plot that shows the smoothed trend in median home prices of all the Texas cities along with their individual trends in median price.
Next is creating a ridgeline plot, a special plot that shows distribution of median home prices by top-10 cities in an aesthetically pleasing visual. PS. I teach data visualization with ggplot2 in-depth in the R for Business Analysis course.
The final plot is a geographic map that showcases the summary of which cities have the highest median home prices in Texas.
The YouTube tutorial does this code justice. Check it out.
You just quickly made a professional storyboard using the ggplot2 and patchwork. Fantastic!
You should be proud.
This article is part of R-Tips Weekly, a weekly video tutorial that shows you step-by-step how to do common R coding tasks. Join today.
Top R-Tips Tutorials you might like:
Experimental Design, when do you call an experimental design a randomized design?
Experimental designs in which the treatments are allocated randomly to the experimental units that come under the category of randomized designs.
Randomized designs are classified as completely randomized design, randomized block design, Latin square design, split pot design, cross over design, family block design, etc…
Principle Component Analysis in R
In this tutorial, we are discussing characteristics of the completely randomized design.
Any experimental design, in general, is characterized by the nature of the grouping of experimental units and the manner the treatments are randomly allocated to the experimental units.
A completely randomized design is the one in which all the experimental units are taken in a single group that is homogeneous as far as possible.
Will take one or two examples for the understanding purpose.
For example, all the field plots taking treatments are having the same type of soil, fertility, soil depth, soil texture, soil temperature, soil moisture, etc…
In another example like all the cows forming a group are of the same breed, same age, same weight, same lactation, etc…
Suppose there are k treatments and the treatments T_{i} is replicated r_{i} times. So we require i=1 to k, Sum of r_{i}=n units. Assign K treatments randomly to n experimental units such that the treatment T_{i} occupies r_{i} units or plots. Such a type of design is called a completely randomized design.
The completely randomized design is also known as the non-restriction design.
Naïve Bayes Classification in R
For laboratory experiments, pot experiments, greenhouse experiments, and sometimes animal experiments.
Suppose there are four treatments P1, P2, P3, P4 which are replicated 4, 3, 3, and 5 times respectively.
Then the layout of the completely randomized design having 15 plots and 4 treatments.
Experimental Design
P2 | P2 | P2 |
P3 | P4 | P3 |
P4 | P4 | P1 |
P1 | P1 | P1 |
P3 | P4 | P4 |
Let’s take an example of how to execute a completely randomized design in R.
In the following design we take 3 treatments with 10 subjects per treatment, so the total number of experimental units is 30.
set.seed(325) f = c("Ctrl", "Trt1", "Trt2") # Different Treatment Levels k = 3 # Total Number of Treatment Levels n = 10 # Total subjects per treatment response<-sample(1:20,30,replace=T)#Reponse from the subjects tm<-gl(k, 1, n*k, factor(f)) av<-aov(response ~ tm) summary(av)
Df Sum Sq Mean Sq F value Pr(>F) tm 2 58.2 29.1 0.827 0.448 Residuals 27 950.5 35.2
Since the p-value of 0.448 is greater than the .05 significance level, we do not reject the null hypothesis, which indicates no significant difference was observed between tested treatments.
The post Completely Randomized Experimental Design appeared first on finnstats.
Macro in the Shell
Example
Setting-up Guard Rails
Closing
Appendix
Related Alternative
Other Resources
There is many a data science meme degrading excel:
(Google Sheets seems to have escaped most of the memes here.)
While I no longer use ...
The post Macros in the Shell: Integrating That Spreadsheet From Finance Into a Data Pipeline first appeared on R-bloggers.]]>There is many a data science meme degrading excel:
(Google Sheets seems to have escaped most of the memes here.)
While I no longer use it regularly for the purposes of analysis, I will always have a soft spot in my heart for excel^{1}. Furthermore, using a “correct” set of data science tools often requires a bridge^{2}. Integrating a rigorous component into a messy spreadsheet based pipeline can be an initial step towards the pipeline or team or organization starting on a path of continuous improvement in their processes^{3}. Also, spreadsheets are foundational to many (probably most) BizOps teams and therefore are sometimes unavoidable…
In this post I will walk through a short example and some considerations for when you might decide (perhaps against your preferences) to integrate your work with extant spreadsheets or shadow “pipelines” within your organization.
Say your finance organization has some workbook that contains fancy accounting triggered by a VBA macro, the calculation of which you need in your own script based data pipeline (but which also may not be in perfect order^{4}…). You don’t want to go through the effort of reproducing (or worse, be responsible for maintaining) the complicated (and potentially changing) logic that lives within the spreadsheets / macros^{5}. You’ve resolved instead to simply “call” finances spreadsheet / macro / logic in some way.
Often the VBA from finance will live in a macro enabled workbook (.xlsm file). To trigger it programatically, you generally want to write a wrapper VBscript that can then be run through the shell (Stack Overflow thread)^{6}. The VBscript will open the workbook and trigger the macro(s) of interest^{7}.
Note that saying “Use a shell script” is, in a way, almost always an answer for how to incorporate another technology into a pipeline. This is more just a reminder that many tools that are designed more for interactive use often also have a batch mode^{8}. Here I’m writing about triggering VBA macros, but integrating GUI based data piplining tools like Orange or Knime into your pipeline can be set-up similarly.
Passing in arguments
You can pass arguments to a VBScript through the shell (SO thread). Though given that you are already using spreadsheets, it’s also sometimes easier to write data to pre-determined locations or cells (this is often how the workbook was set-up to be used anyways).
See brshallo/macros-shell-examle for a walk-through involving evaluating the present values of predicted deals.
In the example I…
These steps are orchestrated via a “run-all.R” script. With a little more effort these could be formalized via targets (or your pipeline toolkit of choice).
There are many unavoidable limitations to any spreadsheet dependent data pipeline. But here are a few things you can do to keep things sane:
Identify which parts of the document need to stay consistent for your pipeline to keep working^{9}, e.g.
These are not much different from the kinds of considerations that happen when collaborating on any data pipeline^{11}. The responsible party at each step has to adhere to certain structures about the form of the data as they expect it to come-in and the form with which it will proceed to the next step. Lines of communication should be open so that as changes occur to the tool, everyone (who needs to be) is made aware^{12}.
Particularly if the pipeline is used in the wild^{13}, make it easy to get notifications for when things do break^{14}.
If you are using spreadsheets or VBA macros in your data pipeline you probably aren’t worried too much about performance, but there may be a few things you can do to be more efficient.
For example, for the macro-shell-example, the VB script has as (headless) steps opening and closing the excel document after processing each deal. Therefore, processing five deals entails compute time spent on four unnecessary opens and closes. This wasted processing could be corrected with small changes to the VB Script.
Data Scientists should still consider their work in a context of growth & development. I am reminded of the Alan Watts quote:
“People who are responsible for technical development [must] be well-imbued with an ecological philosophy and see the direction of things so they will not keep perpetuating anachronisms.”
For further reading on how data scientists should think about integrating their knowledge into sometimes lower-tech organizational contexts, see the excellent post by Allison Horst and Jacqueline Nolis:
“Merge conflicts: helping data science students merge their advanced skills into existing teams…
What do we do about students trained in R and Python for jobs with Excel, Google Sheets and Access?”
A few other tools worth being aware of if you regularly interface with office products from R.
For integrating with google sheets there is googlesheets4.
For those without CS backgrounds, excel, or BI tools are typically the first analytics tools you get exposed to. Excel’s “Record Macro” feature (which enables watching VBA commands be generated while clicking through excel’s GUI / clicking through spreadsheets) was a helpful step towards me feeling comfortable with scripting. I also believe in the saying that it doesn’t matter so much what you have, but how you use it (this applies broadly to analytics technologies) and have seen many people make impressive things happen with it.︎
For me the bridge came from a friend in my Master’s program who inspired me to embrace R – before that, I’d had the moniker of “excel wizard” from my classmates in graduate school.︎
Or it may simply add another layer of complexity.︎
e.g. it is being run on a local machine, it is not updated on a fully automated basis, some parts have to be inputted manually, etc. EVERY DATA SCIENTIST HAS THEIR SKELETONS!︎
It may not be possible anyways if there is already an established user base for the existing tool… or may require a whole change management process which you can’t commit to.︎
You can run shell commands through R by running system()
or shell()
functions, SO thread – see Related Alternative in the Appendix.︎
Finance might have a VB wizard that can set this up for you.︎
You could even say this about {dplyr} and tidyverse tools which are largely designed with the intent of making interaction and fast iteration easy at the expense of making programming and functional programming and automation slightly more difficult.︎
While ensuring that whatever understanding you have allows flexibility for the owner to adjust the tool (but in a way that keeps in mind dependencies).︎
Nothing is more frustrating than when a data source changes arbitrarily and without notice.︎
Even those handled entirely programatically.︎
These guidelines/requirements are pretty similar to those you would have on any collaborative project.︎
I.e. being used in some way by people other than yourself or your immediate team.︎
For informal pipelines such as the ones you are likely looking at in regards to this post, this can be as simple as including an email that people should reach if things break, or triggering an auto-generated email populating an error message, e.g. via the blastula package in R.︎
Generalized additive models (GAMs) have become an important tool for modeling data flexibly. These models are generalized linear models where the outcome variable depends on unknown smooth functions of some predictor variables, and ...
The post Generalized Additive Models: Allowing for some wiggle room in your models first appeared on R-bloggers.]]>Generalized additive models (GAMs) have become an important tool for modeling data flexibly. These models are generalized linear models where the outcome variable depends on unknown smooth functions of some predictor variables, and where the interest focuses on inference about these smooth functions. In this Methods Bites Tutorial, Sara Stoudt (Smith College) offers a hands-on recap of her workshop “Generalized Additive Models: Allowing for some wiggle room in your models” in the MZES Social Science Data Lab in March 2021.
After reading this blog post and engaging with the applied exercises, readers should:
gam
model,Note: This blog post provides a summary of Sara’s workshop in the MZES Social Science Data Lab. The original workshop materials, including slides and scripts, are available from GitHub.
A live recording of the workshop is available on our YouTube Channel.
Have you ever wanted just a little more wiggle room in your models? You suspect that a covariate is not linearly related to the response on some scale, but rather has a curved relationship to the response. Then a generalized additive model is for you! A generalized additive model (or GAM) is just a more flexible generalized linear model.
To estimate GAMs, you need the following packages:
# For GAMs library(mgcv) # And to deal with the data library(tidyverse) library(lubridate)
And load the data:
data <- read.csv("https://raw.githubusercontent.com/sastoudt/MZES_GAMs/main/Data/dailyStops.csv")
As a starting point, we will consider this generalized linear model of daily number of traffic stops (for more context on the data, go to the source). We get a different level per day of the week and let there be a linear trend across months.
generalized_linear_model <- glm( daily_num_stops ~ driver_race + post_policy + driver_race * post_policy + day_of_week + month, data = data, family = "quasipoisson" )
generalized_linear_model
summary(generalized_linear_model) ## ## Call: ## glm(formula = daily_num_stops ~ driver_race + post_policy + driver_race * ## post_policy + day_of_week + month, family = "quasipoisson", ## data = data) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -26.916 -5.885 -1.194 4.269 38.670 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 5.551688 0.031241 177.707 <2e-16 *** ## driver_raceHispanic -1.997720 0.063874 -31.276 <2e-16 *** ## driver_raceWhite 0.698847 0.026995 25.888 <2e-16 *** ## post_policy 0.302333 0.028405 10.644 <2e-16 *** ## day_of_weekMonday -0.251005 0.028354 -8.852 <2e-16 *** ## day_of_weekSaturday 0.007252 0.026407 0.275 0.7836 ## day_of_weekSunday -0.340416 0.029000 -11.738 <2e-16 *** ## day_of_weekThursday -0.360710 0.029269 -12.324 <2e-16 *** ## day_of_weekTuesday -0.347524 0.029152 -11.921 <2e-16 *** ## day_of_weekWednesday -0.265579 0.028472 -9.328 <2e-16 *** ## month -0.004370 0.002278 -1.918 0.0551 . ## driver_raceHispanic:post_policy 0.029594 0.081672 0.362 0.7171 ## driver_raceWhite:post_policy -0.037925 0.034789 -1.090 0.2757 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for quasipoisson family taken to be 67.57155) ## ## Null deviance: 998623 on 4378 degrees of freedom ## Residual deviance: 284627 on 4366 degrees of freedom ## AIC: NA ## ## Number of Fisher Scoring iterations: 5
But it seems plausible that the number of traffic stops doesn’t just increase or decrease across months, but maybe fluctuates back and forth. Similarly, by fitting a different coefficient for each day of the week, we’re losing out on potential information, like we expect Monday to be more like Tuesday than Friday. That’s where GAMs (in the package mgcv
) can help us.
data$day_of_week_num <- as.numeric(as.factor(data$day_of_week)) ## now want to allow smoothness across days of week rather than ## a separate coefficient for each day generalized_additive_model <- mgcv::gam( daily_num_stops ~ driver_race + post_policy + driver_race * post_policy + s(day_of_week_num, bs = "cc", k = 4) + s(month, bs = "cc"), data = data, family = "quasipoisson" )
generalized_additive_model
summary(generalized_additive_model) ## ## Family: quasipoisson ## Link function: log ## ## Formula: ## daily_num_stops ~ driver_race + post_policy + driver_race * post_policy + ## s(day_of_week_num, bs = "cc", k = 4) + s(month, bs = "cc") ## ## Parametric coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 5.30945 0.02275 233.336 <2e-16 *** ## driver_raceHispanic -1.99774 0.06577 -30.375 <2e-16 *** ## driver_raceWhite 0.69885 0.02780 25.142 <2e-16 *** ## post_policy 0.29755 0.02928 10.163 <2e-16 *** ## driver_raceHispanic:post_policy 0.02961 0.08410 0.352 0.725 ## driver_raceWhite:post_policy -0.03792 0.03582 -1.059 0.290 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Approximate significance of smooth terms: ## edf Ref.df F p-value ## s(day_of_week_num) 1.976 2 72.256 < 2e-16 *** ## s(month) 6.550 8 2.237 0.00648 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## R-sq.(adj) = 0.564 Deviance explained = 70.1% ## GCV = 68.554 Scale est. = 71.641 n = 4379
Now what’s changed here? The syntax looks almost identical except for those s()
functions around day_of_week_num
and month
. These tell the model that we want a smooth relationship between those covariates and the response.
We need to specify what general shape that smooth relationship will take and what dimension it needs to be. The bs
argument stands for basis, which you can think about as a core set of curves that you stack and paste together via various linear combinations to add up to your resulting smooth relationship. We need some structure rather than fitting an arbitrary curve to points.
In this case we use "cc"
, denoting a cyclic basis, that constrains the beginning and end of the curve to line up. It is particularly useful for time-series-esque variables where we don’t want any discontinuities from Monday to Monday or from January to January. Check out the talk recording for more info about other options like "cr"
for cubic regression and "cs"
for cubic shrinkage bases.
We also need to decide how big the model space (the \(k\) parameter) for the smooth is. The model fit is robust to choosing too big of a dimension although the computational complexity will increase, but if we choose something too small we may have constrained the fit too much. More on that in a bit.
With great flexibility comes great responsibility. We can evaluate whether we have fit the model well or if we need further tuning using a variety of diagnostics available to us in R.
Conceptually, we want to compare the effective degrees of freedom (edf
) to the maximum dimension (\(k'\)) and make sure they aren’t too close. I’ll hand-wave and say there is a hypothesis test that will hint at the need to increase \(k\). If the p-value is too small, we may want to increase \(k\).
#gam.check(generalized_additive_model) k.check(generalized_additive_model) ## k' edf k-index p-value ## s(day_of_week_num) 2 1.975597 1.505580 1 ## s(month) 8 6.549638 1.464567 1
Alternatively, we can fit the model with different \(k\) values and do a sensitivity analysis.
Once we fit the model, we may want to look at the actual fitted smooths. We also want to be wary of extrapolation (dangerous with lines, even more problematic with wiggles everywhere), so it is helpful to use the rug = T
argument to plot the data on the same plot as the smooth itself. This way we can disregard parts of the smooth that aren’t informed by much data.
plot(generalized_additive_model, rug = T)
At this point, you’ll have to rely on your domain knowledge of the data. Here it seems reasonable to have a peak around Friday and Saturday and within the summer months. (Note: the data is overlapping with the tick marks, so the “rug” is a little hidden.)
If this post has you excited about GAMs, I give some more tips in my talk.
gam
call taking too long? Try a “big” GAM: bam
.s()
for a tensor product te()
.bs = "cr"
for bs = "cs"
.te()
using ti()
to see individual smooths and the leftover relationship between the two numerical covariates.Sara Stoudt is a statistician with research interests in ecology and the communication of statistics. Sara received her bachelor’s degree in mathematics and statistics from Smith College and a doctorate in statistics from the University of California, Berkeley. Her graduate work involved evaluating species distribution and abundance models under model mis-specification. While at Berkeley she was also a Berkeley Institute for Data Science Fellow and worked with Professor Deborah Nolan to teach statistical writing. Keep an eye out for their forthcoming book, Communicating with Data: The Art of Writing for Data Science.
The programme and abstract booklet for the 2021 Insurance Data Science (16 - 18 June) conference (fka R in Insurance) is now out!
We are very excited for a great range of speakers and topics.
Previous
Next
Page: /
Download the Programme and Abstract Booklet
Book your ticket for this three half day event ...
The programme and abstract booklet for the 2021 Insurance Data Science (16 – 18 June) conference (fka R in Insurance) is now out! We are very excited for a great range of speakers and topics.
Download the Programme and Abstract Booklet
Book your ticket for this three half day event by 9 June on Eventbrite.
The conference fee for the Insurance Data Science conference is:
Thanks to our sponsors Swiss Re Institute and Mirai Solutions GmbH!
For more information visit: https://insurancedatascience.org
A couple of years ago, I wrote about the paradoxical difficulty of visualising a difference in means between two groups, while showing both the data and some uncertainty interval. I still feel like many ills in science come from our inability to interpret as simple comparison of means. Anything with more than two groups or a predictor that isn’t categorical makes things worse, of course. It doesn’t take much to overwhelm the intuition.
My suggestion at the time was something like this — either a panel that shows the data an another panel with coefficients and uncertainty intervals; or a plot that shows the with lines that represent the magnitude of the difference at the upper and lower limit of the uncertainty interval.
Alternative 1 (left), with separate panels for data and coefficient estimates, and alternative 2 (right), with confidence limits for the difference shown as vertical lines. For details, see the old post about these graphs.
Here is the fake data and linear model we will plot. If you want to follow along, the whole code is on GitHub. Group 0 should have a mean of 4, and the difference between groups should be two, and as the graphs above show, our linear model is not too far off from these numbers.
library(broom) data <- data.frame(group = rep(0:1, 20)) data$response <- 4 + data$group * 2 + rnorm(20) model <- lm(response ~ factor(group), data = data) result <- tidy(model)
Since the last post, a colleague has told me about the Gardner-Altman plot. In a paper arguing that confidence intervals should be used to show the main result of studies, rather than p-values, Gardner & Altman (1986) introduced plots for simultaneously showing confidence intervals and data.
Their Figure 1 shows (hypothetical) systolic blood pressure data for a group of diabetic and non-diabetic people. The left panel is a dot plot for each group. The right panel is a one-dimensional plot (with a different scale than the right panel; zero is centred on the mean of one of the groups), showing the difference between the groups and a confidence interval as a point with error bars.
There are functions for making this kind of plot (and several more complex alternatives for paired comparisons and analyses of variance) in the R package dabestr from Ho et al. (2019). An example with our fake data looks like this:
Alternative 3: Gardner-Altman plot with bootstrap confidence interval.
library(dabestr) bootstrap <- dabest(data, group, response, idx = c("0", "1"), paired = FALSE) bootstrap_diff <- mean_diff(bootstrap) plot(bootstrap_diff)
While this plot is neat, I think it is a little too busy — I’m not sure that the double horizontal lines between the panels or the shaded density for the bootstrap confidence interval add much. I’d also like to use other inference methods than bootstrapping. I like how the scale of the right panel has the same unit as the left panel, but is offset so the zero is at the mean of one of the groups.
Here is my attempt at making a minimalistic version:
Alternative 4: Simplified Garner-Altman plot.
This piece of code first makes the left panel of data using ggbeeswarm (which I think looks nicer than the jittering I used in the first two alternatives), then the right panel with the estimate and approximate confidence intervals of plus/minus two standard errors of the mean), adjusts the scale, and combines the panels with patchwork.
library(ggbeeswarm) library(ggplot2) ymin <- min(data$response) ymax <- max(data$response) plot_points_ga <- ggplot() + geom_quasirandom(aes(x = factor(group), y = response), data = data) + xlab("Group") + ylab("Response") + theme_bw() + scale_y_continuous(limits = c(ymin, ymax)) height_of_plot <- ymax-ymin group0_fraction <- (coef(model)[1] - ymin)/height_of_plot diff_min <- - height_of_plot * group0_fraction diff_max <- (1 - group0_fraction) * height_of_plot plot_difference_ga <- ggplot() + geom_pointrange(aes(x = term, y = estimate, ymin = estimate - 2 * std.error, ymax = estimate + 2 * std.error), data = result[2,]) + scale_y_continuous(limits = c(diff_min, diff_max)) + ylab("Difference") + xlab("Comparison") + theme_bw() (plot_points_ga | plot_difference_ga) + plot_layout(widths = c(0.75, 0.25))
Literature
Gardner, M. J., & Altman, D. G. (1986). Confidence intervals rather than P values: estimation rather than hypothesis testing. British Medical Journal
Ho, J., Tumkaya, T., Aryal, S., Choi, H., & Claridge-Chang, A. (2019). Moving beyond P values: data analysis with estimation graphics. Nature methods.
Using R and the anova
function we can easily compare nested models. Where we are dealing with regression models, then we apply the F-Test
and where we are dealing with logistic regression models, then we apply the Chi-Square Test
. By nested, we mean that the independent variables of the simple model will be a subset of the more complex model. In essence, we try to find the best parsimonious fit of the data. Note that we should fit the models on the same dataset.
The Null Hypothesis is that the simple model is better, and we reject the null hypothesis if the p-value is less than 5%, inferring that the complex model is significantly better than the simple one.
Let’s work with the LifeCycleSavings
dataset by considering as dependent variable the sr
and the rest as independent variables (IV).
Let’s say that we want to compare the following two models:
fit0
which is the \(sr = \alpha\) VSfit1
which is the \(sr = \alpha +\beta \times pop15\)fit0 <- lm(sr ~ 1, data = LifeCycleSavings) fit1 <- lm(sr ~ pop15, data = LifeCycleSavings) summary(fit0) summary(fit1)
Notice that the P-value of the F-Test of the fit1
model is 0.0008866 which actually tests the Null Hypothesis that “all the beta coefficients are zero” versus the alternative hypothesis that “at least one beta coefficient is not zero”. Since we have only one beta coefficient, the pop15
the p-value of the F-Test is the same with the p-value of the T-Test as we can see above.
Now, if we compare the fit0
vs the fit1
, in essence, we test if we should include the pop15
coefficient or not, thus we expect to get the same p-value. Let’s compare the nested models using anova
:
anova(fit0, fit1, test='F')
As, expected we got the same p-value, and we can say that we should prefer the fit1
compared to fit0
model.
Let’s make another comparison by comparing the fit1
compared to the fit4
which contains all the IVs.
fit4<-lm(sr~pop15+pop75+dpi+ddpi, data = LifeCycleSavings) summary(fit4)
Let’s compare the two models:
anova(fit1, fit4, test='F')
The p-value is 0.04177 forcing us to reject the null hypothesis that the fit1
model is better. Finally, let’s compare the fit1
model versus the fit3
which contains the first 3 IV of the dataset.
fit3<-lm(sr~pop15+pop75+dpi, data = LifeCycleSavings) anova(fit1, fit3, test='F')
In this case, the p-value is 0.1318 which means that we fail to reject the null hypothesis that the fit1
is better than the fit3
.
Association strength, when the hypothesis of independence of attributes in a contingency table is rejected by performing a chi-square test, ensures the association between two attributes.
In such situations, we are interested in calculating the strength of the association. A measure for this purpose is known as the coefficient of contingency.
This measure of contingency was developed by Karl Pearson in 1904.
The coefficient of contingency is denoted as C.
\(C = \sqrt{\chi^2 / (n + \chi^2)}\)
Where n is the sample size. The value of C lies between 0 and 1 and never attains 1.
C = 0 indicates complete dissociation, while a value near 1 indicates a high degree of association.
In most cases, C is calculated when the null hypothesis is rejected.
Chi square distribution examples
The Chi-Square test in R is a statistical method used to determine if two categorical variables have a significant correlation between them.
The two variables are selected from the same population and labeled as categorically.
Syntax: chisq.test() is a function used to perform the test in R
We reject the null hypothesis if the p-value that comes out in the result is less than a significance level, which is 0.05 usually.
H0: The two variables are independent.
H1: The two variables relate to each other.
data("mtcars") table(mtcars$carb, mtcars$cyl)
Contingency table from mtcars
4 6 8 1 5 2 0 2 6 0 4 3 0 0 3 4 0 4 6 6 0 1 0 8 0 0 1
library(gplots) dt<-table(mtcars$carb, mtcars$cyl) balloonplot(t(dt), main ="Balloon Plot", xlab ="", ylab="", label = FALSE, show.margins = FALSE)
Let’s calculate the chi-square value based on chisq.test function
chisq.test(mtcars$carb, mtcars$cyl)
Pearson’s Chi-squared test
data: mtcars$carb and mtcars$cyl X-squared = 24.389, df = 10, p-value = 0.006632 Warning message: In chisq.test(mtcars$carb, mtcars$cyl) : Chi-squared approximation may be incorrect
Now you can see the warning because some of the cell frequencies are less than 5.
According to the Chi-Square test, a significant difference was observed between the tested attributes. However, we will also check Fisher’s exact test results.
Fisher’s Exact Test for Count Data
fisher.test(mtcars$carb, mtcars$cyl) data: mtcars$carb and mtcars$cyl p-value = 0.0003345 alternative hypothesis: two.sided
There is no change in the inference: a significant difference was observed between the variables. Now we will check the association of the variables.
Let’s visualize Pearson residuals using the package corrplot:
library(corrplot) chisq<-chisq.test(mtcars$carb, mtcars$cyl) corrplot(chisq$residuals, is.cor = FALSE)
For a given cell, the size of the circle is proportional to the amount of the cell contribution.
The sign of the standardized residuals is also very important to interpret the association between rows and columns.
Positive residuals are in blue and Negative residuals are in red. Based on the above plot, association observed as high.
Customer Segmentation K Means Cluster
The Assocs function from DescTools returns different association measures simultaneously.
library(DescTools) Assocs(dt) estimate lwr.ci upr.ci Phi Coeff. 0.8730 - - Contingency Coeff. 0.6577 - - Cramer V 0.6173 0.1778 0.7592 Goodman Kruskal Gamma 0.6089 0.3745 0.8432 Kendall Tau-b 0.4654 0.2846 0.6463 Stuart Tau-c 0.4834 0.2981 0.6687 Somers D C|R 0.4319 0.2650 0.5989 Somers D R|C 0.5015 0.2935 0.7095 Pearson Correlation 0.5746 0.2825 0.7692 Spearman Correlation 0.5801 0.2900 0.7725 Lambda C|R 0.4444 0.1198 0.7691 Lambda R|C 0.2727 0.0000 0.5570 Lambda sym 0.3500 0.1128 0.5872 Uncertainty Coeff. C|R 0.4782 0.3693 0.5870 Uncertainty Coeff. R|C 0.3387 0.2752 0.4022 Uncertainty Coeff. sym 0.3965 0.3220 0.4710 Mutual Information 0.7353 - -
From the above table Contingency Coefficient value observed as 0.6577, indicates a high association between the tested variables.
Based on the above analysis, a significant difference was observed between the variables (ie variables are related) and strength of association between the variable is high.
Principal Component Analysis in R
The post How to Measure Contingency-Coefficient (Association Strength) appeared first on finnstats.
Getting started in #rtistry
Artists in the R community have been using the #rtistry hashtag to demonstrate their gorgeous, dynamic art using only the R programming language. Their creations are amazing and they inspired me to try out generative art ...
Artists in the R community have been using the #rtistry
hashtag to demonstrate their gorgeous, dynamic art using only the R programming language. Their creations are amazing and they inspired me to try out generative art this dreary Sunday.
I am proud to showcase my first #rtistry plot ever! Kinda reminds me of KidPix (remember KidPix?!). I wanted to share how I did it and welcome any feedback or advice, as this is totally new and I’m not even sure if I am doing it right?
As always, we start with the libraries we will use:
# Load packages library(tidyverse) library(viridis) library(ggdark)
First up is figuring out the function you will use to create the plot. I decided to go with a parametric equation for my first #rtistry
plot. A parametric equation of a curve expresses the coordinates of points of a curve as functions of a variable. This blog post explains the concept very clearly and also has various examples of wonderful parametric equations.
Why parametric equations? First, even simple equations can create beautiful symmetries. Second, they are easy to modify to find the perfect plot.
The simplest parametric equation uses cosine and sine to make the unit circle:
# Generate npoints (x, y) positions tracing a circle — the simplest
# parametric curve, built from cosine and sine.
#
# @param center Numeric length-2 vector giving the circle centre (x, y).
# @param diameter Diameter of the circle.
# @param npoints Number of points sampled along the circumference
#   (first and last coincide, closing the curve).
# @return A data.frame with columns `x` and `y`.
circleFun <- function(center = c(0, 0), diameter = 1, npoints = 100) {
  radius <- diameter / 2
  angle <- seq(0, 2 * pi, length.out = npoints)
  data.frame(
    x = center[1] + radius * cos(angle),
    y = center[2] + radius * sin(angle)
  )
}

dat <- circleFun(c(1, -1), 2.3, npoints = 100)
ggplot(dat, aes(x, y)) +
  geom_path()
Let’s write a function to create a parametric equation. I based this equation on the aforementioned blog post equations. The parameters are:
# Parametric curve generator: sample npoints (x, y) positions along a
# closed curve built from nested sine/cosine terms.
#
# @param center Numeric length-2 vector; offset added to every point.
# @param npoints Number of points sampled over one period t in [0, 2*pi].
# @param c1 Overall amplitude of the curve.
# @param c2 Frequency of the outer sine terms.
# @param c3,c4 Multiplied together to set the inner cosine frequency.
# @return A data.frame with columns `x` and `y`.
genFun <- function(center = c(0, 0), npoints = 500, c1 = 2.5, c2 = -5,
                   c3 = 4.28, c4 = 2.3) {
  t <- seq(0, 2 * pi, length.out = npoints)
  s <- sin(c2 * t)
  w <- cos(c3 * c4 * t)
  data.frame(
    x = center[1] + c1 * (s * s) * 2^cos(w),
    y = center[2] + c1 * sin(s) * (w * w)
  )
}
Playing around with the function, we see how the graph gets smoother with more points and how the shape changes with different coefficients.
dat <- genFun(c(1,-1), npoints = 100) ggplot(dat, aes(x, y)) + geom_path()
dat <- genFun(c(1,-1), npoints = 500, c1 = 5, c2 = -3, c3 = 5, c4 = 2) ggplot(dat, aes(x, y)) + geom_path()
Now that we have a basic shape, let’s play around with different aspects of the graph:
We started off with geom_path
but can play around with other geoms too. Here it is with geom_line
:
dat <- genFun(c(1,-1), npoints = 5000) ggplot(dat, aes(x, y)) + geom_line()
And with geom_point
:
set.seed(1234) dat <- genFun(c(1,-1), npoints = 500) dat %>% ggplot(aes(x, y)) + geom_point()
The {ggplot2} package has several aesthetic specifications available for plots. A full list can be found here.
We’re going to go ahead and get rid of all the background using theme_void()
.
Let’s go with geom_point
. In this case, we can start playing around with the aesthetics to see what would look interesting. With geom_point
, you can edit the sizes of the points, so we create a column with random point sizes to create some variation.
set.seed(1111) dat <- genFun(c(1,-1), npoints = 5000) %>% mutate(rand_w = sample(n())/3000) dat %>% ggplot(aes(x, y)) + geom_point(size = dat$rand_w) + theme_void()
We could also change the shape of each of the points, but I liked the circles more:
dat %>% ggplot(aes(x, y)) + geom_point(size = dat$rand_w, shape = 8) + theme_void()
We could also change the opacity of the points:
set.seed(1234) dat <- dat %>% mutate(rand_o = sample(n())/5000) dat %>% ggplot(aes(x, y)) + geom_point(size = dat$rand_w, alpha = dat$rand_o) + theme_void()
We can also create a column for random numbers to ascribe colors to each point. I decided to use the magma color palette from the {viridis} package because it is so vibrant. Now that we’re not using only black points, we can use dark_theme_void()
from the {ggdark} package for a fully black background.
set.seed(1234) dat <- dat %>% mutate(rand_c = sample(n())) dat %>% ggplot(aes(x, y, color = rand_c)) + geom_point(size = dat$rand_w, alpha = dat$rand_o) + scale_color_viridis(option = "magma") + dark_theme_void() + theme(legend.position = "none") # remove legend ## Inverted geom defaults of fill and color/colour. ## To change them back, use invert_geom_defaults().
Notice we added rand_w
, rand_o
, and rand_c
so that we can randomly change up the size, opacity, and color of the plot. Let’s go back to our original generative function and add them as parameters. Now we can change them without having to add them to the data frame externally from the function. (Apologies for the switching back and forth from dplyr to base R!)
# Parametric curve generator with per-point random aesthetics. In addition
# to the (x, y) curve, each point gets a random size, opacity, and colour
# value, so the aesthetics live inside the returned data frame instead of
# being bolted on afterwards.
#
# @param center Numeric length-2 vector; offset added to every point.
# @param npoints Number of points sampled over one period t in [0, 2*pi].
# @param c1,c2,c3,c4 Coefficients shaping the curve (see genFun above).
# @param size_denom Divisor applied to the random point sizes.
# @param opacity_denom Divisor applied to the random opacities.
# @param color_denom Divisor applied to the random colour values.
# @return A data.frame with columns `x`, `y`, `rand_w`, `rand_o`, `rand_c`.
genFun <- function(center = c(0, 0), npoints = 500, c1 = 2.5, c2 = -5,
                   c3 = 4.28, c4 = 2.3, size_denom = 1, opacity_denom = 1,
                   color_denom = 1) {
  t <- seq(0, 2 * pi, length.out = npoints)
  s <- sin(c2 * t)
  w <- cos(c3 * c4 * t)
  xx <- center[1] + c1 * (s * s) * 2^cos(w)
  yy <- center[2] + c1 * sin(s) * (w * w)
  # Draw sizes first, then opacities, then colours: the order of the
  # sample() calls fixes the output under a given RNG seed.
  rand_w <- sample(0:20, npoints, replace = TRUE) / size_denom
  rand_o <- sample(1:100, npoints, replace = TRUE) / opacity_denom
  rand_c <- sample(1:100, npoints, replace = TRUE) / color_denom
  data.frame(x = xx, y = yy, rand_w = rand_w, rand_o = rand_o, rand_c = rand_c)
}
Now playing around with the new parameters, I decided to go with this plot:
set.seed(1111) dat <- genFun(c(0, 0), npoints = 5000, c1 = 5, c2 = -3, c3 = 5, c4 = 2, size_denom = 1.5, opacity_denom = 50) dat %>% ggplot(aes(x, y, color = rand_c)) + geom_point(size = dat$rand_w, alpha = dat$rand_o) + scale_color_viridis(option = "magma") + dark_theme_void() + theme(legend.position = "none") # remove legend
What this allows us to do is change up the generative function and build on our plot. I was interested in rotating the plot around the axis.
dat %>% ggplot() + geom_point(aes(x, y, color = rand_c), size = dat$rand_w, alpha = dat$rand_o) + geom_point(aes(-x, -y, color = rand_c), size = dat$rand_w, alpha = dat$rand_o) + geom_point(aes(-y, x, color = rand_c), size = dat$rand_w, alpha = dat$rand_o) + geom_point(aes(-y, -x, color = rand_c), size = dat$rand_w, alpha = dat$rand_o) + scale_color_viridis(option = "magma") + dark_theme_void() + theme(legend.position = "none") + # remove legend ggsave(here::here("public", "img", "rtistry.png"), dpi = 320, height = 6, width = 8)
Tada! My first ever #rtistry plot.
This was my attempt to create generative art in R! It was fun to figure out how on earth to even get started and see how the plots change with new parameters on different attempts. I welcome any thoughts and please share your art using the #rtistry hashtag with me!
We have started a series of articles on tips and tricks for data scientists (mainly in Python and R). In case you have missed:
There are some differences between Numpy Arrays and Python Lists. We will provide some examples of algebraic operators.
‘+’ Operator
import numpy as np alist = [1, 2, 3, 4, 5] # Define a python list. It looks like an np array narray = np.array([1, 2, 3, 4]) # Define a numpy array print(narray + narray) print(alist + alist)
Output:
[2 4 6 8] [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
Note that the ‘+’ operator on NumPy arrays performs an element-wise addition, while the same operation on Python lists results in a list concatenation. Be careful while coding. Knowing this can save many headaches.
‘*’ Operator
It is the same as with the product operator, *
. In the first case, we scale the vector, while in the second case, we concatenate three times the same list.
print(narray * 3) print(alist * 3)
Output:
[ 3 6 9 12] [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
The dot product or scalar product or inner product between two vectors \(\vec a\) and \(\vec b\) of the same size is defined as:
\(\vec a \cdot \vec b = \sum_{i=1}^{n} a_i b_i\)
The dot product takes two vectors and returns a single number.
nparray1 = np.array([0, 1, 2, 3]) # Define an array nparray2 = np.array([4, 5, 6, 7]) # Define an array flavor1 = np.dot(nparray1, nparray2) # Recommended way print(flavor1) flavor2 = np.sum(nparray1 * nparray2) # Ok way print(flavor2) flavor3 = nparray1 @ nparray2 # Geeks way print(flavor3) # As you never should do: # Noobs way flavor4 = 0 for a, b in zip(nparray1, nparray2): flavor4 += a * b print(flavor4)
Output:
38 38 38 38
Another general operation performed on matrices is the sum by rows or columns. Just as we did for the function norm, the axis parameter controls the form of the operation:
nparray2 = np.array([[1, -1], [2, -2], [3, -3]]) # Define a 3 x 2 matrix. sumByCols = np.sum(nparray2, axis=0) # Get the sum for each column. Returns 2 elements sumByRows = np.sum(nparray2, axis=1) # get the sum for each row. Returns 3 elements print('Sum by columns: ') print(sumByCols) print('Sum by rows:') print(sumByRows)
Output:
Sum by columns: [ 6 -6] Sum by rows: [0 0 0]
Assume that you have a list and apart from the list element, you want to get the position of the element, i.e. the index. For this case, the enumerate function will do the trick. For example:
mylist = [10,30,100] for index, element in enumerate(mylist): print(index, element)
Output:
0 10 1 30 2 100
If we want to return a specific number of decimal digits in f-strings we can use the f'{v:.2f}'
where v is our variable. For example:
import numpy as np my_random = np.random.rand(10) for i, n in enumerate(my_random): print(f'The {i} number is {n}')
Now, if we pass the :.2f
we will get 2 decimal places. For example:
for i, n in enumerate(my_random): print(f'The {i} number is {n:.2f}')
We can generate sequences in Python using the range
function but it supports integers only. Of course, we can generate a series of integers and then to divide them elements by a number, like 10, but let’s discuss some more straightforward ways using numpy.
arange
We can generate a series using the numpy arange
as follows:
import numpy as np # start point = 0 , end point (not incl) is 1.05 and step is 0.05 np.arange(0, 1.05, 0.05)
Output:
array([0. , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1. ])
linspace
With numpy linspace
we can specify the starting point, the ending point and the number of points to be returned and then it estimates the steps.
np.linspace(0,1, 21)
Output:
array([0. , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1. ])
The new line feeds can be of the form \n
or \r
or both. We can remove them from a string using the strip()
method.
'this is a string\n'.strip()
Output:
'this is a string'
Note that there is also the lstrip()
and rstrip()
for left and right stripping of the whitespaces in text. We can also be more specific by typing:
'this is a string\n'.strip("\n")
A “special” data structure in R is the “factors”. We are going to provide some examples of how we can rename and relevel the factors. For the next examples, we will work with the following data:
# Example data: 10 respondents with a Gender factor (empty string = unknown)
# and an AgeGroup factor ("[NA]" marks a missing age band).
df <- data.frame(
  ID = 1:10,
  Gender = factor(c("M", "M", "M", "", "F", "F", "M", "", "F", "F")),
  AgeGroup = factor(c("[60+]", "[26-35]", "[NA]", "[36-45]", "[46-60]",
                      "[26-35]", "[NA]", "[18-25]", "[26-35]", "[26-35]"))
)
Output:
> df ID Gender AgeGroup 1 1 M [60+] 2 2 M [26-35] 3 3 M [NA] 4 4 [36-45] 5 5 F [46-60] 6 6 F [26-35] 7 7 M [NA] 8 8 [18-25] 9 9 F [26-35] 10 10 F [26-35]
Rename Factors
Let’s say that I want to convert the empty string of Gender to “U” from the Unknown.
# Relabel the empty-string Gender level as "U" (for Unknown).
is_blank <- levels(df$Gender) == ""
levels(df$Gender)[is_blank] <- "U"
Let’s say that we want to merge the age groups. For instance the new categories will be “[18-35]”, “[35+], “[NA]”
# Collapse the fine-grained age bands into "[18-35]" and "[35+]".
# Renaming two levels to the same label merges them into one.
age_map <- c(
  "[18-25]" = "[18-35]",
  "[26-35]" = "[18-35]",
  "[36-45]" = "[35+]",
  "[46-60]" = "[35+]",
  "[60+]"   = "[35+]"
)
for (old_level in names(age_map)) {
  levels(df$AgeGroup)[levels(df$AgeGroup) == old_level] <- age_map[[old_level]]
}
Notice that we could have done it all at once, but that is very risky because the levels may be in a different order than we expected.
# Risky shortcut: assigns the new labels purely by position, so it is only
# correct if the current level order is exactly
# "[18-25]", "[26-35]", "[36-45]", "[46-60]", "[60+]", "[NA]".
levels(df$AgeGroup)<-c("[18-35]","[18-35]","[35+]","[35+]","[35+]", "[NA]")
By applying the changes we mentioned before, we get the following data.
> df ID Gender AgeGroup 1 1 M [35+] 2 2 M [18-35] 3 3 M [NA] 4 4 U [35+] 5 5 F [35+] 6 6 F [18-35] 7 7 M [NA] 8 8 U [18-35] 9 9 F [18-35] 10 10 F [18-35]
Relevel Factors
Let’s say that we want the “[NA]” age group to appear first
# Put "[NA]" first by re-specifying the level order explicitly.
new_order <- c("[NA]", "[18-35]", "[35+]")
df$AgeGroup <- factor(df$AgeGroup, levels = new_order)
Another way to change the order is to use relevel()
to make a particular level first in the list. (This will not work for ordered factors.) Let’s say that we want the ‘F’ Gender first.
# Move "F" to the front of the level list (does not work on ordered factors).
df$Gender <- relevel(df$Gender, ref = "F")
By applying these changes, we can see how the factors have changed level.
> str(df) 'data.frame': 10 obs. of 3 variables: $ ID : int 1 2 3 4 5 6 7 8 9 10 $ Gender : Factor w/ 3 levels "F","U","M": 3 3 3 2 1 1 3 2 1 1 $ AgeGroup: Factor w/ 3 levels "[NA]","[18-35]",..: 3 2 1 3 3 2 1 2 2 2
In the real data world, it is quite common to deal with Missing Values (known as NAs). Sometimes, there is a need to impute the missing values where the most common approaches are:
Let’s give an example of how we can impute dynamically depending on the data type.
library(tidyverse)

# Toy data with missing values: NAs in the numeric columns (ColumnA, ColumnD),
# empty strings in the factor columns (ColumnB, ColumnC).
df <- tibble(
  id = seq(1, 10),
  ColumnA = c(10, 9, 8, 7, NA, NA, 20, 15, 12, NA),
  ColumnB = factor(c("A", "B", "A", "A", "", "B", "A", "B", "", "A")),
  ColumnC = factor(c("", "BB", "CC", "BB", "BB", "CC", "AA", "BB", "", "AA")),
  ColumnD = c(NA, 20, 18, 22, 18, 17, 19, NA, 17, 23)
)
df
We get:
# A tibble: 10 x 5 id ColumnA ColumnB ColumnC ColumnD1 1 10 "A" "" NA 2 2 9 "B" "BB" 20 3 3 8 "A" "CC" 18 4 4 7 "A" "BB" 22 5 5 NA "" "BB" 18 6 6 NA "B" "CC" 17 7 7 20 "A" "AA" 19 8 8 15 "B" "BB" NA 9 9 12 "" "" 17 10 10 NA "A" "AA" 23
For the Categorical Variables, we are going to apply the “mode” function, which we have to build ourselves since it is not provided by R.
# Most frequent non-missing, non-empty value of a vector (the "mode").
#
# v: an atomic vector or factor; NA values and empty strings are ignored.
# Returns the most frequent remaining value (first one on ties), or NA
# when nothing is left to tally.
getmode <- function(v) {
  # NOTE: nchar(as.character(NA)) is NA, so subsetting on nchar() alone
  # leaks NA elements into the tally (v[NA] keeps an NA entry) -- an
  # explicit is.na() filter is required.
  v <- v[!is.na(v) & nchar(as.character(v)) > 0]
  if (length(v) == 0) {
    return(NA)  # avoids which.max() on an empty tabulation
  }
  uniqv <- unique(v)
  # match() maps each value to its position in uniqv; tabulate() counts them.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
Now that we have the “mode” function we are ready to impute the missing values of a dataframe depending on the data type of the columns. Thus, if the column data type is “numeric” we will impute it with the “mean” otherwise with the “mode“. Notice that in our script we take into account the column names and “dplyr” package requires a special notation (!!cols : = !!rlang::sym(colname)) of selecting dynamically the column names.
# Impute missing values column by column: numeric columns get their mean,
# everything else gets the mode (empty strings are treated as missing).
# The !!sym / := notation lets dplyr work with a column name held in a
# character variable.
for (cols in colnames(df)) {
  col_sym <- rlang::sym(cols)
  if (is.numeric(df[[cols]])) {
    df <- df %>%
      mutate(!!cols := replace(!!col_sym,
                               is.na(!!col_sym),
                               mean(!!col_sym, na.rm = TRUE)))
  } else {
    df <- df %>%
      mutate(!!cols := replace(!!col_sym,
                               !!col_sym == "",
                               getmode(!!col_sym)))
  }
}
df
Voilà! The missing values have been imputed!
> df # A tibble: 10 x 5 id ColumnA ColumnB ColumnC ColumnD1 1 10 A BB 19.2 2 2 9 B BB 20 3 3 8 A CC 18 4 4 7 A BB 22 5 5 11.6 A BB 18 6 6 11.6 B CC 17 7 7 20 A AA 19 8 8 15 B BB 19.2 9 9 12 A BB 17 10 10 11.6 A AA 23
In the previous post, we showed how we can assign values in Pandas Data Frames based on multiple conditions of different columns.
Again we will work with the famous titanic
dataset and our scenario is the following:
Age
is NA
and Pclass
=1 then the Age=40Age
is NA
and Pclass
=2 then the Age=30Age
is NA
and Pclass
=3 then the Age=25Age
will remain as isLoad the Data
library(dplyr)

# Tab-separated copy of the Titanic dataset hosted as a GitHub gist.
url <- paste0(
  "https://gist.githubusercontent.com/michhar/",
  "2dfd2de0d4f8727f873422c5d959fff5/raw/",
  "ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv"
)
df <- read.csv(url, sep = "\t")
Use of case_when function of dplyr
For this task, we will use the case_when
function of dplyr
as follows:
# Fill missing ages with a class-specific default; the TRUE ~ Age fallback
# carries known ages over unchanged.
df <- df %>%
  mutate(New_Column = case_when(
    is.na(Age) & Pclass == 1 ~ 40,
    is.na(Age) & Pclass == 2 ~ 30,
    is.na(Age) & Pclass == 3 ~ 25,
    TRUE ~ Age
  ))
Let’s have a look at the Age, Pclass and the New_Column that we created.
df%>%select(Age, Pclass, New_Column)
As we can see we get the expected results.
Age Pclass New_Column 1 22.00 3 22.00 2 38.00 1 38.00 3 26.00 3 26.00 4 35.00 1 35.00 5 35.00 3 35.00 6 NA 3 25.00
Power analysis in Statistics, there is a probability of committing an error in making a decision about a hypothesis.
Hence two types of errors can occur in hypothesis, Type I error and Type II Error.
The probability of Type I error is denoted as α and the probability of Type II error is β.
Type 1 Error:- p(reject H_{0}/H_{0} is true)=α
Type II Error:- p(accept H_{0}/H_{1} is true)=β
Type I error is generally considered more serious than Type II error. Hence α is kept relatively smaller than β.
For testing a hypothesis H_{0} against H_{1}, the test with probabilities α and β of Type I and Type II errors respectively, the quantity (1- β) is called the power of the test.
The power of the test depends upon the difference between the parameter value specified by H_{0} and the actual value of the parameter.
The level of significance may be defined as the probability of Type I error which is ready to tolerate in making a decision about H_{0}.
The size of a non-randomized test is defined as the size of the critical region. Practically, it is numerically the same as the level of significance.
Power analysis is one of the important aspects of experimental design. It allows us to determine the sample size required to detect an effect of a given size with a given degree of confidence.
And it allows us to determine the probability of detecting an effect of a given size with a given level of confidence, under-sample size constraints.
If the power of the test is too low, we ignore conclusions arrived at from the data set.
In R, the following parameters required to calculate the power analysis
If we have any of the three parameters given above, we can calculate the fourth one.
Logistic Regression R tutorial
Following table provide the power calculations for different types of analysis.
Function | Power Calculation For |
pwr.2p.test | two proportions equal n |
pwr.2p2n.test | two proportions unequal n |
pwr.anova.test | balanced one way anova |
pwr.chisq.test | chi square test |
pwr.f2.test | general linear model |
pwr.p.test | proportion one sample |
pwr.r.test | correlation |
pwr.t.test | t-tests (one sample, 2 samples, paired) |
pwr.t2n.test | t-test (two samples with unequal n) |
The significance level α defaults to 0.05.
Finding the effect size is one of the difficult tasks; your subject-matter expertise needs to be brought to bear here. Cohen gives the following guidelines for the social sciences. For more details about effect sizes you can refer here
Effect size | Cohen’s w |
Small | 0.10 |
Medium | 0.30 |
Large | 0.50 |
Here are some examples carried out in R
library(pwr)
For a one-way ANOVA comparing 4 groups, calculate the sample size needed in each group to obtain a power of 0.80, when the effect size is moderate (0.25) and a significance level of 0.05 is employed.
pwr.anova.test(k=4,f=.25,sig.level=.05,power=.8)
Balanced one-way analysis of variance power calculation
k = 4 n = 44.59927 f = 0.25 sig.level = 0.05 power = 0.8 NOTE: n is number in each group
What is the power of a one-tailed t-test, with a significance level of 0.05, 12 people in each group, and an effect size equal to 0.75?
Principal Component Analysis in R
pwr.t.test(n=12,d=0.75,sig.level=.05,alternative="greater")
Two-sample t test power calculation
n = 12 d = 0.75 sig.level = 0.05 power = 0.5538378 alternative = greater NOTE: n is number in *each* group
Using a two-tailed test proportions, and assuming a significance level of 0.05 and a common sample size of 20 for each proportion, what effect size can be detected with a power of .75?
pwr.2p.test(n=20,sig.level=0.05,power=0.75)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.8331021 n = 20 sig.level = 0.05 power = 0.75 alternative = two.sided NOTE: same sample sizes
Read more about Exploratory analysis in R
The post Power analysis in Statistics with R appeared first on finnstats.
Obviously, it seems absolutely essential to put a lot of resources into ramping up world-wide vaccine production with the goal to ensure that quickly almost everybody in the world has the chance to become vaccinated. But is a waiver of patents for Covid-19 vaccines, as the US government recently supported a good step towards this direction?
In the end nobody can know for sure, but such a patent waiver seems to me a very risky strategy that may well slow down worldwide vaccination speed. Intuitively, I feel that the counter-arguments (e.g. here) are quite reasonable. I just wanted to write down some thoughts on the topic, without claiming to know the best strategy.
This is how Bloomberg quotes the German government’s position towards Biden’s proposal:
“The limiting factor for the production of vaccines are manufacturing capacities and high quality standards, not the patents,” the German government spokeswoman said.
Keeping high production standards requires costly investments by firms, in particular when producing mRNA vaccines. Patents make it much more likely that prices can be achieved that allow to recover those investment costs. If patents are waived, it is really not obvious whether indeed more investments into high quality production capacities take place. Also most observers assume that such investment would take considerable time. How sure are we that this would be faster than letting existing patent protected producers further expand their production network?
Maybe in such a crisis more production capacities may be beneficial even if the production standard is lowered? That might then be achieved faster and at a larger scale. Yet, that seems a very risky strategy. For example, there were recent news reports that in several African countries there is substantial reluctance against vaccinations that may even lead to destruction of expired vaccines that could not be vaccinated quickly enough.
But what will be the effect if absent patents some novel producers lower production standards so that we have increased risk of bad vaccine batches? One can well imagine how such events may substantially reduce worldwide willingness to get vaccinated…
My feeling that guaranteeing vaccine producers sufficient profits may be the best way to quickly ramp-up worldwide supply is also strongly influenced by our European experience of supply reliability of Biontech/Pfizer vs AstraZeneca.
According to leaked price lists the EU paid less than 2 Euro for an AstraZeneca jab but 12 Euro or 15 Euro for a Biontech/Pfizer jab.
But the EU states that in the first quarter of 2021 AstraZeneca delivered only 30 million out of contracted 90 million jabs to the EU and in the 2nd quarter probably only 70 out of 180 million. In contrast, Biontech/Pfizer seemed to have substantially increased supply beyond initial agreements (see e.g. here) and now expects to produce 3 billion shots in 2021.
Sure, there may be many reasons for those outcomes, e.g. possibly a decision of AstraZeneca to preferably honor its contract with the UK at cost of its contract with the EU. But it suggests that too low profit margins may deteriorate incentives or abilities to effectively expand production.
But what about the worry that poorer countries just cannot afford the prices vaccine producers want to charge? Of course, there are extremely good arguments that richer countries should strongly support poorer countries to buy vaccines, e.g. through initiatives like Covax.
That current volumes of Covax vaccines are still low might be due to the fact that, to different extent, western countries first wanted to secure enough vaccines themselves. E.g. here is Statista figure of domestic use and exports of vaccines for different important players:
Given that the US and UK now already reached high vaccination rates, they will probably soon join those countries that are willing to export substantial amounts of vaccines. This hopefully will speed up vaccine supply in other countries. It probably also would help strongly if access barriers to necessary raw materials for vaccine productions would be relaxed.
Sure, a patent waiver might be cheaper for rich countries’ public budgets than helping poorer countries to buy vaccines from the original producers. But would costs without a patent waiver be really so much higher?
Let us consider the 15.50 Euro Biontech/Pfizer may have charged the EU per dosage. Buying at this price two dosages for everyone of the 1.370 billion inhabitants of Africa would cost 42.5 billion Euro. In comparison, here is a study that estimates the economic costs of two months lock down just in Germany to be up to ten-fold this amount (between 255 and 495 billion Euro).
Prices of non mRNA vaccines are much cheaper. And, at least for smaller quantities, Biontech/Pfizer already agreed to sell vaccines to Covax at cost. (Of course, it is not clear how scalable that pricing scheme is. Recall my argument from above that allowing sufficient profit margins likely provides much better incentives to expand production capacities.) Even if patents are waived prices probably won’t fall below production costs and it is not obvious why costs should be lower for new market entrants.
Well, in theory one can imagine a lot, like greedy vaccine makers that leverage their patents to make as much profit as possible, e.g. by limiting licensing to create under-supply that increases prices. But I really didn’t see any evidence suggesting such behavior at all. In my view, most vaccine developers are rather heroes without whom the world would be in much worse shape for years to come.
Let us run some R code to look at the development of stock prices for Pfizer and Biontech:
library(quantmod)

# Download daily prices for Pfizer and BioNTech from Yahoo Finance.
getSymbols(c("PFE", "BNTX"), src = "yahoo",
           from = "2020-01-01", to = "2021-05-07")

# Column 4 of each returned xts object holds the closing price.
stocks <- cbind(biontech = BNTX[, 4], pfizer = PFE[, 4])

library(ggplot2)
ggplot(stocks, aes(x = Index)) +
  scale_y_continuous() +
  geom_line(aes(y = biontech), color = "blue") +
  geom_line(aes(y = pfizer), color = "black") +
  ylab("") +
  xlab("Date") +
  theme_bw()
The Pfizer stock price (black line) has almost not changed at all compared to January 2020. Sure Biontech’s stock price (blue line) has increased roughly 5 fold. But is that really excessive for a start-up company with such massive impact?
Also note that even firms that have nothing to do with vaccine production had large stock price increases during the crisis. E.g. Amazon’s stock price nearly doubled in the same time interval.
Should we really make a world-wide agreement that reduces profits of vaccine producers? The pharmaceutical business is extremely risky. Many attempts to create vaccines or other drugs fail and can create substantial losses. Of course, if a company is lucky, one drug can make large profits. But without this chance to compensate for the losses of unsuccessful drugs, I strongly fear that development of future drugs and vaccines may substantially slow down.
While in general, I am quite skeptical about excessive intellectual property rights (e.g. the idea of software patents just feels completely wrong to me), drug & vaccine development is the one sector where I really see the need for IP to compensate for the huge development costs and risks.
So for the pharmaceutical sector, I would again agree with the statement of the German government towards Biden’s proposal:
The protection of intellectual property is a source of innovation and this has to remain so in the future.
R Consortium’s R User Group and Small Conference Support Program (RUGS) provides grants to help R groups around the world organize, share information and support each other. The wealth of knowledge in the community and the drive to learn and improve is inspiring. With the wide disruption of events due to COVID-19 this year, we are interested in how groups are getting on despite the disruption in normal communication. The pandemic has focused our interest to look at how we communicate both at conferences as well as communicating with people outside of the main events while navigating the new challenges that the pandemic has created.
We talked with Andrew Collier, Data Scientist, Exegetic Analytics, and satRday conference organizer, to find out how the R community in South Africa is faring.
RC: How has COVID affected your ability to connect with members?
Well, we have run our conference for four years in succession. Actually, in 2020 we had the conference on the day that they announced the first official COVID case in South Africa. It was an in-person conference, but basically it was immediately before lockdown. That’s completely derailed our plans to do anything this year. I don’t really think I have the appetite for organizing an online conference. If I’m going to organize a conference I want to see people in the flesh.
That’s on the conference side, but as far as the meetups are concerned they are still soldiering on. Actually, I gave a talk for R-Ladies Cape Town towards the end of last year and they are still holding fairly regular meetings. It doesn’t seem to have a massive impact on them. If anything, I suspect that they are finding the logistics of organizing the meetings easier because they don’t have to organize a venue or eats or that kind of thing.
RC: How did the technology to connect change over the last year?
It varies. I know that we did the talks to R-Ladies Cape Town on Google Meet. But, I’ve also seen them using Zoom, and I’ve also seen them using Microsoft Teams as well. All kinds of platforms.
RC: Can you tell us about one recent presentation or speaker that was especially interesting and what was the topic and why was it so interesting?
If I could pick out a talk I really enjoyed, and was spectacularly prescient at the time, it would be a talk at satRday last year where Robert Bennetto told us about COVID and what was going to happen before it happened!. So he had done a bunch of modeling, and everyone else was thinking “Hum, what’s the COVID thing all about,” and he stood up on stage and told us and within days what we had been told at the conference was our new reality. People didn’t see the relevance at the time, and they didn’t realize the enormity of what was about to happen.
RC: Do you know of any data journalism efforts by your members? If not, are there particular data journalism projects that you’ve seen in the last year that you feel had a positive impact on society?
I’m not aware of anyone who has been to the conference who is a data journalist. I have a couple of contacts in Cape Town who are data journalists but they don’t use R as their main platform. The other thing that I have seen a lot of journalists do is that they go so far in R but then they take it across to another tool … to tweak the presentation. Like take it across into inkscape, and then actually build the visualization in inkscape, which to my mind kinda defeats the object of having a reproducible system if component of it has to be done by hand.
RC: When is your next event? Please give details!
We don’t have any immediate plans. I don’t think we are doing it this year. February, March, April worked out to be a good time relative to school terms and public holidays and things. It was a time where we could find a weekend when most people didn’t have any commitments. And the rest of the year that tended to be tricky. If it was to happen again it would be the same time next year.
RC: On social distancing on possible future events
I would prefer not to do that, but I think that realistically we would have to. Unfortunately it does make social interactions more difficult.
RC: Of the Active Working Groups, which is your favorite? Why is it your favorite?
The R-Ladies group is my favorite, and I am a little bit awed by how successful that group is. I have tried to run meetups myself, and it’s hard work. And it’s not particularly rewarding because at least where I live people will sign up for a meeting, and say I’m coming and on paper you’ll have 50 people coming to your meeting, and you’ll arrive having catered and only a few people arrive. I’m not exaggerating, that’s a realistic scenario. So, that’s a little bit disheartening. But the R-Ladies are well attended so, I don’t know what the secret sauce is, but it does work.
R Consortium’s R User Group and Small Conference Support Program (RUGS) provides grants to help R groups around the world organize, share information and support each other. We have given grants over the past 4 years, encompassing over 65,000 members in 35 countries. We would like to include you! Cash grants and meetup.com accounts are awarded based on the intended use of the funds and the amount of money available to distribute. We are now accepting applications!
The post Spectacularly Prescient – satRday South Africa Covered COVID Data Before Pandemic Hit appeared first on R Consortium.
All the general advantages of containerized applications apply to Shiny apps. Docker provides isolation to applications. Images are immutable: once built, an image cannot be changed, and if the app is working, it will work the same in the future. Another important consideration is scaling. Shiny apps are single threaded, but running multiple instances of the same image can serve many users at the same time. Let's dive into the details of how to achieve this.
This post is based on the analythium/shinyproxy-hello GitLab project. Read about the Shiny example and basic docker concepts before continuing. The project contains the demo Shiny app in the app
folder:
. ├── app │ ├── global.R │ ├── server.R │ └── ui.R ├── .gitignore ├── .gitlab-ci.yml ├── Dockerfile ├── LICENSE ├── README.md └── shinyproxy-hello.Rproj
First you'll learn how to work with an existing image.
You can pull the image made from the GitLab project's container registry using the docker pull
CLI command (you need to have the Docker Desktop running on our local machine):
docker pull registry.gitlab.com/analythium/shinyproxy-hello/hello
The image is tagged as REGISTRY/USER/PROJECT/IMAGE:TAG
. In this case the TAG
is latest
which is the default tag when not specified otherwise. (When the REGISTRY
is not provided, Docker uses the Docker Hub as the default and you can follow the USER/IMAGE:TAG
pattern).
You don't need to authenticate for public images (like this one), but in case you are trying to pull a private image from a private GitLab project, you need to log into the GitLab Container registry as:
docker login registry.gitlab.com
This command will ask for our credentials interactively. If you want, you can provide your username and password. But it is usually recommended to use a personal access token (PAT) instead of your password because PAT can have more restricted scopes, i.e. only used to (read) access the container registry which is a lot more secure.
cat ~/my_password.txt | docker login --username USER --password-stdin
where ~/my_password.txt
is a file with the PAT in it, USER
is the GitLab username.
After pulling the image, you can use docker run
to run a command in a new container based on the image. Use the -p 4000:3838
argument and the image tag:
docker run -p 4000:3838 registry.gitlab.com/analythium/shinyproxy-hello/hello
The -p
is a shorthand for --publish
, that instructs Docker to publish a container’s port to the host port. In our example, 3838 is the container's port which is mapped to the port 4000 of the host machine. As a result, you can visit 127.0.0.1:4000
where you'll find the Shiny app. Hit Ctrl+C to stop the container.
Read all about the docker run
command here. You'll learn about the 3838 port in a bit.
When we discussed local hosting of Shiny apps and runApp
, we did not review all the possible arguments to this R function. Besides the app location (app object, list, file, or directory) there are two other important arguments:
host
: this defines the IP address (defaults to 'localhost': 127.0.0.1),port
: TCP port that the application should listen on; a random port when no value provided.When you run the shiny app locally, you see a message Listening on http://127.0.0.1:7800
or similar, which is the protocol (HTTP), the host address, and the port number. The Shiny app is running in a web server that listens to client requests, and provides a response.
So far you saw how to use the basic docker commands to pull and run images. Now you'll build a Docker image by recreating the simple Shiny app that we worked with before.
Create a file named Dockerfile
(touch Dockerfile
) then open the file and copy the following text into the file and save:
# Shiny app image based on rocker's r-base; runs as a non-root user and
# serves the app from /home/app on port 3838.
FROM rocker/r-base:latest
# Contact for questions or issues about this image.
LABEL maintainer="USER"
# System libraries required by Shiny and related R packages; the apt cache
# is removed afterwards to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    sudo \
    libcurl4-gnutls-dev \
    libcairo2-dev \
    libxt-dev \
    libssl-dev \
    libssh2-1-dev \
    && rm -rf /var/lib/apt/lists/*
# Install the shiny package (plus dependencies) via littler's install.r.
RUN install.r shiny
# Fix the port and host for runApp() via site-wide R options; host 0.0.0.0
# makes the server reachable from outside the container.
RUN echo "local(options(shiny.port = 3838, shiny.host = '0.0.0.0'))" > /usr/lib/R/etc/Rprofile.site
# Create an unprivileged 'app' group and user so the app does not run as root.
RUN addgroup --system app \
    && adduser --system --ingroup app app
# Subsequent instructions run relative to the app user's home directory.
WORKDIR /home/app
# Copy the Shiny app sources (global.R, server.R, ui.R) into the image.
COPY app .
# Give the 'app' user ownership of the application files.
RUN chown app:app -R /home/app
# Drop root privileges for the running container.
USER app
# Document the port the Shiny server listens on (matches Rprofile.site).
EXPOSE 3838
# Default command: start the Shiny app.
CMD ["R", "-e", "shiny::runApp('/home/app')"]
You can use the docker build
command to build the image from the Dockerfile
:
docker build -t registry.gitlab.com/analythium/shinyproxy-hello/hello .
The -t
argument is the tag, the .
at the end refers to the context of the build, which is our current directory (i.e. where the Dockerfile
resides) with a set of files based on which the image is built. So this is where the app
folder and the R scripts need to be placed.
Once the docker image is built, you can run the container as before to make sure the app is working as expected:
docker run -p 4000:3838 registry.gitlab.com/analythium/shinyproxy-hello/hello
Push the locally built Docker image to the container registry:
docker push registry.gitlab.com/analythium/shinyproxy-hello/hello
The image tag should start with the registry name unless you are pushing to Docker Hub. When the image tag is not specified, Docker will treat the new image as :latest
automatically. Read more about Docker tags and semantic versioning here.
Let's review the Dockerfile
line by line. The full Dockerfile reference can be found here.
The FROM
instruction initializes a new build stage and sets the base image.
Take the latest r-base
image from the rocker
project (see on Docker Hub):
FROM rocker/r-base:latest
The LABEL
instruction is optional, it adds metadata to an image, e.g. who to contact in case of issues or questions:
LABEL maintainer="USER"
The RUN
instruction executes the command in a new layer (a layer is modification to the image) on top of the current image. The following command updates the base image with a couple of libraries that are required by Shiny and related R packages (system dependencies):
RUN apt-get update && apt-get install -y --no-install-recommends \ sudo \ libcurl4-gnutls-dev \ libcairo2-dev \ libxt-dev \ libssl-dev \ libssh2-1-dev \ && rm -rf /var/lib/apt/lists/*
The following RUN
command uses the littler command line interface shipped with the r-base
image to install the Shiny package and its dependencies:
RUN install.r shiny
The next command sets the options in the Rprofile.site
file which are going to be loaded by the R session. These options specify the Shiny host and port that runApp
will use.
Do not run containers as root in production. Running the container with root privileges allows unrestricted use which is to be avoided in production. Although you can find lots of examples on the Internet where the container is run as root, this is generally considered bad practice.
Switching to the root USER
opens up certain security risks if an attacker gets access to the container. In order to mitigate this, switch back to a non privileged user after running the commands you need as root. – Hadolint rule DL3002
The following command creates a Linux group and user, both called app
.
This user will have access to the app instead of the default root user:
RUN addgroup --system app \ && adduser --system --ingroup app app
You can read more about security considerations here and Dockerfile related code smells here. Read about best practices and linting rules in general that can be helpful in reducing vulnerabilities of your containerized application.
The WORKDIR
instruction sets the working directory for subsequent instructions. Change this to the home folder of the app
user which is /home/app
:
WORKDIR /home/app
The COPY
instruction copies new files or directories from the source (our app
folder containing the R script files for our Shiny app) and adds them to the file system of the container at the destination path (.
refers to the current work directory defined at the previous step):
COPY app .
The next command sets permissions for the app
user:
RUN chown app:app -R /home/app
The USER
instruction sets the user name (or UID) and optionally the user group (or GID) to use when running the image:
USER app
The EXPOSE
instruction tells Docker which ports the container listens on at runtime. Set this to the Shiny port defined in the Rprofile.site
file:
EXPOSE 3838
Finally, the CMD
instruction closes off our Dockerfile
. The CMD
instruction provides the defaults for an executing container. There can only be one CMD
instruction in a Dockerfile
(only the last CMD
will take effect). Our CMD
specifies the executable ("R"
) and parameters for the executable in an array. The -e
option means you are running an expression that is shiny::runApp('/home/app')
. The expression will run the Shiny app that we copied into the /home/app
folder:
CMD ["R", "-e", "shiny::runApp('/home/app')"]
Build the image using docker build
by specifying the tag (-t
) and the context (.
indicates the current directory):
docker build -t registry.gitlab.com/analythium/shinyproxy-hello/hello .
You can test and push the locally built Docker image to the container registry as before.
With the newfound ability to wrap any Shiny app in a Docker container, you'll be able to deploy these images to many different hosting platforms. Of course, there is a lot more to learn, e.g. about handling dependencies, persisting data across sessions and containers, and so on. We'll cover these use cases in due time. Until then, celebrate this milestone, check out further readings, and try to containerize some of your own Shiny apps.
PCA is used in exploratory data analysis and for making decisions in predictive models.
PCA is commonly used for dimensionality reduction: each data point is projected onto only the first few principal components (in most cases the first and second) to obtain lower-dimensional data while keeping as much of the data’s variation as possible.
The first principal component can equivalently be defined as a direction that maximizes the variance of the projected data.
The principal components are often analyzed by eigendecomposition of the data covariance matrix or singular value decomposition (SVD) of the data matrix.
Reducing the number of variables in a data set naturally costs some accuracy, but the trick in dimensionality reduction is to trade a little accuracy for simplicity while still supporting correct decisions.
Smaller data sets are always easier to explore, visualize, and analyze, and they are faster for machine learning algorithms.
In this tutorial we will make use of the iris dataset in R for analysis & interpretation.
data("iris") str(iris)
data.frame’: 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
This dataset contains 150 observations with 5 variables.
summary(iris) Sepal.Length Sepal.Width Petal.Length Petal.Width Species Min. :4.3 Min. :2.0 Min. :1.0 Min. :0.1 setosa :50 1st Qu.:5.1 1st Qu.:2.8 1st Qu.:1.6 1st Qu.:0.3 versicolor:50 Median :5.8 Median :3.0 Median :4.3 Median :1.3 virginica :50 Mean :5.8 Mean :3.1 Mean :3.8 Mean :1.2 3rd Qu.:6.4 3rd Qu.:3.3 3rd Qu.:5.1 3rd Qu.:1.8 Max. :7.9 Max. :4.4 Max. :6.9 Max. :2.5
Let’s divide the data into a training dataset and a test dataset.
Exploratory Data Analysis in R
set.seed(111) ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.8, 0.2)) training <- iris[ind==1,] testing <- iris[ind==2,]
library(psych)
First we will check the correlation between the independent variables. Let’s remove the factor variable from the dataset for the correlation analysis.
pairs.panels(training[,-5], gap = 0, bg = c("red", "yellow", "blue")[training$Species], pch=21)
Lower triangles provide scatter plots and upper triangles provide correlation values.
Petal length and petal width are highly correlated. Similarly, sepal length and petal length, and sepal length and petal width, are also highly correlated.
This leads to multicollinearity issues, so a model predicted from this dataset may be erroneous.
One way of handling these kinds of issues is PCA.
Principal Component Analysis is based on only independent variables. So we removed the fifth variable from the dataset.
pc <- prcomp(training[,-5], center = TRUE, scale. = TRUE) attributes(pc)
$names
[1] "sdev" "rotation" "center" [4] "scale" "x" $class [1] "prcomp" pc$center Sepal.Length Sepal.Width Petal.Length 5.8 3.1 3.6 Petal.Width 1.1
The scale argument is used for normalization.
pc$scale Sepal.Length Sepal.Width Petal.Length 0.82 0.46 1.79 Petal.Width 0.76
Print the results stored in pc.
print(pc)
When printing pc you will see the standard deviations and loadings.
Standard deviations (1, .., p=4): [1] 1.72 0.94 0.38 0.14 Rotation (n x k) = (4 x 4): PC1 PC2 PC3 PC4 Sepal.Length 0.51 -0.398 0.72 0.23 Sepal.Width -0.29 -0.913 -0.26 -0.12 Petal.Length 0.58 -0.029 -0.18 -0.80 Petal.Width 0.56 -0.081 -0.62 0.55
For example, PC1 increases when Sepal Length, Petal Length, and Petal Width increase, because they are positively correlated with it; whereas as PC1 increases, Sepal Width decreases, because these values are negatively correlated.
Naïve Bayes Classification in R
summary(pc) Importance of components: PC1 PC2 PC3 Standard deviation 1.717 0.940 0.3843 Proportion of Variance 0.737 0.221 0.0369 Cumulative Proportion 0.737 0.958 0.9953 PC4 Standard deviation 0.1371 Proportion of Variance 0.0047 Cumulative Proportion 1.0000
The first principal component explains around 73% of the variability and captures the majority of it.
In this case, the first two components capture the majority of the variability.
Let’s create the scatterplot based on the principal components and see whether the multicollinearity issue has been addressed.
pairs.panels(pc$x, gap=0, bg = c("red", "yellow", "blue")[training$Species], pch=21)
Now the correlation coefficients are zero, so we have gotten rid of the multicollinearity issues.
To make a biplot we need the devtools package.
library(devtools) #install_github("vqv/ggbiplot") library(ggbiplot) g <- ggbiplot(pc, obs.scale = 1, var.scale = 1, groups = training$Species, ellipse = TRUE, circle = TRUE, ellipse.prob = 0.68) g <- g + scale_color_discrete(name = '') g <- g + theme(legend.direction = 'horizontal', legend.position = 'top') print(g)
PC1 explains about 73.7% and PC2 explained about 22.1% of variability.
Arrows that are closer to each other indicate high correlation.
For example, Sepal Width is only weakly correlated with the other variables.
Another way interpreting the plot is PC1 is positively correlated with the variables Petal Length, Petal Width, and Sepal Length, and PC1 is negatively correlated with Sepal Width.
PC2 is highly negatively correlated with Sepal Width.
The biplot is an important tool in PCA for understanding what is going on in the dataset.
Difference between association and correlation
trg <- predict(pc, training)
Add the species column information into trg.
trg <- data.frame(trg, training[5]) tst <- predict(pc, testing) tst <- data.frame(tst, testing[5])
Multinomial Logistic regression with First Two PCs
Because our dependent variable has 3 levels, we will make use of multinomial logistic regression.
library(nnet) trg$Species <- relevel(trg$Species, ref = "setosa") mymodel <- multinom(Species~PC1+PC2, data = trg) summary(mymodel)
The model output is given below. We used only the first two principal components because the majority of the information is available in the first components.
How to measure quality control of the product?
multinom(formula = Species ~ PC1 + PC2, data = trg) Coefficients: (Intercept) PC1 PC2 versicolor 7.23 14 3.2 virginica -0.58 20 3.6 Std. Errors: (Intercept) PC1 PC2 versicolor 188 106 128 virginica 188 106 128 Residual Deviance: 36 AIC: 48
p <- predict(mymodel, trg) tab <- table(p, trg$Species) tab
Let’s see the correct classifications and misclassifications in the training dataset.
p setosa versicolor virginica setosa 45 0 0 versicolor 0 35 3 virginica 0 5 32 1 - sum(diag(tab))/sum(tab)
The misclassification error on the training data is 0.067.
p1 <- predict(mymodel, tst) tab1 <- table(p1, tst$Species) tab1
The misclassification error is then calculated on the test dataset.
p1 setosa versicolor virginica setosa 5 0 0 versicolor 0 9 3 virginica 0 1 12 1 - sum(diag(tab1))/sum(tab1) 0.13
Misclassification for the testing data set is 13.33%.
The post Principal component analysis (PCA) in R appeared first on finnstats.