<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/atom10full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:blogger="http://schemas.google.com/blogger/2008" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" gd:etag="W/&quot;CU8DRH0_eyp7ImA9WhBaEk4.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670</id><updated>2013-05-22T10:44:35.343-04:00</updated><category term="Stata" /><category term="densityplot()" /><category term="replicate()" /><category term="clustering" /><category term="rexp()" /><category term="offset option" /><category term="matrix operations" /><category term="bargraph.CI() function" /><category term="axis control" /><category term="lag function" /><category term="Firth option" /><category term="RCurl package" /><category term="rpart package" /><category term="mdy function" /><category term="homoscedasticity" /><category term="subset" /><category term="Monte Carlo experiments" /><category term="time-varying covariates" /><category term="spreadsheets" /><category term="put" /><category term="John Emerson" /><category term="data frames" /><category term="barplot()" /><category term="mosaic plot" /><category term="comparisons" /><category term="Levene's test" /><category term="running average" /><category term="proc mcmc" /><category term="string functions" /><category term="distance" /><category term="latent class model" /><category term="paste()" /><category term="cat()" /><category term="sample()" /><category term="read Excel sheets" /><category term="proc fcmp" /><category term="Benjamini and Hochberg" /><category term="probability" /><category term="resampling based inference" /><category term="sort" 
/><category term="textConnection()" /><category term="non-monotonic missingness" /><category term="cumulative distribution function" /><category term="baseball" /><category term="logistf package" /><category term="drop statement" /><category term="qt()" /><category term="list of variables" /><category term="hexbin()" /><category term="Rosettacode" /><category term="plotting functions" /><category term="brute force" /><category term="graphics" /><category term="by statement" /><category term="empirical CDF" /><category term="formatting" /><category term="contrasts" /><category term="cumulative hazard" /><category term="transpose" /><category term="multinomial observations" /><category term="prop.test()" /><category term="array statement" /><category term="odds ratio" /><category term="proc glm" /><category term="mvrnorm()" /><category term="negative binomial distribution" /><category term="proc greplay" /><category term="ggplot2 package" /><category term="dim()" /><category term="coef() function" /><category term="predictive mean matching" /><category term="vectors" /><category term="propensity scores" /><category term="Metropolis-Hastings algorithm" /><category term="sas tricks" /><category term="epidemiology" /><category term="point option" /><category term="change variable types" /><category term="Task Views" /><category term="name conflict" /><category term="R function" /><category term="output statement" /><category term="SAS formats" /><category term="lurking variables" /><category term="lattice library" /><category term="Stuart Lipsitz" /><category term="Bland-Altman plot" /><category term="Richard Heiberger" /><category term="Mplus" /><category term="elrm package" /><category term="events/trials syntax" /><category term="one-to-many" /><category term="plot colors" /><category term="mod function" /><category term="count models" /><category term="normality assumption" /><category term="Stein estimator" /><category term="James-Stein estimator" /><category 
term="survival model" /><category term="Plus 4 estimator" /><category term="large datasets" /><category term="failure time analysis" /><category term="make categories" /><category term="MCMC" /><category term="do loop" /><category term="adding text to graphics" /><category term="reproducible analysis" /><category term="apply()" /><category term="productivity" /><category term="New Year's resolutions" /><category term="libraries in R" /><category term="indicator variables" /><category term="sports statistics" /><category term="proc lca" /><category term="call symput" /><category term="boxplot" /><category term="Louis Aslett" /><category term="R2winbugs" /><category term="data step" /><category term="proc phreg" /><category term="digits of Pi" /><category term="multiple regression" /><category term="annnotate data sets" /><category term="eval statement" /><category term="summary statistics" /><category term="Firth logistic regression" /><category term="sas7bdat package" /><category term="central moments" /><category term="read.sas7bdat()" /><category term="estimate statement" /><category term="remainder" /><category term="University of Auckland" /><category term="college majors" /><category term="round function" /><category term="range of variables" /><category term="class statement" /><category term="cubature library" /><category term="capture.output()" /><category term="probability distributions" /><category term="crowd-sourcing" /><category term="truncated distribution" /><category term="coda package" /><category term="glm() function" /><category term="Thomas Lumley" /><category term="blog aggregators" /><category term="barplots(back to back)" /><category term="retain" /><category term="Hotelling's T" /><category term="social science" /><category term="Read data in SAS" /><category term="radio static" /><category term="quoting" /><category term="false discovery rate" /><category term="adaptIntegrate()" /><category term="diploma problem" /><category term="rep()" 
/><category term="NP complete" /><category term="with()" /><category term="dotplot" /><category term="Pi" /><category term="proc sgrender" /><category term="Read data in R" /><category term="complex survey design" /><category term="Matt Regan" /><category term="as.factor()" /><category term="latent class analysis" /><category term="runif()" /><category term="standardized regression coefficients" /><category term="proc mianalyze" /><category term="Rick Wicklin Robert Allison" /><category term="custom graphics layout" /><category term="robustness" /><category term="favstats()" /><category term="expected cell counts" /><category term="annotate data set" /><category term="maps package" /><category term="gdata package" /><category term="hilo interpolation" /><category term="write Excel sheets" /><category term="vref optioncall symput" /><category term="ragged input" /><category term="Type I error rate" /><category term="options()" /><category term="attach()" /><category term="regression to the mean" /><category term="proc tabulate" /><category term="proc multtest" /><category term="dotplot/boxplot" /><category term="set ds; by x;" /><category term="ceiling()" /><category term="CRAN" /><category term="t()" /><category term="logic tests" /><category term="arrays" /><category term="integrated development environment" /><category term="Poisson distribution" /><category term="exponential" /><category term="Simpson's paradox" /><category term="ppois()" /><category term="pairs plots" /><category term="JAGS" /><category term="merge statement" /><category term="Minard" /><category term="integration" /><category term="looping" /><category term="proc freq" /><category term="col option" /><category term="probability distributiholons" /><category term="population age" /><category term="generalized pairs plots" /><category term="read sheets" /><category term="Hosmer and Lemeshow" /><category term="skewness()" /><category term="regular expressions" /><category term="John Snow" 
/><category term="save data in Stata format" /><category term="design matrix" /><category term="saving output from SAS" /><category term="circles" /><category term="plot.ts()" /><category term="ts()" /><category term="Maxine Pfannkuch" /><category term="random variables" /><category term="ceil" /><category term="p.adjust() function" /><category term="axis statement" /><category term="mean()" /><category term="subsetting" /><category term="SAS data sets" /><category term="numeric()" /><category term="association measures" /><category term="comparing models" /><category term="outer() function" /><category term="Royal Statistical Society" /><category term="proc mi" /><category term="standard deviation" /><category term="association plot" /><category term="binomial probability" /><category term="offset axes" /><category term="linear regression" /><category term="assumptions" /><category term="exact statement" /><category term="relevel function" /><category term="matplot()" /><category term="reflabel option" /><category term="recursive partitioning" /><category term="survey sampling" /><category term="where function" /><category term="merge" /><category term="proc univariate" /><category term="histogram" /><category term="date and time values" /><category term="end =" /><category term="readLines()" /><category term="rnorm()" /><category term="confint()" /><category term="factor()" /><category term="stratiification" /><category term="matrices" /><category term="gmodels package" /><category term="regression adjustment" /><category term="href option" /><category term="statistical education" /><category term="multivariate statistics" /><category term="Cramer's V" /><category term="RColorBrewer package" /><category term="API" /><category term="permutation test" /><category term="pyramid plots" /><category term="goodness of fit" /><category term="barchart() function" /><category term="regression trees" /><category term="p.adjust()" /><category term="categorical covariates" 
/><category term="chisq.test()" /><category term="mice()" /><category term="R-bloggers" /><category term="proc simnormal" /><category term="one-way chi-square test" /><category term="maps" /><category term="relative risk" /><category term="negative binomial regression" /><category term="reshape package" /><category term="binom.test()" /><category term="read data by byte" /><category term="date formats" /><category term="chi-square test" /><category term="sleep apnea" /><category term="sapply()" /><category term="Chris Wild" /><category term="rare disease assumption" /><category term="partykit package" /><category term="R-sig-teaching" /><category term="cut function" /><category term="heat map" /><category term="profile likelihood" /><category term="conditioning" /><category term="layout.show()" /><category term="mice package" /><category term="bayes statement" /><category term="fonts" /><category term="Hochberg procedure" /><category term="OpenBUGS" /><category term="scatterplot" /><category term="binning" /><category term="Durbin-Watson statistic" /><category term="clodds statement" /><category term="proc gproject" /><category term="nobs option" /><category term="sas7bdat format" /><category term="repeated multiples" /><category term="Convert R to SAS" /><category term="layout()" /><category term="proc kde" /><category term="ods graphics on" /><category term="read from URL" /><category term="Monty Hall problem" /><category term="R Inferno" /><category term="Fibonacci series" /><category term="reshape" /><category term="pairwaise comparisons" /><category term="variable number of records" /><category term="substitute function" /><category term="read complex data files" /><category term="dynamite plot" /><category term="pool()" /><category term="ods system" /><category term="R environments" /><category term="Wald CI" /><category term="pointlabel option" /><category term="hat-check problem" /><category term="lapply() function" /><category term="empirical problem 
solving" /><category term="Philips" /><category term="principal components" /><category term="proc gmap" /><category term="citing R" /><category term="convert categorical class to numeric" /><category term="plot" /><category term="regexp()" /><category term="par()" /><category term="Contour" /><category term="random statement" /><category term="within()" /><category term="order" /><category term="Design package" /><category term="xchisq.test()" /><category term="central limit theorem" /><category term="na.string" /><category term="proc gchart" /><category term="file.info()" /><category term="matlines()" /><category term="proportional hazards assumption" /><category term="polygon()" /><category term="MCMCpack package" /><category term="grid.polyline() function" /><category term="survival analysis" /><category term="back-to-back barplots" /><category term="citing SAS" /><category term="while()" /><category term="a*b=c syntax" /><category term="exchangeability" /><category term="Galton" /><category term="deparse(substitute()" /><category term="determinant" /><category term="Tim Hesterberg" /><category term="sunflowerplot()" /><category term="flexmix package" /><category term="pnbinom()" /><category term="rand function" /><category term="HistData package" /><category term="programming style" /><category term="plot symbols" /><category term="proc genmod" /><category term="shuffle()" /><category term="Clopper-Pearson CI" /><category term="Github" /><category term="cex" /><category term="moments package" /><category term="rounding" /><category term="interactive development environments" /><category term="Excel" /><category term="simulate data" /><category term="MplusAutomation package" /><category term="Ken Beath" /><category term="observed cell counts" /><category term="unaggregated datasets" /><category term="SAS-x" /><category term="levene.test() function" /><category term="table()" /><category term="Nelson-Aalen estimator" /><category term="ifelse()" /><category 
term="Bayesian methods" /><category term="perl" /><category term="panelby statement" /><category term="set statement options" /><category term="RStudio" /><category term="aggregated datasets" /><category term="exp()" /><category term="%sysevalf" /><category term="glm.nb() function" /><category term="exact logistic regression" /><category term="fitted()" /><category term="set.seed()" /><category term="grid.text() function" /><category term="proc gplot" /><category term="seeds" /><category term="Statistical Sleuth" /><category term="proc sgplot" /><category term="read from local disk" /><category term="proc transpose" /><category term="MANOVA" /><category term="as.POSIXlt()" /><category term="statistics education" /><category term="confounding" /><category term="summary()" /><category term="WinBUGS" /><category term="plotFun()" /><category term="referencing sequential variables" /><category term="Matt Shotwell" /><category term="hexbin package" /><category term="job creation" /><category term="Michael Weylandt" /><category term="mode=include" /><category term="read.table()" /><category term="Bonferroni procedure" /><category term="apply family of functions" /><category term="pch" /><category term="coverage probabilities" /><category term="sd()" /><category term="as.data.frame()" /><category term="title statement" /><category term="rowMeans() function" /><category term="titles" /><category term="World Statistics Day" /><category term="calculus" /><category term="overlay option" /><category term="google spreadsheet" /><category term="unobserved class" /><category term="Bureau of Labor Statistics" /><category term="smoothScatter()" /><category term="le Cessie and Houwelingen" /><category term="multiple comparisons" /><category term="class probabilities" /><category term="symbol statement" /><category term="gps" /><category term="Gamma function" /><category term="ods output statement" /><category term="HELP data set" /><category term="Shangri La" /><category term="pie() 
function" /><category term="text()" /><category term="customizing plots" /><category term="readline()" /><category term="quadratic equation" /><category term="test statement" /><category term="missing data" /><category term="GGally package" /><category term="mod" /><category term="file print" /><category term="R packages" /><category term="names(). events/trials syntax" /><category term="contrast statement" /><category term="deparse function" /><category term="shuffle() function" /><category term="lawstat package" /><category term="shading regions" /><category term="weight statement" /><category term="teaching statistics" /><category term="mapproj package" /><category term="hazard function" /><category term="Wilson estimator" /><category term="type=&quot;n&quot;" /><category term="read.xlsx()" /><category term="multivariate normal" /><category term="Amazon Sales rank" /><category term="Markov Chain Monte Carlo" /><category term="gsub()" /><category term="projects" /><category term="functions" /><category term="open source" /><category term="variance" /><category term="informal inference" /><category term="simulation studies" /><category term="revision control systems" /><category term="randomLCA package" /><category term="grep() function" /><category term="mixtools package" /><category term="psychology" /><category term="pathological distribution" /><category term="minimum" /><category term="Kaplan-Meier estimates" /><category term="proc fmm" /><category term="parameterization" /><category term="function()" /><category term="survival package" /><category term="Michael Friendly" /><category term="Project MOSAIC" /><category term="snowstorms" /><category term="adjacent observations" /><category term="colnames()" /><category term="CPAN" /><category term="proc logistic" /><category term="Cox proportional hazards model" /><category term="expand.table()" /><category term="SAT scores" /><category term="proc standard" /><category term="matrix()" /><category 
term="categorical data" /><category term="pdf function" /><category term="finite mixture models" /><category term="MASS library" /><category term="logic" /><category term="xckd" /><category term="match()" /><category term="null hypothesis" /><category term="Tick marks" /><category term="3D plots" /><category term="shrinkage estimator" /><category term="quantile function" /><category term="time series" /><category term="plot()" /><category term="frailty models" /><category term="lines()" /><category term="ylim option" /><category term="new variables" /><category term="Wolfram Alpha" /><category term="social networks" /><category term="Benjamini-Hochberg" /><category term="substr" /><category term="correlated data models" /><category term="kurtosis()" /><category term="input statement" /><category term="CPAP" /><category term="causal inference" /><category term="duplicated data" /><category term="expected value" /><category term="manifest variable" /><category term="Edward Tufte" /><category term="Hadley Wickham" /><category term="mapply() function" /><category term="t-test" /><category term="grammar of graphics" /><category term="read.csv()" /><category term="legend" /><category term="string manipulation" /><category term="two sample comparisons" /><category term="proc sgpanel" /><category term="random number generation" /><category term="skewness" /><category term="FLXPmultinom function" /><category term="match() function" /><category term="debugging" /><category term="measures of association" /><category term="legend statement" /><category term="power calculations" /><category term="reference value" /><category term="proc import" /><category term="mtext()" /><category term="convert SAS to R" /><category term="as.factor" /><category term="factor analysis" /><category term="markerattrs" /><category term="bubble plot" /><category term="dummy variables" /><category term="hexagon" /><category term="as.numeric()" /><category term="Xin Wei" /><category term="read data in 
Stata format" /><category term="tables" /><category term="spreadsheet" /><category term="missing data modeling" /><category term="age distribution" /><category term="Alan Zaslavsky" /><category term="SAS" /><category term="citation()" /><category term="readBin()" /><category term="sciplot package" /><category term="Amazon web services" /><category term="random numbers" /><category term="rejection sampling" /><category term="connect points" /><category term="foreign library" /><category term="pseudo-random numbers" /><category term="confidence intervals" /><category term="formatted output" /><category term="amherst" /><category term="Poisson regression" /><category term="rjags" /><category term="SAS macro" /><category term="detach()" /><category term="excerpt" /><category term="cyclemeter" /><category term="logistic regression" /><category term="methods()" /><category term="annotate macro" /><category term="descriptive statistics" /><category term="file()" /><category term="symbolic computation" /><category term="t() function" /><category term="poLCA package" /><category term="log scale" /><category term="kurtosis" /><category term="assessing differences" /><category term="rbind()" /><category term="Weibull" /><category term="conditional execution" /><category term="plotrix package" /><category term="teacher salaries" /><category term="pattern statement" /><category term="choropleth" /><category term="style guide" /><category term="RColorBrewer" /><category term="writeXLS package" /><category term="tidying code" /><category term="abline()" /><category term="communicating between SAS and R" /><category term="sequences" /><category term="mosaic package" /><category term="which.min() function" /><category term="legend()" /><category term="proc_r" /><category term="SD card" /><category term="FLXMRglmfix function" /><category term="censored data" /><category term="matrix" /><category term="ladd()" /><category term="latent class models" /><category term="for()" 
/><category term="colors" /><category term="axis" /><category term="proc template" /><category term="side by side histograms" /><category term="R" /><category term="multiple imputation" /><title>SAS and R</title><subtitle type="html">Examples of tasks replicated in SAS and R</subtitle><link rel="http://schemas.google.com/g/2005#feed" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/posts/default" /><link rel="alternate" type="text/html" href="http://sas-and-r.blogspot.com/" /><link rel="next" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default?start-index=26&amp;max-results=25&amp;redirect=false&amp;v=2" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><generator version="7.00" uri="http://www.blogger.com">Blogger</generator><openSearch:totalResults>157</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/atom+xml" href="http://feeds.feedburner.com/SASandR" /><feedburner:info uri="sasandr" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><link rel="license" type="text/html" href="http://creativecommons.org/licenses/by-nc-sa/3.0/" /><meta xmlns="http://pipes.yahoo.com" name="pipes" content="noprocess" /><logo>http://kenkleinman.net/files/favicon.jpg</logo><feedburner:emailServiceId>SASandR</feedburner:emailServiceId><feedburner:feedburnerHostname>http://feedburner.google.com</feedburner:feedburnerHostname><entry 
gd:etag="W/&quot;DkQNRnk_fip7ImA9WhNWEUs.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-722574931184401413</id><published>2012-12-10T10:08:00.000-05:00</published><updated>2012-12-10T13:33:17.746-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-12-10T13:33:17.746-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="axis control" /><category scheme="http://www.blogger.com/atom/ns#" term="apply family of functions" /><category scheme="http://www.blogger.com/atom/ns#" term="graphics" /><category scheme="http://www.blogger.com/atom/ns#" term="binomial probability" /><category scheme="http://www.blogger.com/atom/ns#" term="log scale" /><title>Example 10.8: The upper 95% CI is 3.69</title><content type="html">Apologies for the long and unannounced break-- the longest since we started blogging, three and a half years ago.  I was writing a 2-day course for SAS users to learn R.  Contact me if you're interested.  And Nick and I are beginning work on the second edition of our book-- look for it in the fall.  Please let us know if you have ideas about what we omitted last time or would otherwise like to see added.  

In the meantime, we'll keep blogging, though likely at a reduced rate.

&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-MjNlhqMYEe4/UMFcJQUQsOI/AAAAAAAADuI/BWzPfa8yUDM/s1600/plotci0.jpeg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="267" src="http://1.bp.blogspot.com/-MjNlhqMYEe4/UMFcJQUQsOI/AAAAAAAADuI/BWzPfa8yUDM/s400/plotci0.jpeg" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Today: what can you say about the probability of an event if the observed number of events is 0?  It turns out that the upper 95% CI for the probability is 3.69/N.  There's a sweet little paper with some rationale for this, but it's in my other office.  And I couldn't recall the precise value-- so I used SAS and R to demonstrate it to myself.
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;R&lt;/b&gt;
&lt;br /&gt;
&lt;br /&gt;
The R code is remarkably concise.  After generating some Ns, we write a little function to perform the test and extract the (exact) upper 95% confidence limit.  This is facilitated by the "..." notation, which passes along unused arguments to functions.  Then we use &lt;tt&gt;apply()&lt;/tt&gt; to call the new function for each N, passing the numerator 0 each time.  Note that &lt;tt&gt;apply()&lt;/tt&gt; needs a matrix argument, so the simple vector of Ns is converted to a matrix before use.  [The &lt;tt&gt;sapply()&lt;/tt&gt; function will accept a vector input, but took about 8 times as long to run.]  Finally, we plot the upper limit * N against N, showing the asymptote.  A log-scaled x-axis is useful here, and is achieved with the &lt;tt&gt;log='x'&lt;/tt&gt; option.  (Section 5.3.12.)  The result is shown above.

&lt;br /&gt;
&lt;pre&gt;bin.m = seq(10, 10000, by=5)
mybt = function(...) { binom.test(...)$conf.int[2] }
uci = apply(as.matrix(bin.m), 1, mybt, x=0)
plot(y=bin.m * uci, x=bin.m, ylim=c(0,4), type="l", 
     lwd=5, col="red", cex=5, log='x',  
     ylab="Exact upper CI", xlab="Sample size", 
     main="Upper CI when there are 0 cases observed")
abline(h=3.69)
&lt;/pre&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;SAS&lt;/b&gt;
&lt;br /&gt;
&lt;br /&gt;
In SAS, the data, really just the N and a numerator of 0, are generated in a &lt;tt&gt;data&lt;/tt&gt; step.  The CIs are found using the &lt;tt&gt;binomial&lt;/tt&gt; option in the &lt;tt&gt;proc freq tables&lt;/tt&gt; statement and saved using the &lt;tt&gt;output&lt;/tt&gt; statement.  Note that the &lt;tt&gt;weight&lt;/tt&gt; statement is used here to avoid having a row for each Bernoulli trial.

&lt;br /&gt;
&lt;pre&gt;data binm;
do n = 10 to 10000 by 5;
  x=0;
  output;
  end;
run;

ods select none;
proc freq data=binm;
by n;
weight n;
tables x / binomial;
output out=bp binomial;
run;
ods select all;
&lt;/pre&gt;
To calculate the upper limit*N, another &lt;tt&gt;data&lt;/tt&gt; step is needed-- note that in this setting SAS will only produce the lower limit for the probability that all observations share the same value, thus the subtraction from 1 shown below.  The log scale x-axis is obtained with the &lt;tt&gt;logbase&lt;/tt&gt; option to the &lt;tt&gt;axis&lt;/tt&gt; statement.  (Section 5.3.12.)  The result is shown below.

&lt;br /&gt;
&lt;pre&gt;data uci;
set bp;
limit = (1-xl_bin) * n;
run;

axis1 order = (0 to 4 by 1);
axis2 logbase=10 logstyle=expand;
symbol1 i = j v = none c = red w=5 l=1;
proc gplot data=uci;
plot limit * n / vref=3.69 vaxis=axis1 haxis=axis2;
label n="Sample size" limit="Exact upper CI";
run;
quit;
&lt;/pre&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-HUuZHEeDqfI/UMFc7y72w2I/AAAAAAAADuU/T_n725piWxo/s1600/sasprobci0.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="258" src="http://1.bp.blogspot.com/-HUuZHEeDqfI/UMFc7y72w2I/AAAAAAAADuU/T_n725piWxo/s400/sasprobci0.jpg" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
It's clear that the upper 95% limit on the number of successes asymptotes to about 3.69.  Thus the upper limit on the binomial probability p is 3.69/N.



&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;
We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=mQDg2LuwyXE:DVTsNxj4IpQ:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=mQDg2LuwyXE:DVTsNxj4IpQ:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=mQDg2LuwyXE:DVTsNxj4IpQ:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=mQDg2LuwyXE:DVTsNxj4IpQ:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=mQDg2LuwyXE:DVTsNxj4IpQ:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=mQDg2LuwyXE:DVTsNxj4IpQ:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=mQDg2LuwyXE:DVTsNxj4IpQ:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=mQDg2LuwyXE:DVTsNxj4IpQ:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/mQDg2LuwyXE" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/722574931184401413/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=722574931184401413" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/722574931184401413?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/722574931184401413?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/mQDg2LuwyXE/example-108-upper-95-ci-is-369.html" title="Example 10.8: The upper 95% CI is 3.69" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-MjNlhqMYEe4/UMFcJQUQsOI/AAAAAAAADuI/BWzPfa8yUDM/s72-c/plotci0.jpeg" height="72" width="72" /><thr:total>2</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/12/example-108-upper-95-ci-is-369.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUIMQXY9eip7ImA9WhNSFUw.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-2640550375796912481</id><published>2012-10-29T09:33:00.000-04:00</published><updated>2012-10-29T09:33:00.862-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-10-29T09:33:00.862-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="replicate()" /><category scheme="http://www.blogger.com/atom/ns#" term="ods 
output statement" /><category scheme="http://www.blogger.com/atom/ns#" term="by statement" /><category scheme="http://www.blogger.com/atom/ns#" term="while()" /><title>Example 10.7: Fisher vs. Pearson</title><content type="html">&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-J4lf47prvaU/UIWoOhSuicI/AAAAAAAADtw/j1VRjt_lxTI/s1600/fp%2Bplot%2Bsas.bmp" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="420" src="http://4.bp.blogspot.com/-J4lf47prvaU/UIWoOhSuicI/AAAAAAAADtw/j1VRjt_lxTI/s1600/fp%2Bplot%2Bsas.bmp" width="530" /&gt;&lt;/a&gt;&lt;/div&gt;
In the early days of the discipline of statistics, R.A. Fisher argued with great vehemence against Egon Pearson (and Jerzy Neyman) over the foundational notions supporting statistical inference.  The personal invective recorded is somewhat amusing and also reminds us how very puerile even very smart people can be.
&lt;br /&gt;
&lt;br /&gt;
Today, we'll compare Fisher's exact test for 2*2 tables with the Pearson chi-square, developed by Karl Pearson, Egon's father and another early pioneer of statistics.  This blog entry was inspired by a questioner on LinkedIn who asked when should the one be preferred over the other.  One commenter gave the typical rule of thumb-- "If the expected count in any cell is less than 5, use the exact test, otherwise use the chi-square."  My copy of "&lt;a href="https://www.amazon.com/dp/0470144483/ref=as_li_ss_til?tag=sasandrblog-20&amp;amp;camp=0&amp;amp;creative=0&amp;amp;linkCode=as4&amp;amp;creativeASIN=0470144483&amp;amp;adid=1GASPWFP55CXDF1T0XEX&amp;amp;"&gt;Statistical rules of thumb&lt;/a&gt;" is AWOL at the moment, so I don't know if this one is covered there.  A quick googling did not reveal an answer either.
&lt;br /&gt;
&lt;br /&gt;
The rule of thumb dates back to the days before the exact test became computationally feasible outside of small problems.  In contrast, today it can be performed quickly for all tables, either through a complete enumeration of the possible tables or through Monte Carlo hypothesis testing, which is simple to apply in either SAS or R.  My default in recent years has been to take advantage of this capability and use the exact test all the time, ignoring the traditional approximate chi-square test.  My idea was that if there were any small cells, I'd be covered, while allowing a simpler methods section.
&lt;br /&gt;
&lt;br /&gt;
Let's develop some code to see what happens.  Is the rule of thumb accurate?  What's the power cost of using the exact test instead of the Chi-square?
&lt;br /&gt;
&lt;br /&gt;
Our approach will be to set cell probabilities and the sample size, and simulate data under this model, then perform each test and evaluate the proportion of rejections under each test.  One complication here is that simulated data might result in a null margin, i.e., there might be no observed values in a row or in a column.  We'll calculate rejections of the null only among the tables where this does not happen.  This means that the average observed cell counts among included tables may be different from the expected cell counts.  This makes sense from a practical perspective-- we probably would not do the test if we observed 0 subjects in one of our planned categories.
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;SAS&lt;/b&gt;
&lt;br /&gt;
In SAS, we'll do the dumb straightforward thing and simulate 100 pairs of dichotomous variables.  Here we just code the null case of no association, with margins of 70% in one column and 80% in one row.  The smallest cell has an expected proportion of 6% (0.3 * 0.2), so that a total sample size of 83 will have an expected count of about 5 in that cell.  
&lt;br /&gt;
&lt;pre&gt;data test;
pdot1 = .7;
p1dot = .8;
do tablen = 20, 50, 100, 200, 500, 1000;
  do ds = 1 to 10000;
    do i = 1 to tablen;
      xnull = uniform(0) gt pdot1;
      ynull = uniform(0) gt p1dot;
      output;
      end;
    end;
  end;
run;
&lt;/pre&gt;
Then &lt;tt&gt;proc freq&lt;/tt&gt; can be used to generate the two p-values, using the &lt;tt&gt;by&lt;/tt&gt; statement to do the calculations for all the tables at once.  The &lt;tt&gt;output&lt;/tt&gt; statement extracts the p-values into a data set.
&lt;br /&gt;
&lt;pre&gt;ods select none;
options nonotes;
proc freq data = test;
by tablen ds;
tables ynull * xnull / chisq fisher;
output out = kk1 chisq fisher;
run;
options notes;
ods select all;
&lt;/pre&gt;
To get the proportion of rejections, we first use a &lt;tt&gt;data&lt;/tt&gt; step to calculate whether each test was rejected, then go back to &lt;tt&gt;proc freq&lt;/tt&gt; to find the proportion of rejections and the CI on the probability of rejections.
&lt;br /&gt;
&lt;pre&gt;data summ;
set kk1 (keep = tablen p_pchi xp2_fish);
rej_pchi = (p_pchi lt 0.05);
rej_fish = (xp2_fish lt .05);
run;

ods output binomialprop = kk2;
proc freq data = summ;
by tablen;
tables rej_pchi rej_fish / binomial(level='1');
run;
&lt;/pre&gt;
You may have noticed that we didn't do anything to account for the tables with empty rows or columns.  When the initial &lt;tt&gt;proc freq&lt;/tt&gt; encounters such a table, it performs neither test.  Thus the second &lt;tt&gt;proc freq&lt;/tt&gt; is calculating the proportion and CI with a denominator that might be smaller than the number of tables we simulated.  Happily, they'll still be correct, though the CI may be wider than we'd intended.

Finally, we're ready to plot the results, using the &lt;tt&gt;hilob&lt;/tt&gt; interpolation described in Example &lt;a href="http://sas-and-r.blogspot.com/2012/10/example-104-multiple-comparisons-and.html"&gt;10.4&lt;/a&gt;.  Using &lt;tt&gt;hiloc&lt;/tt&gt; instead shows the "close" as a tick mark between the high and low values. 
&lt;br /&gt;
&lt;pre&gt;data kk2a;
set kk2;
if table eq "Table rej_pchi" then tablen = tablen + 1;
run;

symbol1 i = hiloc;
symbol2 i = hiloc;
proc gplot data = kk2a;
where name1 in ("_BIN_","XL_BIN","XU_BIN");
plot nvalue1 * tablen = table / vref = 0.05 href=83;
/* ref lines where the expected count in the smallest cell is &amp;gt; 5, 
and the nominal alpha */
run; quit;
&lt;/pre&gt;
The results are shown above.  The confidence limits should include 0.05 for all numbers of subjects in order to be generally recommended.  Both tests reach this standard, with these margins, even for tables with only 20 subjects, i.e., with expected cell counts of 11, 5, 3, and 1.  The exact test appears conservative (rejects less than 5% of the time), probably due to small cell counts and the resulting ties in the list of possible tables.

&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;R&lt;/b&gt;
&lt;br /&gt;
In R, we'll simulate observations from a multinomial distribution with the desired cell probabilities, and assemble the result into a table to calculate the p-values.  This will make it easier to simulate tables under the alternative, as we need to do to assess power.  If there are empty rows or columns, the &lt;tt&gt;chisq.test()&lt;/tt&gt; function produces a p-value of "NaN", which will create problems later.  To avoid this, we'll put the table generation inside a &lt;tt&gt;while()&lt;/tt&gt; function.  This operates like the &lt;tt&gt;do while&lt;/tt&gt; construction in SAS (and other programming languages).  The condition we check for is whether there is a row or column with 0 observations; if so, try generating the data again.  We begin by initializing the table with 0's. 
&lt;br /&gt;
&lt;pre&gt;makeitm = function(n.in.table, probs)  {
  # Simulate a single 2x2 table of n.in.table observations from a multinomial
  # with cell probabilities probs (filled by row), redrawing until no row or
  # column total is zero.  Returns c(Pearson chi-square p, Fisher exact p).
  repeat {
    cells = rmultinom(n=1, size=n.in.table, probs)
    tab = matrix(cells, ncol=2, byrow=TRUE)
    # an empty margin would make chisq.test() return NaN, so draw again
    has.empty.margin = any(colSums(tab) == 0) || any(rowSums(tab) == 0)
    if (!has.empty.margin) break
  }
  pearson.p = chisq.test(tab, correct=FALSE)$p.value
  exact.p = fisher.test(tab)$p.value
  c(pearson.p, exact.p)
}
&lt;/pre&gt;
With this basic building block in place, we can write a function to call it repeatedly (using the &lt;tt&gt;replicate()&lt;/tt&gt; function), then calculate the proportion of rejections and get the CI for the probability of rejections.
&lt;br /&gt;
&lt;pre&gt;many.ns = function(tablen, nds,probs) {
  # Simulate nds tables of size tablen with cell probabilities probs via
  # makeitm(), then summarise how often each test rejects at the 0.05 level.
  # Returns a named vector: the inputs, each test's rejection proportion,
  # and the exact binomial confidence limits for that proportion.
  pvals = t(replicate(nds, makeitm(tablen, probs)))   # nds rows: Pearson p, Fisher p
  rejected = pvals &amp;lt; 0.05
  n.rej.pearson = sum(rejected[,1])
  n.rej.fisher = sum(rejected[,2])
  # exact binomial CI for a rejection count out of nds simulated tables
  exact.ci = function(k) binom.test(k, nds)$conf.int[1:2]
  pearson.ci = exact.ci(n.rej.pearson)
  fisher.ci = exact.ci(n.rej.fisher)
  c("N.ds" = nds, "N.table" = tablen, probs,
    "Pearson.p" = n.rej.pearson/nds, "PCIl" = pearson.ci[1], "PCIu" = pearson.ci[2],
    "Fisher.p" = n.rej.fisher/nds, "FCIl" = fisher.ci[1], "FCIu" = fisher.ci[2])
}
&lt;/pre&gt;
Finally, we're ready to actually do the experiment, using &lt;tt&gt;sapply()&lt;/tt&gt; to call the function that calls &lt;tt&gt;replicate()&lt;/tt&gt; to call the function that makes a table.  The result is converted to a data frame to make the plotting simpler.  The first call below replicates the SAS result shown above and has very similar estimates and CI, but is not displayed here.  The second shows an odds ratio of 3, the third (plotted below) has an OR of about 1.75, and the last an OR of about 1.33.
&lt;br /&gt;
&lt;pre&gt;# Run exactly one of the four scenarios below; the others stay commented out.
# Both lines of each inactive call are commented so the listing can be pasted as-is.

# Scenario 1: null case (odds ratio 1), matching the SAS simulation
#res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000),many.ns, nds=10000, 
#  probs = c(.56,.24,.14,.06))))
# Scenario 2: odds ratio 3
#res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000),many.ns, nds=1000, 
#  probs = c(.6,.2,.1,.1))))
# Scenario 3 (active, plotted): odds ratio about 1.76
res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000),many.ns, nds=1000, 
  probs = c(.58,.22,.12,.08))))
# Scenario 4: odds ratio about 1.33
#res3 = data.frame(t(sapply(c(20, 50, 100, 200, 500, 1000),many.ns, nds=1000, 
#  probs = c(.57,.23,.13,.07))))

# Empty plotting frame sized to hold the largest upper confidence limit
with(res3,plot(x = 1, y =1, type="n", ylim = c(0, max(c(PCIu,FCIu))), xlim=c(0,1000),
               xlab = "N in table", ylab="Rejections", main="Fisher vs. Pearson"))
# Pearson results in blue: point estimate plus CI segment
with(res3,points(y=Pearson.p, x=N.table,col="blue"))
with(res3,segments(x0 = N.table, x1=N.table, y0 = PCIl, y1= PCIu, col = "blue"))
# Fisher results in red, shifted right by 2 so the two intervals do not overlap
with(res3,points(y=Fisher.p, x=N.table + 2,col="red"))
with(res3,segments(x0 = N.table+2, x1=N.table+2, y0 = FCIl, y1= FCIu, col = "red"))
# Reference lines: N = 83 (expected count of about 5 in the smallest null cell)
# and the nominal alpha of 0.05
abline(v=83)
abline(h=0.05)
&lt;/pre&gt;
The plotting commands used above have been demonstrated in Examples &lt;a href="http://sas-and-r.blogspot.com/2012/10/example-104-multiple-comparisons-and.html"&gt;10.4&lt;/a&gt; and &lt;a href="http://sas-and-r.blogspot.com/2010/11/example-813-bike-ride-plot-part-2.html"&gt;8.13&lt;/a&gt;.

&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-oVn9ohiwaUQ/UIWaCDN0ZUI/AAAAAAAADtM/eTlLXUSq8SY/s1600/fp_plot.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="300" src="http://4.bp.blogspot.com/-oVn9ohiwaUQ/UIWaCDN0ZUI/AAAAAAAADtM/eTlLXUSq8SY/s400/fp_plot.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
Overall, the results show (in the SAS plot, at the top) that the Pearson chi-square test does perfectly well at protecting the alpha level under the null with these margins, even when the expected number of cases in one cell is as small as 1.  In contrast, compared to the exact test, the Chi-square test has a bit more power, for these cell probabilities.  The power difference is greatest when the N is smaller.  Given this example, I would say that the rule of thumb may be too conservative, pushing people away from a more powerful test unnecessarily.  In the future, I plan to be more positive about using the Pearson chi-square.
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;
We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=Ip4jvoT9w6Q:6sNNpvAPzpc:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=Ip4jvoT9w6Q:6sNNpvAPzpc:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=Ip4jvoT9w6Q:6sNNpvAPzpc:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=Ip4jvoT9w6Q:6sNNpvAPzpc:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=Ip4jvoT9w6Q:6sNNpvAPzpc:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=Ip4jvoT9w6Q:6sNNpvAPzpc:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=Ip4jvoT9w6Q:6sNNpvAPzpc:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=Ip4jvoT9w6Q:6sNNpvAPzpc:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/Ip4jvoT9w6Q" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/2640550375796912481/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=2640550375796912481" title="6 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2640550375796912481?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2640550375796912481?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/Ip4jvoT9w6Q/example-107-fisher-vs-pearson.html" title="Example 10.7: Fisher vs. Pearson" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-J4lf47prvaU/UIWoOhSuicI/AAAAAAAADtw/j1VRjt_lxTI/s72-c/fp%2Bplot%2Bsas.bmp" height="72" width="72" /><thr:total>6</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/10/example-107-fisher-vs-pearson.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0MEQHg-cSp7ImA9WhNTE0w.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-6374829838861909116</id><published>2012-10-15T10:30:00.000-04:00</published><updated>2012-10-15T10:30:01.659-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-10-15T10:30:01.659-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="glm() function" /><category scheme="http://www.blogger.com/atom/ns#" 
term="replicate()" /><category scheme="http://www.blogger.com/atom/ns#" term="merge statement" /><category scheme="http://www.blogger.com/atom/ns#" term="proc genmod" /><category scheme="http://www.blogger.com/atom/ns#" term="Wald CI" /><category scheme="http://www.blogger.com/atom/ns#" term="glm.nb() function" /><category scheme="http://www.blogger.com/atom/ns#" term="coef() function" /><category scheme="http://www.blogger.com/atom/ns#" term="rowMeans() function" /><category scheme="http://www.blogger.com/atom/ns#" term="Poisson regression" /><category scheme="http://www.blogger.com/atom/ns#" term="negative binomial regression" /><category scheme="http://www.blogger.com/atom/ns#" term="logic tests" /><title>Example 10.6: Should Poisson regression ever be used? Negative binomial vs. Poisson regression</title><content type="html">In practice, we often find that count data is not well modeled by Poisson regression, though Poisson models are often presented as the natural approach for such data.  In contrast, the negative binomial regression model is much more flexible and is therefore likely to fit better, if the data are not Poisson.  

In &lt;a href="http://sas-and-r.blogspot.com/2011/03/example-830-compare-poisson-and.html"&gt;example 8.30&lt;/a&gt; we compared the probability mass functions of the two distributions, and found that for a given mean, the negative binomial closely approximates the Poisson, as the scale parameter increases.  But how does this affect the choice of regression model?  How might another alternative, the overdispersed, or quasi-Poisson model compete with these?  Today we generate a rudimentary toolkit for assessing the effects of Poisson, negative binomial, and quasi-Poisson models, assuming data are truly generated by one or the other process.
&lt;br&gt;
&lt;br&gt;

&lt;b&gt;SAS&lt;/b&gt;
&lt;br&gt;
We'll begin by simulating Poisson and negative binomial data.  Note that we also rely on the &lt;tt&gt;poismean_nb&lt;/tt&gt; function that we created in &lt;a href="http://sas-and-r.blogspot.com/2011/03/example-830-compare-poisson-and.html"&gt;example 8.30&lt;/a&gt;-- this is needed because SAS only accepts the natural parameters of the distribution, while the mean is a (simple) function of the two parameters.

As is typical in such settings, we'll begin by generating data under the null of no association between, in this case, the normal covariate and the count outcome.  The proportion of rejections should be no greater than alpha (5%, here).  However, we'll include code to easily simulate data under the alternative as well.  This will facilitate assessing the relative power of the models, later.   
&lt;pre&gt;
/* Simulate 10,000 data sets of 250 observations each.  Every observation has
   a standard normal covariate x plus a Poisson outcome and a negative
   binomial outcome that share the same mean. */
data nbp;
do ds = 1 to 10000;
  /* start at 1: "0 to 250" would generate 251 observations per data set */
  do i = 1 to 250;
    x = normal(0);
    /* the coefficient on x is 0 (the null); change it to simulate under the alternative */
    mean = exp(-.25 + (0 * x));
    pois = rand("POISSON",mean);
    /* poismean_nb is the user-written function from example 8.30; it converts
       the mean into the parameter that rand("NEGBINOMIAL") expects */
    nb1 = rand("NEGBINOMIAL", poismean_nb(mean, 1), 1);
    output;
    end;
  end;
run;
&lt;/pre&gt;
The models will be fit in &lt;tt&gt;proc genmod&lt;/tt&gt;. (See sections 4.1.3, 4.1.5, table 4.1.)  It would be good to write a little macro to change the distribution and the output names, but it's not necessary.  To save space here, the repetitive lines are omitted.  The naming convention is that the true distribution (p or nb) is listed first, followed by the fit model (p, nb, or pod, for overdispersed).
&lt;pre&gt; 
ods select none;
ods output parameterestimates = ppests;
proc genmod data = nbp;
by ds;
model pois = x /dist=poisson;
run;

ods output parameterestimates = ppodests;
model pois = x /dist=poisson scale = p;

ods output parameterestimates = pnbests;
model pois = x /dist=negbin;

ods output parameterestimates = nbnbests;
model nb1 = x /dist=negbin;

ods output parameterestimates = nbpests;
model nb1 = x /dist=poisson;

ods output parameterestimates = nbpodests;
model nb1 = x /dist=poisson scale=p;
ods select all;
&lt;/pre&gt;
For analysis, we'll bring all the results together using the &lt;tt&gt;merge&lt;/tt&gt; statement (section 1.5.7).  Note that the output data sets contain the Wald CI limits as well as the estimates themselves; all have to be renamed in the merge, or they will overwrite each other.
&lt;pre&gt;
/* Combine the six parameter-estimate data sets side by side.  Each input has
   its estimate and Wald confidence limits renamed with a prefix identifying
   the true distribution and the fitted model, so the columns do not
   overwrite each other in the merge. */
data results;
merge
ppests (rename = (estimate = pp_est lowerwaldcl = pp_l 
  upperwaldcl = pp_u))
ppodests (rename = (estimate = ppod_est lowerwaldcl = ppod_l 
  upperwaldcl = ppod_u))
pnbests (rename = (estimate = pnb_est lowerwaldcl = pnb_l 
  upperwaldcl = pnb_u))
nbnbests (rename = (estimate = nbnb_est lowerwaldcl = nbnb_l 
  upperwaldcl = nbnb_u))
nbpests (rename = (estimate = nbp_est lowerwaldcl = nbp_l 
  upperwaldcl = nbp_u))
nbpodests (rename = (estimate = nbpod_est lowerwaldcl = nbpod_l 
  upperwaldcl = nbpod_u));
/* keep only the rows for the slope on x */
/* NOTE(review): there is no BY statement, so rows are paired by position;
   this presumably relies on every input having one x row per ds in the same
   order -- confirm */
where parameter eq "x";
/* true value of the slope used when simulating; change together with the data step */
target = 0;
/* rejection indicators: 1 when the Wald CI excludes target, 0 otherwise */
pprej =   ((pp_l gt target) or (pp_u lt target));
ppodrej =   ((ppod_l gt target) or (ppod_u lt target));
pnbrej =  ((pnb_l gt target) or (pnb_u lt target));
nbnbrej = ((nbnb_l gt target) or (nbnb_u lt target));
nbprej =  ((nbp_l gt target) or (nbp_u lt target));
nbpodrej =  ((nbpod_l gt target) or (nbpod_u lt target));
run;
&lt;/pre&gt;
The indicators of CI that exclude the null are calculated with appropriate names using logical tests that are 1 if true (rejections) and 0 if false. (See, e.g., section 1.4.9.) The final results can be obtained from &lt;tt&gt;proc means&lt;/tt&gt;
&lt;pre&gt;
proc means data = results; 
var pp_est ppod_est pnb_est nbnb_est nbp_est nbpod_est 
    pprej ppodrej pnbrej nbnbrej nbprej nbpodrej; 
run;

                     Variable             Mean
                     -------------------------
                     pp_est       -0.000349738
                     ppod_est     -0.000349738
                     pnb_est      -0.000344668
                     nbnb_est        0.0013738
                     nbp_est         0.0013588
                     nbpod_est       0.0013588
                     pprej           0.0505000
                     ppodrej         0.0501000
                     pnbrej          0.0468000
                     nbnbrej         0.0535000
                     nbprej          0.1427000
                     nbpodrej        0.0555000
                     -------------------------


&lt;/pre&gt;
All of the estimates appear to be unbiased.  However, Poisson regression, when applied to the truly negative binomial data, appears to be dramatically anticonservative, rejecting the null (i.e., with CI excluding the null value) 14% of the time.  The overdispersed model may be slightly biased as well.  The estimated proportion of rejections is 5.55%, or 555 of 10,000 experiments.  An exact CI for the proportion excludes 5%, here, although the anticonservative bias appears to be slight.  To test other effect sizes, we'd change the mean, set in the first &lt;tt&gt;data&lt;/tt&gt; step and the target in the &lt;tt&gt;results&lt;/tt&gt; data.  It would also be valuable to change the scale parameter for the negative binomial.
&lt;br&gt;
&lt;br&gt;
&lt;br&gt;

&lt;b&gt;R&lt;/b&gt;
&lt;br&gt;
We begin by defining two simple functions: one to extract the standard errors from a model, and the second to assess whether Wald-type CI for parameter estimates exclude some value.  It's a bit confusing that a standard error extracting function is not part of R.  Or perhaps it is, and someone will point out the obvious function in the comments.  It's useful to use the standard errors and construct the Wald CI in the current setting because the obvious alternative for constructing CI, the &lt;tt&gt;confint()&lt;/tt&gt; function, uses profile likelihoods, which would be too time-consuming in a simulation setting.  The second function accepts the parameter estimate, its standard error, and a fixed value which we want to know is in or out of the CI.  Both functions are actually single expressions, but having them in hand will reduce the typing in the main function.


&lt;pre&gt;
# this will work for any model object that works with vcov()
# the test for positive variance should be unnecessary but can't hurt
# Returns NA when the smallest diagonal element of vcov(model) is not positive.
# NOTE(review): ifelse() returns a result shaped like its test argument, and
# the test here is a single value, so only the FIRST element of
# sqrt(diag(vcov(model))) is returned (the intercept's standard error for a
# model with an intercept), not the whole vector -- confirm this is intended.
stderrs = function(model) {
  ifelse(min(diag(vcov(model))) &gt; 0, sqrt(diag(vcov(model))), NA)  
}

# short and sweet: 1 if target is out of Wald CI, 0 if in
# est, se and target are the estimate(s), standard error(s) and value(s) to
# check; the interval used is est plus or minus 1.96*se.  ifelse() works
# elementwise, so vectors of equal length give one indicator per element.
ciout = function(est, se, target){
  ifelse( (est - 1.96*se &gt; target) | (est + 1.96*se &lt; target), 1,0)
}
&lt;/pre&gt;
With these ingredients prepared, we're ready to write a function to fit the three models to the two sets of observed data.  The function will accept a number of observations per data set and a true beta.  The Poisson and overdispersed Poisson are fit with the &lt;tt&gt;glm()&lt;/tt&gt; function (section 4.1.3, table 4.1) but the negative binomial uses the &lt;tt&gt;glm.nb()&lt;/tt&gt; function found in the MASS package (section 4.1.5).
&lt;pre&gt;
testpnb = function(n, beta) {
# Simulate one data set of n observations with log-mean -0.25 + beta*x and
# draw both a Poisson and a negative binomial (size = 1) outcome from it.
# Fit Poisson, quasi-Poisson and negative binomial models to each outcome.
# Returns a matrix with 6 rows and 3 columns: the slope estimate, its
# standard error, and a 0/1 indicator that the Wald CI excludes beta.
# Row order: pp, ppod, pnb, nbnb, nbp, nbpod.
# Requires glm.nb() from the MASS package and the ciout() function.

# make data (n comes from the argument; it is no longer reset to 250 here)
x = rnorm(n)
mean = exp(-.25 + (beta * x))
pois = rpois(n,mean)
nb1 = rnbinom(n, size=1, mu=mean)

# fit models to Poisson data
pp = glm(pois ~x, family="poisson")
ppod = glm(pois ~x, family="quasipoisson")
pnb = glm.nb(pois~x)

# fit models to nb data
nbnb = glm.nb(nb1 ~x)
nbp = glm(nb1 ~x, family="poisson")
nbpod = glm(nb1 ~x, family="quasipoisson")

# extract parameter estimates using the coef() function
est = as.numeric(c(coef(pp)[2], coef(ppod)[2], coef(pnb)[2], coef(nbnb)[2], coef(nbp)[2], coef(nbpod)[2]))

# standard error of the slope (the second coefficient), matching est above.
# stderrs() is not used here: as written it returns only the first standard
# error (the intercept's), which would pair the slope with the wrong SE.
slope.se = function(model) {
  v = diag(vcov(model))[2]
  if (isTRUE(v &gt; 0)) sqrt(v) else NA
}
se = as.numeric(c(slope.se(pp), slope.se(ppod), slope.se(pnb), slope.se(nbnb), slope.se(nbp), slope.se(nbpod)))

# 1 if the Wald CI excludes the true beta, 0 otherwise
ci = ciout(est, se, rep(beta,6))
return(matrix(c(est,se,ci),ncol=3))
}
&lt;/pre&gt;
Now we can use the convenient &lt;tt&gt;replicate()&lt;/tt&gt; function to call the experiment many times.  Since the output of &lt;tt&gt;testpnb()&lt;/tt&gt; is a matrix, the result of &lt;tt&gt;replicate()&lt;/tt&gt; is a three-dimensional array, R * C * sheet, where sheet here corresponds to each experimental replicate.  To summarize the results, we can use the &lt;tt&gt;rowMeans()&lt;/tt&gt; function to get the proportion of rejections or the mean of the estimates. 
&lt;pre&gt;
mainout = replicate(10000,testpnb(250,0))

# the [,3,] below means "all rows, column 3, all sheets"
&gt; rowMeans(mainout[,3,])
[1] 0.0490 0.0514 0.0463 0.0490 0.1403 0.0493

&gt; rowMeans(mainout[,1,])
[1]  0.0003482834  0.0003482834  0.0003558526 -0.0004123949 -0.0003972441 -0.0003972441
&lt;/pre&gt;
The results agree completely with the SAS results discussed above.

The naive Poisson regression would appear a bad idea--if the data are negative binomial, tests don't have the nominal size.  It would be valuable to replicate the experiment with some other distribution for the real data as well.  One approach to modeling count data would be to fit the Poisson and assess the quality of the fit, which can be done in several ways.  However, this iterative fitting also jeopardizes the size of the test, in theory.  Perhaps we'll explore the practical impact of this in a future entry. Fortunately, at least in this limited example, a nice alternative exists: We can just fit the negative binomial by default.  The costs of this in terms of power could be assessed with a thorough simulation study, but are likely to be small, since only one additional parameter is estimated.  And the size of the test is hardly affected at all.  The quasi-Poisson model could also be recommended, but has the drawback of relying on what is actually not a viable distribution for the data.  Some sources suggest that it may be even more flexible than the negative binomial, however.
&lt;br&gt;
&lt;br&gt;

&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;
We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=FpsF4jn4OcA:mOfk_IL1DfY:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=FpsF4jn4OcA:mOfk_IL1DfY:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=FpsF4jn4OcA:mOfk_IL1DfY:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=FpsF4jn4OcA:mOfk_IL1DfY:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=FpsF4jn4OcA:mOfk_IL1DfY:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=FpsF4jn4OcA:mOfk_IL1DfY:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=FpsF4jn4OcA:mOfk_IL1DfY:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=FpsF4jn4OcA:mOfk_IL1DfY:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/FpsF4jn4OcA" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/6374829838861909116/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=6374829838861909116" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/6374829838861909116?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/6374829838861909116?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/FpsF4jn4OcA/example-106-should-poisson-regression.html" title="Example 10.6: Should Poisson regression ever be used? Negative binomial vs. Poisson regression" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>2</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/10/example-106-should-poisson-regression.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkUMQXkzfip7ImA9WhJaF00.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-7397095525993326182</id><published>2012-10-08T10:58:00.000-04:00</published><updated>2012-10-08T10:58:00.786-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-10-08T10:58:00.786-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="factor()" /><category scheme="http://www.blogger.com/atom/ns#" term="proc freq" /><category scheme="http://www.blogger.com/atom/ns#" term="as.numeric()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="convert categorical class to numeric" /><category scheme="http://www.blogger.com/atom/ns#" term="table()" /><category scheme="http://www.blogger.com/atom/ns#" term="set ds; by x;" /><title>Example 10.5: Convert a character-valued categorical variable to numeric</title><content type="html">&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/--rZEaB6qxM8/UGMvBct0aiI/AAAAAAAADs0/GRojQ2K50Zg/s1600/cat.JPG" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="93" width="178" src="http://3.bp.blogspot.com/--rZEaB6qxM8/UGMvBct0aiI/AAAAAAAADs0/GRojQ2K50Zg/s400/cat.JPG" /&gt;&lt;/a&gt;&lt;/div&gt;




In some settings it may be necessary to recode a categorical variable with character values into a variable with numeric values.  For example, the matching macro we discussed in &lt;a href="http://sas-and-r.blogspot.com/2010/05/example-735-propensity-score-matchingn.html"&gt;example 7.35&lt;/a&gt; will only match on numeric variables.  One way to convert character variables to numeric values is to determine which values exist, then write a possibly long series of conditional tests to assign numbers to the values.  Surely there's a better way?&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;
In SAS, Rick Wicklin offers an &lt;a href="http://blogs.sas.com/content/iml/2011/11/30/recoding-a-character-variable-as-numeric/"&gt;IML solution&lt;/a&gt; and links to a macro with the same function.  But if you're not an IML coder, and you don't want to investigate a macro solution, it's simple enough to do with data steps.

We'll begin by making some fake data.
&lt;br /&gt;
&lt;pre&gt;data test;
  do i = 1 to 100;
  cat = "meow";
  if i gt 30 then cat = "Purr";
  if i gt 70 then cat = "Hiss";
  output;
  end;
run;
&lt;/pre&gt;
To make the new variable, we'll just sort (section 1.5.6) the data on the categorical variable we want to convert, then use the &lt;tt&gt;set ds; by x;&lt;/tt&gt; syntax to keep track of when a new value is encountered in the data.  It's hard to believe that we've never demonstrated this useful syntax before-- perhaps we just can't find it today.  The &lt;tt&gt;set ds; by x;&lt;/tt&gt; syntax makes new temporary variables &lt;tt&gt;first.x&lt;/tt&gt; and &lt;tt&gt;last.x&lt;/tt&gt; that are equal to 1 for the first and last observations of each new level of &lt;tt&gt;x&lt;/tt&gt;, respectively, and 0 otherwise.  When we find a new value, we'll increase a counter by 1; the counter is our new numeric-valued variable.
&lt;br /&gt;
&lt;pre&gt;proc sort data = test; by cat; run;

data catize;
set test;
by cat;
retain catnum 0;
if first.cat then catnum = catnum + 1;
run;

/* check the result */
proc freq data = catize;
tables cat * catnum;
run;
&lt;/pre&gt;
The table also shows the recoding values.
&lt;br /&gt;
&lt;pre&gt;                             Table of cat by catnum

                  cat       catnum

                  Frequency|
                  Percent  |
                  Row Pct  |
                  Col Pct  |       1|       2|       3|  Total
                  ---------+--------+--------+--------+
                  Hiss     |     30 |      0 |      0 |     30
                           |  30.00 |   0.00 |   0.00 |  30.00
                           | 100.00 |   0.00 |   0.00 |
                           | 100.00 |   0.00 |   0.00 |
                  ---------+--------+--------+--------+
                  Purr     |      0 |     40 |      0 |     40
                           |   0.00 |  40.00 |   0.00 |  40.00
                           |   0.00 | 100.00 |   0.00 |
                           |   0.00 | 100.00 |   0.00 |
                  ---------+--------+--------+--------+
                  meow     |      0 |      0 |     30 |     30
                           |   0.00 |   0.00 |  30.00 |  30.00
                           |   0.00 |   0.00 | 100.00 |
                           |   0.00 |   0.00 | 100.00 |
                  ---------+--------+--------+--------+
                  Total          30       40       30      100
                              30.00    40.00    30.00   100.00

&lt;/pre&gt;
&lt;br /&gt;
&lt;b&gt;R&lt;/b&gt;
&lt;br /&gt;
We begin by making the data.  To convert to numbers, we use the &lt;tt&gt;labels&lt;/tt&gt; option to the &lt;tt&gt;factor()&lt;/tt&gt; function, feeding it the sequence of numbers between 1 and however many different values there are.  Note that we find this using the &lt;tt&gt;factor()&lt;/tt&gt; function again.  There's probably a better way of doing this, but it's a little bit amusing to code it this way.  Then we have numbers, but they're stored as a factor.  We can get them out with a call to &lt;tt&gt;as.numeric()&lt;/tt&gt;.
&lt;br /&gt;
&lt;pre&gt;cat = c(rep("meow",30),rep("Hiss",30), rep("Purr", 40))
catn1 = factor(cat, labels=(1:length(levels(factor(cat)))))
catn = as.numeric(catn1)
table(catn,cat)

    cat
catn Hiss meow Purr
   1   30    0    0
   2    0   30    0
   3    0    0   40

&lt;/pre&gt;
There's a warning in the documentation for &lt;tt&gt;factor()&lt;/tt&gt; that the values are assigned in locale-specific fashion, so the table should be used to establish how the codes were assigned.&amp;nbsp; For the record, the use cases for this kind of recoding in R may be more strained than the SAS example given above.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;
We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pHSdXnE1U_g:6sRUPUQiiMw:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pHSdXnE1U_g:6sRUPUQiiMw:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pHSdXnE1U_g:6sRUPUQiiMw:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pHSdXnE1U_g:6sRUPUQiiMw:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pHSdXnE1U_g:6sRUPUQiiMw:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=pHSdXnE1U_g:6sRUPUQiiMw:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pHSdXnE1U_g:6sRUPUQiiMw:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=pHSdXnE1U_g:6sRUPUQiiMw:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/pHSdXnE1U_g" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/7397095525993326182/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=7397095525993326182" title="3 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/7397095525993326182?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/7397095525993326182?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/pHSdXnE1U_g/example-105-convert-character-valued.html" title="Example 10.5: Convert a character-valued categorical variable to numeric" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/--rZEaB6qxM8/UGMvBct0aiI/AAAAAAAADs0/GRojQ2K50Zg/s72-c/cat.JPG" height="72" width="72" /><thr:total>3</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/10/example-105-convert-character-valued.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkcCR38_eyp7ImA9WhJaEk0.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-6441609191224498915</id><published>2012-10-01T12:18:00.000-04:00</published><updated>2012-10-02T16:01:06.143-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-10-02T16:01:06.143-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="symbol statement" /><category 
scheme="http://www.blogger.com/atom/ns#" term="replicate()" /><category scheme="http://www.blogger.com/atom/ns#" term="a*b=c syntax" /><category scheme="http://www.blogger.com/atom/ns#" term="t()" /><category scheme="http://www.blogger.com/atom/ns#" term="hilo interpolation" /><title>Example 10.4: Multiple comparisons and confidence limits</title><content type="html">&lt;a href="http://4.bp.blogspot.com/-QU--sM2LKQw/UFNAD2qLeeI/AAAAAAAADsQ/ZqkpAL3v9Hw/s1600/CIsimsr.jpeg"&gt;&lt;img alt="" border="0" id="BLOGGER_PHOTO_ID_5788040381057432034" src="http://4.bp.blogspot.com/-QU--sM2LKQw/UFNAD2qLeeI/AAAAAAAADsQ/ZqkpAL3v9Hw/s1600/CIsimsr.jpeg" style="cursor: hand; cursor: pointer; display: block; height: 450px; margin: 0px auto 10px; text-align: center; width: 600px;" /&gt;&lt;/a&gt;&lt;br /&gt;
A colleague is a devotee of confidence intervals.  To him, the CI have the magical property that they are immune to the multiple comparison problem-- in other words, he feels it's OK to look at a bunch of 95% CI and focus on the ones that appear to exclude the null.  This even though he knows well the one-to-one relationship between 95% CIs that exclude the null and p-values below 0.05.  &lt;br /&gt;
&lt;br /&gt;
Today, we'll create a Monte Carlo experiment to  demonstrate that fishing by CI is just as dangerous as fishing by p-value; generating the image above.  We'll do this by replicating a bivariate experiment 100 times.&amp;nbsp; Later, we'll examine the results of a single experiment with many predictors.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;
To begin with, we'll write a function to generate a single experiment, using a logistic regression.  This is a simple modification of one of our &lt;a href="http://sas-and-r.blogspot.com/2009/06/example-72-simulate-data-from-logistic.html"&gt;first&lt;/a&gt; and &lt;a href="http://sas-and-r.blogspot.com/2012/07/third-year-wrap-up.html"&gt;most popular &lt;/a&gt;entries.&lt;br /&gt;
&lt;pre&gt;simci = function(){
# Simulate one logistic regression experiment under the null and
# return the Wald 95% CI for the odds ratio of xtest, as c(lower, upper)
  intercept = 0
  beta = 0
# beta = 0 because we're simulating under the null 
# make the variance of x in this experiment vary a bit
  xtest = rnorm(1000) * runif(1,.6,1.4)
  linpred = intercept + xtest*beta
# inverse logit of the linear predictor gives P(ytest = 1)
  prob = exp(linpred)/(1 + exp(linpred))
  runis = runif(1000)
  ytest = ifelse(runis &amp;lt; prob,1,0)
# now, fit the model
  fit = glm(ytest~xtest,family=binomial(link="logit"))
# the standard error of the estimates is easiest to find in the
# coefficient matrix of the model summary: as used below, column 1
# holds the estimates and column 2 the standard errors; row 2 is xtest
  pe = summary(fit)$coefficients
# calculate the Wald CI; an alternative would be confint(), but
# that calculates profile CI, which take longer to generate
# exponentiate the limits to move from the log-odds to the odds ratio scale
  ci = exp(c(pe[2,1] - 1.96*pe[2,2], pe[2,1] + 1.96*pe[2,2] ))
  return(ci)
}&lt;/pre&gt;
&lt;br /&gt;
Then we can use the &lt;tt&gt;replicate()&lt;/tt&gt; function to repeat the experiment 100 times.  The &lt;tt&gt;t()&lt;/tt&gt; function (section 1.9.2) transposes the resulting matrix to have one row per experiment.&lt;br /&gt;
&lt;pre&gt;sim100 = t(replicate(100,simci()))

plot(x = sim100[,1], y = 1:100, 
  xlim = c(min(sim100),max(sim100)), type="n")
segments(y0=1:100,x0=sim100[,1],y1 = 1:100,x1=sim100[,2], 
  col = ifelse(sim100[,1]&gt;1 | sim100[,2]&lt;1,"red","black"))
abline(v=1)
&lt;/pre&gt;
&lt;br /&gt;
The result is shown at the top.  In the code, we set the limits of the x-axis by finding the max and min across the whole matrix-- this is a little wasteful of CPU cycles, but saves some typing.  The &lt;tt&gt;segments()&lt;/tt&gt; function (see &lt;a href="http://sas-and-r.blogspot.com/2010/11/example-813-bike-ride-plot-part-2.html"&gt;example 8.13&lt;/a&gt;) is a vector-enabled line-drawer.  Here we draw a line from the lower CI limit to the upper, giving the experiment number as the y value for each.  We assign a red plot line if the CI excludes 1, using the &lt;tt&gt;ifelse()&lt;/tt&gt; function (section 1.11.2), a vectorwise logic test.  Finally, a reference line helps the viewer see how far the end of the CI is from the null.  We omit prettying up the axis labels.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;
In SAS, considerably more lines are required.  We begin by simulating the data, as in &lt;a href="http://sas-and-r.blogspot.com/2009/06/example-72-simulate-data-from-logistic.html"&gt;example 7.2&lt;/a&gt;.  The modifications are to generate 100 examples with an outside &lt;tt&gt;do&lt;/tt&gt; loop (section 1.11.1) and the random element added to the variance. &lt;br /&gt;
&lt;pre&gt;data simci;
beta = 0;
intercept = 0;
do sim = 1 to 100;   /* outer loop */
  xvar = (uniform(0) *.8) + .6;  /* variance != 1 */
  do i = 1 to 1000;
    xtest = normal(0) * xvar;
    linpred = intercept + (xtest * beta);
    prob =  exp(linpred)/(1 + exp(linpred));
    ytest = (uniform(0) &amp;lt; prob);
    output;
  end;
end;
run;&lt;/pre&gt;
&lt;br /&gt;
Then we fit the logistic regression.  We leave in the &lt;tt&gt;ods trace&lt;/tt&gt; commands (section A.7.1) to remind you how to find the SAS names of the output elements, needed to save the results in the &lt;tt&gt;ods output&lt;/tt&gt; statement.  The CI for the odds ratios are requested with the &lt;tt&gt;clodds&lt;/tt&gt; option to the &lt;tt&gt;model&lt;/tt&gt; statement, which also accepts a &lt;tt&gt;pl&lt;/tt&gt; value for a profile likelihood based interval.&lt;br /&gt;
&lt;pre&gt;*ods trace on/listing; 
ods select none;
ods output cloddswald = lrci;
proc logistic data = simci;
by sim;
model ytest(event='1')=xtest / clodds=wald;
run;
*ods trace off;
ods select all;&lt;/pre&gt;
&lt;br /&gt;
Our plotting approach will require the "long" data set style, with two rows for each experiment.  We'll generate that while checking whether the null is excluded from the CI.&lt;br /&gt;
&lt;pre&gt;data lrp2;
set lrci;
red = 0;
if lowercl &amp;gt; 1 or uppercl &amp;lt; 1 then red = 1;
point = lowercl; output;
point = uppercl; output;
run;&lt;/pre&gt;
&lt;br /&gt;
Finally, we're ready to make the graphic.  We use the &lt;tt&gt;hilob&lt;/tt&gt; interpolation to connect the upper and lower CI for each experiment; the &lt;tt&gt;b&lt;/tt&gt; requests bars instead of a line, and the &lt;tt&gt;bwidth&lt;/tt&gt; option specifies a very narrow bar.  These options prevent the default plotting of the "mean" value with a tick.  The &lt;tt&gt;a*b=c&lt;/tt&gt; syntax (section 5.2.2) allows the different line colors.&lt;br /&gt;
&lt;pre&gt;symbol1 i=hilob bwidth=.05 c=black;
symbol2 i=hilob bwidth=.05 c=red;
proc gplot data = lrp2;
plot point * sim = red / vref = 1;
run;quit;&lt;/pre&gt;
&lt;br /&gt;
The result is just below.  The vertical alignment seen in the R plot seems more natural, but this would not be possible with the &lt;tt&gt;hilo&lt;/tt&gt; interpolation.  As theory and logic would suggest, quite a few of the hundred simulated CI exclude the null, sometimes by a large proportion of the CI width.&lt;br /&gt;
&lt;a href="http://2.bp.blogspot.com/-BNG_oubRkzI/UFNAqavHvqI/AAAAAAAADsc/PcNYJojmSJg/s1600/CIsimsSAS.jpg"&gt;&lt;img alt="" border="0" id="BLOGGER_PHOTO_ID_5788041043576864418" src="http://2.bp.blogspot.com/-BNG_oubRkzI/UFNAqavHvqI/AAAAAAAADsc/PcNYJojmSJg/s1600/CIsimsSAS.jpg" style="cursor: hand; cursor: pointer; display: block; height: 387px; margin: 0px auto 10px; text-align: center; width: 600px;" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=eGc9S_e3jIw:ZYGTQkxnmBg:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=eGc9S_e3jIw:ZYGTQkxnmBg:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=eGc9S_e3jIw:ZYGTQkxnmBg:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=eGc9S_e3jIw:ZYGTQkxnmBg:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=eGc9S_e3jIw:ZYGTQkxnmBg:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=eGc9S_e3jIw:ZYGTQkxnmBg:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=eGc9S_e3jIw:ZYGTQkxnmBg:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=eGc9S_e3jIw:ZYGTQkxnmBg:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/eGc9S_e3jIw" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/6441609191224498915/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=6441609191224498915" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/6441609191224498915?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/6441609191224498915?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/eGc9S_e3jIw/example-104-multiple-comparisons-and.html" title="Example 10.4: Multiple comparisons and confidence limits" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-QU--sM2LKQw/UFNAD2qLeeI/AAAAAAAADsQ/ZqkpAL3v9Hw/s72-c/CIsimsr.jpeg" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/10/example-104-multiple-comparisons-and.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEMDSHo8eCp7ImA9WhJbF0U.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-6643829031870287640</id><published>2012-09-24T12:32:00.001-04:00</published><updated>2012-09-27T17:47:59.470-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-09-27T17:47:59.470-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="layout()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="scatterplot" /><category scheme="http://www.blogger.com/atom/ns#" term="histogram" /><category scheme="http://www.blogger.com/atom/ns#" term="par()" /><category scheme="http://www.blogger.com/atom/ns#" term="layout.show()" /><title>Example 10.3: Enhanced scatterplot with marginal histograms</title><content type="html">&lt;a href="http://4.bp.blogspot.com/-ETTI-zTtJzs/UFDEKMpBKxI/AAAAAAAADr0/I9riju5CZvE/s1600/withlines.jpeg"&gt;&lt;img alt="" border="0" id="BLOGGER_PHOTO_ID_5787341200642091794" src="http://4.bp.blogspot.com/-ETTI-zTtJzs/UFDEKMpBKxI/AAAAAAAADr0/I9riju5CZvE/s1600/withlines.jpeg" style="cursor: hand; cursor: pointer; display: block; height: 450px; margin: 0px auto 10px; text-align: center; width: 600px;" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
Back in &lt;a href="http://sas-and-r.blogspot.com/2011/06/example-841-scatterplot-with-marginal.html"&gt;example 8.41&lt;/a&gt; we showed how to make a graphic combining a scatterplot with histograms of each variable.  A commenter suggested we change the R graphic to allow post-hoc plotting of, for example, lowess lines.  In addition, there are further refinements to be made.&lt;br /&gt;
&lt;br /&gt;
In this R-only entry, we'll make the figure more flexible and a bit more robust.  See the example linked above for SAS code, or check out &lt;a href="http://blogs.sas.com/content/iml/2011/05/20/how-to-create-a-scatter-plot-with-marginal-histograms-in-sas/"&gt;Rick Wicklin&lt;/a&gt; discussing the same subject-- Rick gives some additional resources.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;
The R code relies heavily on the &lt;tt&gt;layout()&lt;/tt&gt; function.  We discussed it &lt;a href="http://sas-and-r.blogspot.com/2012/09/example-102-custom-graphic-layouts.html"&gt;last time&lt;/a&gt; in a simpler setting with only one column of plots.  The goal for the current plot is to enable a title for the whole figure-- this ought to be centered over the whole graphic-- and x- and y-axis labels.  In the previous version, there was no title to the page at all and the axis titles would occasionally fail.  To do this, we need a layout with a single cell at the top for the whole width of the graphic, a tall narrow cell at the left for the y-axis title, only in the bottom row, and a thin cell at the bottom, only on the left, for the x-axis title.  This turns out to be fairly simple with &lt;tt&gt;layout()&lt;/tt&gt; and the results can be checked with &lt;tt&gt;layout.show()&lt;/tt&gt;.&lt;br /&gt;
&lt;pre&gt;
zones &lt;- matrix(c(1,1,1, 
                  0,5,0, 
                  2,6,4, 
                  0,3,0), ncol = 3, byrow = TRUE)
layout(zones, widths=c(0.3,4,1), heights = c(1,3,10,.75))
layout.show(n=6)
&lt;/pre&gt;
&lt;br /&gt;
The matrix input tells R to make the whole first row a single plot area, and that this will be the first thing plotted.  The corners of the remaining 3*3 plot cells will be empty.  The numbers in the matrix give the order in which the plot cells will be filled.  This matrix is the key input to &lt;tt&gt;layout()&lt;/tt&gt;, where we use the remaining options to give the relative widths and heights of the cells.  It's possible to do this in the abstract, but it is helpful to draw the intended layout first, then test whether the intended design was achieved using the &lt;tt&gt;layout.show()&lt;/tt&gt; function.  The result is shown below.  Putting the scatterplot in last will be useful for adding to it post hoc.&lt;br /&gt;
&lt;a href="http://2.bp.blogspot.com/-rLD4Y7DbVMw/UFC56XCKkjI/AAAAAAAADqs/msvAffyRVi8/s1600/layout.jpeg"&gt;&lt;img alt="" border="0" id="BLOGGER_PHOTO_ID_5787329933437735474" src="http://2.bp.blogspot.com/-rLD4Y7DbVMw/UFC56XCKkjI/AAAAAAAADqs/msvAffyRVi8/s400/layout.jpeg" style="cursor: hand; cursor: pointer; display: block; height: 300px; margin: 0px auto 10px; text-align: center; width: 400px;" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
With that in hand, it's time to make a function.  In generating last week's example, we noted that the layout persists-- that is, the graphics area retains the layout until you shut the graphics device or restore the old parameters.  In the new plot, we'll add an option to revert to the old parameters (by default) or retain them.  The latter option would be desirable, if, as suggested by a commenter, you wanted to add items to the scatterplot after generating the plot.  We also add an option to allow different sized plot symbols. &lt;br /&gt;
&lt;pre&gt;
# scatterhist: scatterplot of y against x with marginal histograms,
# plus a figure title and axis titles drawn in their own layout cells.
#   x, y       data vectors; also passed to hist() for the margins
#   xlab, ylab axis titles, drawn as text in layout cells 3 and 2
#   plottitle  title drawn in layout cell 1, across the top row
#   xsize      cex (symbol size) for the scatterplot points
#   cleanup    if TRUE, restore the par() settings saved on entry;
#              use FALSE to keep the layout and add to the scatterplot
#   ...        further arguments passed to plot() for the scatterplot
scatterhist &lt;- function(x, y, xlab = "", ylab = "", plottitle="", 
                        xsize=1, cleanup=TRUE,...){
  # save the old graphics settings-- they may be needed
  def.par &lt;- par(no.readonly = TRUE)
  
  # cell numbers give the plotting order; 0 marks a cell left empty:
  #   1 = title (whole top row), 2 = y-axis title, 3 = x-axis title,
  #   4 = histogram of y (right), 5 = histogram of x (above), 6 = scatterplot
  zones &lt;- matrix(c(1,1,1, 0,5,0, 2,6,4, 0,3,0), ncol = 3, byrow = TRUE)
  layout(zones, widths=c(0.3,4,1), heights = c(1,3,10,.75))
  
  # tuning to plot histograms nicely
  xhist &lt;- hist(x, plot = FALSE)
  yhist &lt;- hist(y, plot = FALSE)
  # largest bin count in either histogram, so both share one count scale
  top &lt;- max(c(xhist$counts, yhist$counts))
  
  # for all three titles: 
  #   drop the axis titles and omit boxes, set up margins
  par(xaxt="n", yaxt="n",bty="n",  mar = c(.3,2,.3,0) +.05)
  # each title cell is an empty plot on [-1,1] x [-1,1] with the
  # text centered at the origin
  # fig 1 from the layout
  plot(x=1,y=1,type="n",ylim=c(-1,1), xlim=c(-1,1))
  text(0,0,paste(plottitle), cex=2)
  # fig 2
  plot(x=1,y=1,type="n",ylim=c(-1,1), xlim=c(-1,1))
  # srt=90 rotates the y-axis title to read bottom-to-top
  text(0,0,paste(ylab), cex=1.5, srt=90)
  # fig 3
  plot(x=1,y=1,type="n",ylim=c(-1,1), xlim=c(-1,1))
  text(0,0,paste(xlab), cex=1.5)
  
  # fig 4, the first histogram, needs different margins
  # no margin on the left
  # (this is the histogram of y, drawn with horizontal bars)
  par(mar = c(2,0,1,1))
  barplot(yhist$counts, axes = FALSE, xlim = c(0, top),
          space = 0, horiz = TRUE)
  # fig 5, other histogram needs no margin on the bottom
  par(mar = c(0,2,1,1))
  barplot(xhist$counts, axes = FALSE, ylim = c(0, top), space = 0)
  # fig 6, finally, the scatterplot-- needs regular axes, different margins
  par(mar = c(2,2,.5,.5), xaxt="s", yaxt="s", bty="n")
  # this color allows transparency &amp; overplotting-- useful if a lot of points
  # (black with a low alpha value, in #RRGGBBAA form)
  plot(x, y , pch=19, col="#00000022", cex=xsize, ...)
  
  # reset the graphics, if desired 
  # NOTE(review): the settings are only restored when this line is reached;
  # if an earlier call stops with an error they stay changed-- on.exit()
  # may be worth considering
  if(cleanup) {par(def.par)}
}
&lt;/pre&gt;
&lt;br /&gt;
To test this, we'll generate some data and try it out.  The results are immediately below; I like this example to help demonstrate that it's not the marginal normality of the data that matter.&lt;br /&gt;
&lt;pre&gt;
x=rexp(1000)
y = x^2 + rnorm(1000)
scatterhist(x[x&lt;4], y[x&lt;4], ylab="This is x", xlab="This is y",   
  "Revised scatterhist", xsize =2)
&lt;/pre&gt;
&lt;br /&gt;
&lt;a href="http://4.bp.blogspot.com/-Za7EAueb0VQ/UFDEJ45Sk7I/AAAAAAAADro/Cjuj2ja96GA/s1600/simple.jpeg"&gt;&lt;img alt="" border="0" id="BLOGGER_PHOTO_ID_5787341195341632434" src="http://4.bp.blogspot.com/-Za7EAueb0VQ/UFDEJ45Sk7I/AAAAAAAADro/Cjuj2ja96GA/s1600/simple.jpeg" style="cursor: hand; cursor: pointer; display: block; height: 450px; margin: 0px auto 10px; text-align: center; width: 600px;" /&gt;&lt;/a&gt;&lt;br /&gt;
But let's take advantage of the ability to add curves to the scatterplot.&lt;br /&gt;
&lt;pre&gt;
x=rexp(1000)
y = x^2 + rnorm(1000)
scatterhist(x[x&lt;4], y[x&lt;4], ylab="This is x", xlab="This is y",   
  "Revised scatterhist", xsize =2, cleanup=FALSE)
abline(lm(y~x))
lines(lowess(x,y))
&lt;/pre&gt;
&lt;br /&gt;
The results are shown at the top-- we can do anything with the scatterplot that we'd be able to do if there were no &lt;tt&gt;layout()&lt;/tt&gt; in effect.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=19xF6IAH3MI:1DmSazeUBs4:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=19xF6IAH3MI:1DmSazeUBs4:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=19xF6IAH3MI:1DmSazeUBs4:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=19xF6IAH3MI:1DmSazeUBs4:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=19xF6IAH3MI:1DmSazeUBs4:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=19xF6IAH3MI:1DmSazeUBs4:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=19xF6IAH3MI:1DmSazeUBs4:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=19xF6IAH3MI:1DmSazeUBs4:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/19xF6IAH3MI" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/6643829031870287640/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=6643829031870287640" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/6643829031870287640?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/6643829031870287640?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/19xF6IAH3MI/example-103-enhanced-scatterplot-with.html" title="Example 10.3: Enhanced scatterplot with marginal histograms" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-ETTI-zTtJzs/UFDEKMpBKxI/AAAAAAAADr0/I9riju5CZvE/s72-c/withlines.jpeg" height="72" width="72" /><thr:total>2</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/09/example-103-enhanced-scatterplot-with.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0cGQX85eyp7ImA9WhJUGEQ.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-1374996933375369633</id><published>2012-09-17T11:17:00.004-04:00</published><updated>2012-09-17T11:17:00.123-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-09-17T11:17:00.123-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="layout()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="ceil" /><category scheme="http://www.blogger.com/atom/ns#" term="custom graphics layout" /><category scheme="http://www.blogger.com/atom/ns#" term="proc sgpanel" /><category scheme="http://www.blogger.com/atom/ns#" term="graphics" /><category scheme="http://www.blogger.com/atom/ns#" term="mod" /><title>Example 10.2: Custom graphic layouts</title><content type="html">&lt;a href="http://4.bp.blogspot.com/-4ssnJ9Ucx3o/UEevmMWuDrI/AAAAAAAADqM/1mHPI4CzD18/s1600/one%2Bhour%2Br.jpeg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 300px;" src="http://4.bp.blogspot.com/-4ssnJ9Ucx3o/UEevmMWuDrI/AAAAAAAADqM/1mHPI4CzD18/s1600/one%2Bhour%2Br.jpeg" border="0" alt=""id="BLOGGER_PHOTO_ID_5784785317067099826" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;In &lt;a href="http://sas-and-r.blogspot.com/2012/09/example-101-read-file-byte-by-byte.html"&gt;example 10.1&lt;/a&gt; we introduced data from a CPAP machine.  In brief, it's hard to tell exactly what's being recorded in the data set, but it seems to be related to the pattern of breathing.  Measurements are taken five times a second, leading to on the order of 100,000 data points in a typical night.  To get a visual sense of what a night's breathing looks like is therefore non-trivial.&lt;br /&gt;&lt;br /&gt;Today, we'll make the graphic shown above, which presents an hour of data.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;In SAS, the &lt;tt&gt;sgpanel&lt;/tt&gt; procedure (section 5.1.11) will produce a similar graphic pretty easily.  But we need to make a data set with indicators of the hour, and of ten-minute blocks within the hour.  This we'll do with the &lt;tt&gt;ceil&lt;/tt&gt; function (section 1.8.4). 
&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data cycles2;&lt;br /&gt;set cycles;&lt;br /&gt;hour = ceil(time_min/60);&lt;br /&gt;tenmin = ceil(time_min/10);&lt;br /&gt;time_in_ten = mod(time_min - 1/300,10);&lt;br /&gt;/* 1/300 adjustment keeps last measure in the correct &lt;br /&gt;        10-min block */&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;title "Hour 4 of pressure";&lt;br /&gt;proc sgpanel data = cycles2;&lt;br /&gt;where hour eq 4;&lt;br /&gt;panelby tenmin / layout=rowlattice rows=6 spacing = 4;&lt;br /&gt;colaxis display=none;&lt;br /&gt;rowaxis display = (nolabel);&lt;br /&gt;series x = time_in_ten y = byte;&lt;br /&gt;run; quit;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The resulting plot is shown below.  It would be nicer to omit the labels on the right of each plot, but this does not appear to be an option.  It would likely only be possible with a fair amount of effort. &lt;br /&gt;&lt;a href="http://1.bp.blogspot.com/-b5Q4Ts9n76I/UEeuTSlKkyI/AAAAAAAADp0/MeLnpf2X9m4/s1600/one%2Bhour%2BSAS.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 300px; height: 400px;" src="http://1.bp.blogspot.com/-b5Q4Ts9n76I/UEeuTSlKkyI/AAAAAAAADp0/MeLnpf2X9m4/s1600/one%2Bhour%2BSAS.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5784783892809159458" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;In R, we'll use the &lt;tt&gt;layout()&lt;/tt&gt; function to make a 7-row layout-- one for the title and 6 for the 10-minute blocks of time.  Before we get there, though, we'll construct a function to fill the time block plots with input data.  The function accepts a data vector and plots only 3,000 values from it, choosing the values based on an input hour and 10-minute block within the hour.  To ensure an equal y-axis range for each call, we'll also send minimum and maximum values as input to the function.  
All of this will be fed into &lt;tt&gt;plot()&lt;/tt&gt; with the &lt;tt&gt;type="l"&lt;/tt&gt; option to make a line plot.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;plot10 = function(hour, tenmins, miny, maxy, data=cycles){&lt;br /&gt;   start = hour*18000 + tenmins* 3000 +1 &lt;br /&gt;   plot((1:3000)/300, cycles[(start + 1):(start +3000)], &lt;br /&gt;            ylim = c(miny,maxy),type="l", xaxs="i", yaxs="i")&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The documentation for &lt;tt&gt;layout()&lt;/tt&gt; is rather opaque, so we'll review it separately.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;oldpar = par(no.readonly = TRUE)&lt;br /&gt;# revert to this later &lt;br /&gt;&lt;br /&gt;layout(matrix(1:7), widths=1, heights=c(3,8,8,8,8,8,8), respect=FALSE)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The &lt;tt&gt;layout()&lt;/tt&gt; function divides the plot area into a matrix of cells, some of which will be filled by the next output plots.  The first argument says where in the matrix the next N objects will go.  All the integers 1...N must appear in the matrix; cells that will be left empty have a 0 instead.  Here, we have no empty cells, and only one column, so the "matrix" is really just a vector with 1...7 in order.  The &lt;tt&gt;widths&lt;/tt&gt; option specifies the relative widths of the columns-- here we have only one column so any constant will result in the use of the whole width of the output area.  Similarly, the &lt;tt&gt;heights&lt;/tt&gt; option gives the relative height of the cells.  Here the title will get 3/51 of the height, while each 10-minute block will get 8/51.  This unequal shape of the plot regions is one reason to prefer &lt;tt&gt;layout()&lt;/tt&gt; to some other ways to plot multiple images on a page.  The &lt;tt&gt;respect&lt;/tt&gt; option, when "TRUE" makes the otherwise relative widths and heights conform, so that a unit of height is equal to a unit of width.  
We also use &lt;tt&gt;layout()&lt;/tt&gt; in &lt;a href="http://sas-and-r.blogspot.com/2011/06/example-841-scatterplot-with-marginal.html"&gt;example 8.41&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;With the layout in hand, we're ready to fill it.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;par(xaxt="n",  mar = c(.3,2,.3,0) +.05)&lt;br /&gt;# drop the x-axis, change the spacing around the plot&lt;br /&gt;plot(x=1,y=1,type="n",ylim=c(-1,1), xlim=c(-1,1), yaxt="n",bty="n")&lt;br /&gt;# the first (narrow) plot is just empty&lt;br /&gt;hour=3&lt;br /&gt;text(0,0,paste("Hour ", (hour + 1), " of pressure data"), cex=2)&lt;br /&gt;# text to put in the first plot&lt;br /&gt;miny = min(cycles[(hour * 18000 + 1):((hour + 1) * 18000)])&lt;br /&gt;maxy = max(cycles[(hour * 18000 + 1):((hour + 1) * 18000)])&lt;br /&gt;# find min and max across the whole hour, to keep range &lt;br /&gt;# of y-axis constant across the plots&lt;br /&gt;for (x in 0:5) plot10(hour, x, miny, maxy)&lt;br /&gt;# plot the 6 ten-minute blocks&lt;br /&gt;par(oldpar)&lt;br /&gt;# reset the graphics options&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The resulting plot is shown at the top of the entry.  There's clearly something odd going on around 11-15 minutes into the hour-- this could be a misadjusted mask, or a real problem with the breathing.  There's also a period around 58 minutes when it looks like breathing stops.  That's what the machine is meant to stop.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  
&lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=_ImrLBd7wtI:DEvxDDpAK5Y:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=_ImrLBd7wtI:DEvxDDpAK5Y:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=_ImrLBd7wtI:DEvxDDpAK5Y:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=_ImrLBd7wtI:DEvxDDpAK5Y:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=_ImrLBd7wtI:DEvxDDpAK5Y:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=_ImrLBd7wtI:DEvxDDpAK5Y:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=_ImrLBd7wtI:DEvxDDpAK5Y:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=_ImrLBd7wtI:DEvxDDpAK5Y:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/_ImrLBd7wtI" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/1374996933375369633/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=1374996933375369633" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/1374996933375369633?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/1374996933375369633?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/_ImrLBd7wtI/example-102-custom-graphic-layouts.html" title="Example 10.2: Custom graphic layouts" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-4ssnJ9Ucx3o/UEevmMWuDrI/AAAAAAAADqM/1mHPI4CzD18/s72-c/one%2Bhour%2Br.jpeg" height="72" width="72" /><thr:total>2</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/09/example-102-custom-graphic-layouts.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUMNQXszeyp7ImA9WhJUEkQ.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-8116133937688074310</id><published>2012-09-10T11:16:00.003-04:00</published><updated>2012-09-10T13:18:10.583-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-09-10T13:18:10.583-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="readBin()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="file.info()" /><category scheme="http://www.blogger.com/atom/ns#" term="sleep apnea" /><category scheme="http://www.blogger.com/atom/ns#" term="CPAP" /><category scheme="http://www.blogger.com/atom/ns#" term="SD card" /><category scheme="http://www.blogger.com/atom/ns#" term="remainder" /><category scheme="http://www.blogger.com/atom/ns#" term="Philips" /><category scheme="http://www.blogger.com/atom/ns#" term="read data by byte" /><category scheme="http://www.blogger.com/atom/ns#" term="file()" /><category scheme="http://www.blogger.com/atom/ns#" term="mod function" /><category scheme="http://www.blogger.com/atom/ns#" term="input statement" /><title>Example 10.1: Read a file byte by byte</title><content type="html">&lt;a href="http://3.bp.blogspot.com/-V0EABxkMM-g/UEeNGHCF-oI/AAAAAAAADpg/v8lzmzPz8AI/s1600/simple%2Bplot%2BSAS.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 300px;" src="http://3.bp.blogspot.com/-V0EABxkMM-g/UEeNGHCF-oI/AAAAAAAADpg/v8lzmzPz8AI/s400/simple%2Bplot%2BSAS.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5784747382487251586" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;More and more makers of electronic devices use standard storage media to record data.  Sometimes this is central to the device's function, as in a camera, so that the data must be easy to recover.  Other times, it's effectively incidental, and the device maker may not provide easy access to the stored data.  &lt;br /&gt;&lt;br /&gt;For example, I recently was prescribed a constant positive air pressure (CPAP) machine for &lt;a href="http://en.wikipedia.org/wiki/Sleep_apnea"&gt;sleep apnea&lt;/a&gt;.  My machine, made by Philips, records data onto a SD card.  The card's file system is readable, but Philips provides neither software to read the files, nor a data dictionary to explain what they contain.  
(I believe Philips sells software that does read it, for ludicrous prices, to physicians who prescribe their machines.  Nice racket.)  &lt;br /&gt;&lt;br /&gt;If you open the files on the card as ASCII files, you get a bunch of gobbledygook.  But the data they contain is &lt;b&gt;mine&lt;/b&gt;, in the most fundamental sense!  I want to be able to read it.  Fortunately, some folks have done a fair amount of work to reverse engineer the files.  Through them, I was able to find some &lt;a href="http://sourceforge.net/apps/mediawiki/onkor/index.php?title=Respironics_File_Formats"&gt;guidance&lt;/a&gt; for data from a related machine.  Now I know what's in the file, more or less: a header of 25 bytes, 1200 bytes of data-- representing 4 minutes of recording--, and a one-byte footer, repeated ad nauseam (ad somnum? somno contingit?).&lt;br /&gt;&lt;br /&gt;Today we show how to read bytes stored in a file as signed integers.  For this file, (&lt;a href="https://docs.google.com/open?id=0B_JKgZUzMYLYX2RYNEpYakJ0OVk"&gt;download&lt;/a&gt;,) we also trim out the header and footer, and make a simple line plot of the recorded data, which appear to be some function of the variable pressure with which the CPAP machine outputs air.  Next time we'll make a more useful plot.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;In SAS, we can use the &lt;tt&gt;infile&lt;/tt&gt; statement to read in the data (section 1.1.2).  &lt;br /&gt;&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data test;&lt;br /&gt;infile "c:\ken\cpap\0000000007.005"  recfm=n;&lt;br /&gt;input byte ib1. @@;&lt;br /&gt;run;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The &lt;tt&gt;recfm=n&lt;/tt&gt; option tells SAS (for Windows, may differ in other OS) to read the file in binary.  The &lt;tt&gt;ib1.&lt;/tt&gt; informat tells SAS to read the bytes in the native format.  (We cover reading in various formats in section 1.1.3, A.6.4, and several examples.)  
The &lt;tt&gt;@@&lt;/tt&gt; tells SAS to hold this line of input, rather than skipping to a new line, when the data is read.   (See &lt;a href="http://sas-and-r.blogspot.com/2010/07/example-81-digits-of-pi.html"&gt;Example 8.1&lt;/a&gt;.)  SAS will read bytes until there are no more to read.&lt;br /&gt;&lt;br /&gt;Now I have a file with 128,680 observations, each being a signed integer.  Some of these are actually nonsense, since the header and footer contain data stored in a variety of formats.  To get rid of the header, we'll use the &lt;tt&gt;_n_&lt;/tt&gt; implied variable (section 1.4.15) which is effectively the line number, in conjunction with the &lt;tt&gt;mod&lt;/tt&gt; function.  While we're processing the data set anyhow, we'll also figure out the total elapsed time, which will be useful for plotting. &lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data cycles;&lt;br /&gt;set test;&lt;br /&gt;if mod(_n_,1226) ge 25 and mod(_n_,1226) lt 1225;&lt;br /&gt;   /* otherwise it's a header or the footer */&lt;br /&gt;time_min = (4 * int(_n_/1226)) +  (mod(_n_, 1226) - 24 )/(300);&lt;br /&gt;   /* 4 minutes for each header-data-footer block + &lt;br /&gt;      number of measurements in this data block / 300 &lt;br /&gt;      (measurements per minute).&lt;br /&gt;run;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Now it's easy to plot the data-- a simple connected line plot across time makes sense, and can be made using the &lt;tt&gt;symbol&lt;/tt&gt; statement with the &lt;tt&gt;i=j&lt;/tt&gt; (j for join) syntax.  The result is shown above.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;symbol i = j v = none c = black;&lt;br /&gt;proc gplot data = cycles;&lt;br /&gt;where time_min le 4;&lt;br /&gt;plot byte * time_min;&lt;br /&gt;run;&lt;br /&gt;quit;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;In R, we'll use the &lt;tt&gt;readBin()&lt;/tt&gt; function to actually read the file, but we need to do a little prep, first.  
The &lt;tt&gt;readBin()&lt;/tt&gt; function requires we input the number of data elements to read.  An overestimate is OK, but we can easily find the exact length of the file using the &lt;tt&gt;file.info&lt;/tt&gt; function; the resulting object has a &lt;tt&gt;size&lt;/tt&gt; constituent with the number of bytes.  We'll also need a "connection" to the file, which is established in a call to the &lt;tt&gt;file()&lt;/tt&gt; function.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;finfo = file.info("0000000007.005")&lt;br /&gt;toread= file("0000000007.005", "rb")&lt;br /&gt;alldata = readBin(toread, integer(), size=1, n = finfo$size, endian="little")&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The &lt;tt&gt;size&lt;/tt&gt; option is the length of the elements, in bytes, and the &lt;tt&gt;endian&lt;/tt&gt; option helps describe how the bytes should be read.&lt;br /&gt;&lt;br /&gt;Analogous to SAS, the &lt;tt&gt;alldata&lt;/tt&gt; vector has 128,680 integers. All that remains is to remove the headers and footers.  We'll do that by making a logical test with the &lt;tt&gt;%%&lt;/tt&gt; operator, saving the result as a vector, and selecting out the data from among the headers and footers using this logical.  All that then remains is to plot the data-- we replicate the SAS plot of the first 1200 observations (4 minutes).&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;keep = 1:finfo$size %% 1226 &gt; 24 &amp; 1:finfo$size %% 1226 &lt; 1225&lt;br /&gt;cycles = alldata[keep]&lt;br /&gt;# cycles gets only the elements of alldata when the corresponding&lt;br /&gt;#    element of keep is TRUE&lt;br /&gt;&lt;br /&gt;plot(1:1200, cycles[1:1200], type = "l")&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The result is shown below.  
The first four minutes of my night's sleep were apparently characterized by generally lengthening breaths that became increasingly shallow.&lt;br /&gt;&lt;a href="http://1.bp.blogspot.com/-fHhg811Mewc/UEeMZUYWKMI/AAAAAAAADpE/tT8L-whp9UA/s1600/simple%2Bplot.jpeg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://1.bp.blogspot.com/-fHhg811Mewc/UEeMZUYWKMI/AAAAAAAADpE/tT8L-whp9UA/s1600/simple%2Bplot.jpeg" border="0" alt=""id="BLOGGER_PHOTO_ID_5784746612976134338" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=gz86k6R2mBg:FdKq4CEDs-E:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=gz86k6R2mBg:FdKq4CEDs-E:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=gz86k6R2mBg:FdKq4CEDs-E:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=gz86k6R2mBg:FdKq4CEDs-E:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=gz86k6R2mBg:FdKq4CEDs-E:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=gz86k6R2mBg:FdKq4CEDs-E:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=gz86k6R2mBg:FdKq4CEDs-E:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=gz86k6R2mBg:FdKq4CEDs-E:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/gz86k6R2mBg" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/8116133937688074310/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=8116133937688074310" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/8116133937688074310?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/8116133937688074310?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/gz86k6R2mBg/example-101-read-file-byte-by-byte.html" title="Example 10.1: Read a file byte by byte" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-V0EABxkMM-g/UEeNGHCF-oI/AAAAAAAADpg/v8lzmzPz8AI/s72-c/simple%2Bplot%2BSAS.png" height="72" width="72" /><thr:total>1</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/09/example-101-read-file-byte-by-byte.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0ENRn4ycSp7ImA9WhJWEEk.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-8118742511071750327</id><published>2012-08-14T20:33:00.002-04:00</published><updated>2012-08-15T10:41:37.099-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-08-15T10:41:37.099-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mosaic package" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Statistical Sleuth" /><category scheme="http://www.blogger.com/atom/ns#" term="teaching statistics" /><category scheme="http://www.blogger.com/atom/ns#" term="Project MOSAIC" /><title>The Statistical Sleuth (second edition) in R</title><content type="html">&lt;a href="http://4.bp.blogspot.com/-AyOI695ry1M/UCu1FUYBmsI/AAAAAAAAAKc/BhlTyk-Sfog/s1600/Rplot01.jpeg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 231px;" src="http://4.bp.blogspot.com/-AyOI695ry1M/UCu1FUYBmsI/AAAAAAAAAKc/BhlTyk-Sfog/s400/Rplot01.jpeg" border="0" alt=""id="BLOGGER_PHOTO_ID_5776910050006244034" /&gt;&lt;/a&gt;&lt;br /&gt;For those of you who teach, or are interested in seeing an illustrated series of analyses, there is a new compendium of  files to help describe how to fit models for the extended case studies in the Second Edition of the &lt;a href="http://www.proaxis.com/~panorama/home.htm"&gt;Statistical Sleuth: A Course in Methods of Data Analysis&lt;/a&gt; (2002), the excellent text by Fred Ramsey and Dan Schafer.  If you are using this book, or would like to see straightforward ways to undertake analyses in R for intro and intermediate statistics courses, these may be of interest.&lt;br /&gt;&lt;br /&gt;These files can be found &lt;a href="http://www.math.smith.edu/~nhorton/sleuth"&gt;here&lt;/a&gt;. The site includes both formatted pdf files as well as the original knitr files which were used to generate the output. &lt;a href="http://yihui.name/knitr/"&gt;Knitr&lt;/a&gt; is an elegant, flexible and fast means to undertake reproducible analysis and dynamic report generation within R and &lt;a href="http://rstudio.org"&gt;RStudio&lt;/a&gt;.  
&lt;br /&gt;&lt;br /&gt;This work leverages efforts undertaken by &lt;a href="http://www.mosaic-web.org"&gt;Project MOSAIC&lt;/a&gt;, an NSF-funded initiative to improve the teaching of statistics, calculus, science and computing in the undergraduate curriculum. In particular, we utilize the &lt;a href="http://cran.r-project.org/web/packages/mosaic"&gt;mosaic package&lt;/a&gt;, which was written to simplify the use of R for introductory statistics courses.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wNNxPhwQNnQ:1OtnDXoscdA:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wNNxPhwQNnQ:1OtnDXoscdA:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wNNxPhwQNnQ:1OtnDXoscdA:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wNNxPhwQNnQ:1OtnDXoscdA:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wNNxPhwQNnQ:1OtnDXoscdA:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wNNxPhwQNnQ:1OtnDXoscdA:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wNNxPhwQNnQ:1OtnDXoscdA:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wNNxPhwQNnQ:1OtnDXoscdA:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/wNNxPhwQNnQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/8118742511071750327/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=8118742511071750327" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/8118742511071750327?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/8118742511071750327?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/wNNxPhwQNnQ/the-statistical-sleuth-second-edition.html" title="The Statistical Sleuth (second edition) in R" /><author><name>Nick Horton</name><uri>http://www.blogger.com/profile/00242216324355342047</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="25" height="32" src="http://1.bp.blogspot.com/_Y7yEfDrPG4o/SjJwk69Z2tI/AAAAAAAAAAM/KoYswr20tck/S220/nh2.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-AyOI695ry1M/UCu1FUYBmsI/AAAAAAAAAKc/BhlTyk-Sfog/s72-c/Rplot01.jpeg" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/08/the-statistical-sleuth-second-edition.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0cEQXozfCp7ImA9WhJQEEk.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-2679397732533698484</id><published>2012-07-23T09:10:00.002-04:00</published><updated>2012-07-23T09:10:00.484-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-23T09:10:00.484-04:00</app:edited><title>Third year wrap-up</title><content type="html">July marks the end of three years of blogging for us.  
By our count, we've posted 121 examples across the first three years.  We aim to be helpful and interesting.&lt;br /&gt;&lt;br /&gt;As always, it's hard to get a sense of our readership. At the time we wrote this, Feedburner reports about 1050 regular readers (up from 650 last year), but this (still) omits people who see us on R-bloggers or SAS Community Planet or SAS-X or statsblogs. As consumers of those aggregators, we assume there are many others who see us without subscribing directly.  Google Analytics reports over 200,000 total pageviews (up from 100,000), while Feedburner claims 525,000, (up from 250,000). &lt;br /&gt;&lt;br /&gt;As in previous years (&lt;a href="http://sas-and-r.blogspot.com/2010/06/second-year-of-entries.html"&gt;2010&lt;/a&gt;, &lt;a href="http://sas-and-r.blogspot.com/2011/07/third-year-of-entries.html"&gt;2011&lt;/a&gt;) we report here on our most popular entries:&lt;br /&gt;&lt;br /&gt;Feedburner&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2012/02/rstudio-in-cloud-for-dummies.html"&gt;RStudio in the cloud, for dummies&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2011/05/to-attach-or-not-attach-that-is.html"&gt;To attach() or not attach(): that is the question&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2011/12/example-917-much-better-pairs-plots.html"&gt;Example 9.17: (much) better pairs plot&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2011/07/really-useful-r-package-sas7bdat.html"&gt;Really useful R package: sas7bdat&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2011/05/example-837-read-sheets-from-excel-file.html"&gt;Example 8.37: Read sheets from Excel&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2012/02/example-920-visualizing-simpsons.html"&gt;Example 9.20: Visualizing Simpson's paradox&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Blogger&lt;br /&gt;&lt;a 
href="http://sas-and-r.blogspot.com/2010/05/example-735-propensity-score-matchingn.html"&gt;Example 7.35: Propensity score matching&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2010/09/example-87-hosmer-and-lemeshow-goodness.html"&gt;Example 8.7: Hosmer and Lemeshow goodness-of-fit&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2010/03/example-730-simulate-censored-survival.html"&gt;Example 7.30: Simulate censored survival data&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2010/05/example-738-kaplan-meier-survival.html"&gt;Example 7.38: Kaplan-Meier survival estimates&lt;/a&gt;&lt;br /&gt;&lt;a href="http://sas-and-r.blogspot.com/2009/06/example-72-simulate-data-from-logistic.html"&gt;Example 7.2: Simulate data from a logistic regression&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;No overlap at all!  This points to the difficulty of knowing what kinds of things we do may be useful to you, our readers.  So, as usual, any feedback or suggestions would be most welcome.&lt;br /&gt;&lt;br /&gt;In previous years we've slavishly turned over into a new set of numbered entries on our anniversary (July 1) and then taken a hiatus in August.  This year we're going to rationalize and both conclude the chapter and take our break with this entry.  We'll be back with example 10.1 in September.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;pre&gt;&lt;/pre&gt;&lt;b&gt;R&lt;/b&gt;&lt;pre&gt;&lt;/pre&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  
&lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=P9vagqegtrg:RfWVwnyNX1M:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=P9vagqegtrg:RfWVwnyNX1M:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=P9vagqegtrg:RfWVwnyNX1M:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=P9vagqegtrg:RfWVwnyNX1M:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=P9vagqegtrg:RfWVwnyNX1M:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=P9vagqegtrg:RfWVwnyNX1M:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=P9vagqegtrg:RfWVwnyNX1M:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=P9vagqegtrg:RfWVwnyNX1M:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/P9vagqegtrg" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/2679397732533698484/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=2679397732533698484" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2679397732533698484?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2679397732533698484?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/P9vagqegtrg/third-year-wrap-up.html" title="Third year wrap-up" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/07/third-year-wrap-up.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkYASX46fCp7ImA9WhJRFk8.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-1799902229323852905</id><published>2012-07-16T13:17:00.014-04:00</published><updated>2012-07-18T10:02:28.014-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-18T10:02:28.014-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mosaic package" /><category scheme="http://www.blogger.com/atom/ns#" term="grid.polyline() function" /><category scheme="http://www.blogger.com/atom/ns#" term="graphics" /><category scheme="http://www.blogger.com/atom/ns#" term="confidence intervals" /><category 
scheme="http://www.blogger.com/atom/ns#" term="mapply() function" /><category scheme="http://www.blogger.com/atom/ns#" term="grid.text() function" /><category scheme="http://www.blogger.com/atom/ns#" term="barchart() function" /><category scheme="http://www.blogger.com/atom/ns#" term="lattice library" /><title>Example 9.38: dynamite plots, revisited</title><content type="html">&lt;a href="http://4.bp.blogspot.com/-wRVwvhKiJL0/T-idN9WOPvI/AAAAAAAAAJo/Z_Vh_NM7MlA/s1600/dynamite.jpeg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 231px;" src="http://4.bp.blogspot.com/-wRVwvhKiJL0/T-idN9WOPvI/AAAAAAAAAJo/Z_Vh_NM7MlA/s400/dynamite.jpeg" border="0" alt="" id="BLOGGER_PHOTO_ID_5758024986724810482" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;a href="http://emdbolker.wikidot.com/blog:dynamite"&gt;Dynamite plots&lt;/a&gt; are a somewhat pejorative term for a graphical display where the height of a bar indicates the mean, and the vertical line on top of it represents the standard deviation (or standard error). These displays are commonly found in many scientific disciplines, as a way of communicating group differences in means. &lt;br /&gt;&lt;br /&gt;Many find these displays troubling.  One post entitled them &lt;a href="http://emdbolker.wikidot.com/blog%3Adynamite"&gt;unmitigated evil&lt;/a&gt;.  &lt;br /&gt;The Vanderbilt University Department of Biostatistics has a formal &lt;a href="http://biostat.mc.vanderbilt.edu/wiki/Main/StatisticalPolicy"&gt;policy&lt;/a&gt; discouraging use of these plots, stating that:&lt;br /&gt;&lt;i&gt;&lt;br /&gt;Dynamite plots often hide important information. This is particularly true of small or skewed data sets. Researchers are highly discouraged from using them, and department members have the option to decline participation in papers in which the lead author requires the use of these plots. 
&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;Despite the limitations of the display, we believe that there may be times when the display is helpful as a way to compare groups, assuming distributions that are approximately normal.   &lt;a href="http://www.r-bloggers.com/dynamite-plots-in-r/"&gt;Samuel Brown&lt;/a&gt; also described creation of these figures, as a way to encourage computing in R.  We &lt;a href="http://sas-and-r.blogspot.com/2011/11/example-915-bar-chart-with-error-bars.html"&gt;previously&lt;/a&gt; demonstrated how to create them in SAS and R, and today discuss code created by &lt;a href="http://www.calvin.edu/~rpruim/"&gt;Randall Pruim&lt;/a&gt; to demonstrate how such graphics can be created using &lt;a href="http://stat.ethz.ch/R-manual/R-devel/library/lattice/html/Lattice.html"&gt;lattice&lt;/a&gt; graphics within the &lt;tt&gt;mosaic&lt;/tt&gt; package.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;pre&gt;&lt;br /&gt;library(mosaic)&lt;br /&gt;dynamitePlot &lt;- function(height, error, names = as.character(1:length(height)), &lt;br /&gt;                         significance = NA, ylim = c(0, maxLim), ...) {&lt;br /&gt;  if (missing(error)) { error = 0 }&lt;br /&gt;  maxLim &lt;- 1.2* max(mapply(sum, height, error))&lt;br /&gt;  mError &lt;- min(c(error, na.rm=TRUE))&lt;br /&gt;  barchart(height ~ names, ylim=ylim, panel=function(x,y,...) 
{&lt;br /&gt;    panel.barchart(x,y,...)&lt;br /&gt;    grid.polyline(c(x,x), c(y, y+error), id=rep(x,2), default.units='native',&lt;br /&gt;      arrow=arrow(angle=45, length=unit(mError, 'native'))) &lt;br /&gt;    grid.polyline(c(x,x), c(y, y-error), id=rep(x,2), default.units='native',&lt;br /&gt;      arrow=arrow(angle=45, length=unit(mError, 'native'))) &lt;br /&gt;    grid.text(x=x, y=y + error + .05*maxLim, label=significance, &lt;br /&gt;      default.units='native')&lt;br /&gt;  }, ...)&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Much of the code involves setting up the appropriate axis limits, then drawing the lines and adding the text.  We can call this new function with an artificial example with 4 groups:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;Values &lt;- c(1,2,5,4)&lt;br /&gt;Errors &lt;- c(0.25, 0.5, 0.33, 0.12)&lt;br /&gt;Names &lt;- paste("Trial", 1:4)&lt;br /&gt;Sig &lt;- c("a", "a", "b", "b")&lt;br /&gt;dynamitePlot(Values, Errors, names=Names, significance=Sig)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;We still don't recommend frequent use of these plots (as other displays may be better (e.g. dotplots for very small sample sizes or &lt;a href="http://sas-and-r.blogspot.com/2010/10/example-811-violin-plots.html"&gt;violin plots&lt;/a&gt;), but having the capability to generate dynamite plots within the lattice framework can be handy.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  
If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pSP590VAHXs:upKhFIg0cWI:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pSP590VAHXs:upKhFIg0cWI:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pSP590VAHXs:upKhFIg0cWI:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pSP590VAHXs:upKhFIg0cWI:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pSP590VAHXs:upKhFIg0cWI:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=pSP590VAHXs:upKhFIg0cWI:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=pSP590VAHXs:upKhFIg0cWI:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=pSP590VAHXs:upKhFIg0cWI:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/pSP590VAHXs" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/1799902229323852905/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=1799902229323852905" title="4 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/1799902229323852905?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/1799902229323852905?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/pSP590VAHXs/example-938-dynamite-plots-revisited.html" title="Example 9.38: dynamite plots, revisited" /><author><name>Nick Horton</name><uri>http://www.blogger.com/profile/00242216324355342047</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="25" height="32" src="http://1.bp.blogspot.com/_Y7yEfDrPG4o/SjJwk69Z2tI/AAAAAAAAAAM/KoYswr20tck/S220/nh2.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-wRVwvhKiJL0/T-idN9WOPvI/AAAAAAAAAJo/Z_Vh_NM7MlA/s72-c/dynamite.jpeg" height="72" width="72" /><thr:total>4</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/07/example-938-dynamite-plots-revisited.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CUMAQX87cSp7ImA9WhJSGEg.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-4396818769536960584</id><published>2012-07-09T13:04:00.028-04:00</published><updated>2012-07-09T13:04:00.109-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-09T13:04:00.109-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="binom.test()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="qt()" /><category scheme="http://www.blogger.com/atom/ns#" term="%sysevalf" /><category scheme="http://www.blogger.com/atom/ns#" term="pdf function" /><category scheme="http://www.blogger.com/atom/ns#" term="Clopper-Pearson CI" /><category scheme="http://www.blogger.com/atom/ns#" term="quantile function" /><category scheme="http://www.blogger.com/atom/ns#" term="proc gplot" /><category scheme="http://www.blogger.com/atom/ns#" term="prop.test()" /><category scheme="http://www.blogger.com/atom/ns#" term="coverage probabilities" /><category scheme="http://www.blogger.com/atom/ns#" term="Wilson estimator" /><category scheme="http://www.blogger.com/atom/ns#" term="Plus 4 estimator" /><category scheme="http://www.blogger.com/atom/ns#" term="mode=include" /><title>Example 9.37: (Mis)behavior of binomial confidence intervals</title><content type="html">&lt;a href="http://4.bp.blogspot.com/-RmyBhgCuTqA/T_mxwWI_rdI/AAAAAAAAAKI/vVwUI87sOjs/s1600/Rplot.jpeg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 231px;" src="http://4.bp.blogspot.com/-RmyBhgCuTqA/T_mxwWI_rdI/AAAAAAAAAKI/vVwUI87sOjs/s400/Rplot.jpeg" border="0" alt="" id="BLOGGER_PHOTO_ID_5762832642332405202" /&gt;&lt;/a&gt;&lt;br /&gt;While traditional statistics courses teach students to calculate intervals and test for &lt;a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval"&gt;binomial proportions&lt;/a&gt; using a normal or t approximation, this method does not always work well.  &lt;a href="http://www.stat.ufl.edu/~aa/"&gt;Agresti&lt;/a&gt; and &lt;a href="http://www.hsph.harvard.edu/faculty/brent-coull/publications/"&gt;Coull&lt;/a&gt; ("Approximate is better than 'exact' for interval estimation of binomial proportions". 
&lt;i&gt;The American Statistician&lt;/i&gt;, 1998; 52:119-126) demonstrated this and reintroduced an improved (Plus4) estimator originally due to Wilson (1927).  &lt;br /&gt;&lt;br /&gt;In this entry, we demonstrate how the coverage varies as a function of the underlying probability and compare four intervals: (1) t interval, (2) Clopper-Pearson, (3) Plus4 (Wilson/Agresti/Coull) and (4) Score, using code contributed by &lt;a href="http://people.reed.edu/~jones/"&gt;Albyn Jones&lt;/a&gt; from &lt;a href="http://www.reed.edu"&gt;Reed College&lt;/a&gt;.  Here, coverage probability is defined as the expected value that a CI based on an observed value will cover the true binomial parameter that generated that value.  The code calculates the coverage probability as a function of a given binomial probability &lt;tt&gt;p&lt;/tt&gt; and a sample size &lt;tt&gt;n&lt;/tt&gt;.  Intervals are created for each of the possible outcomes from 0, ..., n, then checked to see if the intervals include the true value.  Finally, the sum of the probabilities of observing outcomes in which the binomial parameter is included in the interval determines the exact coverage.  Note several distinct probabilities: (1) binomial parameter "p", the probability of success on a trial; (2) probability of observing x successes in N trials, P(X=x); (3) coverage probability as defined above.  
For distribution quantiles and probabilities, see section 1.10 and table 1.1.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;pre&gt;&lt;/pre&gt;&lt;br /&gt;We begin by defining the support functions which will be used to calculate the coverage probabilities for a specific probability and sample size.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;CICoverage = function(n, p, alpha=.05) {&lt;br /&gt;  # set up a table to hold the results&lt;br /&gt;  Cover = matrix(0, nrow=n+1, ncol=5)&lt;br /&gt;  colnames(Cover)=c("X","ClopperPearson","Plus4","Score","T")&lt;br /&gt;  Cover[,1]=0:n&lt;br /&gt;  zq = qnorm(1-alpha/2)&lt;br /&gt;  tq = qt(1-alpha/2,n-1)&lt;br /&gt;  for (i in 0:n) {&lt;br /&gt;    Phat = i/n&lt;br /&gt;    P4 = (i+2)/(n+4)&lt;br /&gt;    # Calculate T and plus4 intervals manually, &lt;br /&gt;    # use canned functions for the other &lt;br /&gt;    TInt = Phat + c(-1,1)*tq*sqrt(Phat*(1-Phat)/n)&lt;br /&gt;    P4Int = P4 + c(-1,1)*zq*sqrt(P4*(1-P4)/(n+4))&lt;br /&gt;    CPInt= binom.test(i,n)$conf.int&lt;br /&gt;    SInt = prop.test(i,n)$conf.int&lt;br /&gt;    # check to see if the binomial p is in each CI &lt;br /&gt;    Cover[i+1,2] = InInt(p, CPInt)&lt;br /&gt;    Cover[i+1,3] = InInt(p, P4Int)&lt;br /&gt;    Cover[i+1,4] = InInt(p, SInt)&lt;br /&gt;    Cover[i+1,5] = InInt(p, TInt)&lt;br /&gt;  }&lt;br /&gt;  # probability that X=x &lt;br /&gt;  p = dbinom(0:n, n, p)&lt;br /&gt;  ProbCover=rep(0, 4)&lt;br /&gt;  names(ProbCover) = c("ClopperPearson", "Plus4", "Score", "T")&lt;br /&gt;  # sum P(X=x) * I(p in CI from x)&lt;br /&gt;  for (i in 1:4){&lt;br /&gt;    ProbCover[i] = sum(p*Cover[,i+1])&lt;br /&gt;  }&lt;br /&gt;  list(n=n, p=p, Cover=Cover, PC=ProbCover)&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;In addition, we define a function to determine whether something is in the interval.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;InInt = function(p,interval){&lt;br /&gt;  interval[1] &lt;= p &amp;&amp; interval[2] &gt;= p&lt;br /&gt;}&lt;br 
/&gt;&lt;/pre&gt;&lt;br /&gt;Finally, there's a function which summarizes the results.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;CISummary = function(n, p) {&lt;br /&gt;  M = matrix(0,nrow=length(n)*length(p),ncol=6)&lt;br /&gt;  colnames(M) = c("n","p","ClopperPearson","Plus4","Score","T")&lt;br /&gt;  k=0&lt;br /&gt;  for (N in n) {&lt;br /&gt;    for (P in p) {&lt;br /&gt;      k=k+1&lt;br /&gt;      M[k,]=c(N, P, CICoverage(N, P)$PC)&lt;br /&gt;    }&lt;br /&gt;  }&lt;br /&gt;  data.frame(M)&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;We then generate the CI coverage plot provided at the start of the entry, which uses sample size n=50 across a variety of probabilities.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;lwdval = 2&lt;br /&gt;nvals = 50&lt;br /&gt;probvals = seq(.01, .30, by=.001)&lt;br /&gt;results = CISummary(nvals, probvals)&lt;br /&gt;plot(range(probvals), c(0.85, 1), type="n", xlab="true binomial p",&lt;br /&gt;     ylab="coverage probability")&lt;br /&gt;abline(h=0.95, lty=2)&lt;br /&gt;lines(results$p, results$ClopperPearson, col=1, lwd=lwdval)&lt;br /&gt;lines(results$p, results$Plus4, col=2, lwd=lwdval)&lt;br /&gt;lines(results$p, results$Score, col=3, lwd=lwdval)&lt;br /&gt;lines(results$p, results$T, col=4, lwd=lwdval)&lt;br /&gt;tests = c("ClopperPearson", "Plus4", "Score", "T")&lt;br /&gt;legend("bottomright", legend=tests,&lt;br /&gt;       col=1:4, lwd=lwdval, cex=0.70)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The resulting plot is quite interesting, and demonstrates how non-linear the coverage is for these methods, and how the t (almost equivalent to the normal, in this case) is anti-conservative in many cases.  
It also confirms the results of Agresti and Coull, who concluded that for interval estimation of a proportion, coverage probabilities are too large when inverting the standard binomial test and too small when inverting the Wald large-sample normal test, with the Plus 4 yielding coverage probabilities close to the desired, even for very small sample sizes.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;Calculating the coverage probability for a given N and binomial p can be done in a single data step, summing the probability-weighted coverage indicators over the realized values of the random variate.  Once this machinery is developed, we can call it repeatedly, using a macro, to find the results for different binomial p.  We comment on the code internally.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;%macro onep(n=,p=,alpha=.05);&lt;br /&gt;data onep;&lt;br /&gt;n = &amp;n;&lt;br /&gt;p = &amp;p;&lt;br /&gt;alpha = &amp;alpha;&lt;br /&gt;/* set up collectors of the weighted coverage indicators */&lt;br /&gt;expcontrib_t = 0;&lt;br /&gt;expcontrib_p4 = 0;&lt;br /&gt;expcontrib_s = 0;&lt;br /&gt;expcontrib_cp = 0;&lt;br /&gt;/* loop through the possible observed successes x*/&lt;br /&gt;do x = 0 to n;&lt;br /&gt;  probobs = pdf('BINOM',x,p,n);  /* probability X=x */&lt;br /&gt;  phat = x/n;&lt;br /&gt;  zquant = quantile('NORMAl', 1 - alpha/2, 0, 1);&lt;br /&gt;  p4 = (x+2)/(n + 4);&lt;br /&gt;&lt;br /&gt;  /* calculate the half-width of the t and plus4 intervals */&lt;br /&gt;  thalf = quantile('T', 1 - alpha/2,(n-1)) * sqrt(phat*(1-phat)/n);&lt;br /&gt;  p4half = zquant * sqrt(p4*(1-p4)/(n+4));&lt;br /&gt;&lt;br /&gt;  /* the score CI in R uses a Yates correction by default, and is &lt;br /&gt;     reproduced here */&lt;br /&gt;  yates = min(0.5, abs(x - (n*p)));&lt;br /&gt;  z22n = (zquant**2)/(2*n);&lt;br /&gt;  yl = phat-yates/n;&lt;br /&gt;  yu = phat+yates/n;&lt;br /&gt;  slower = (yl + z22n - zquant * sqrt( (yl*(1-yl)/n) + z22n / (2*n) )) /&lt;br /&gt;    (1 + 2 * z22n); &lt;br 
/&gt;  supper = (yu + z22n + zquant * sqrt( (yu*(1-yu)/n) + z22n / (2*n) )) /&lt;br /&gt;    (1 + 2 * z22n); &lt;br /&gt;&lt;br /&gt;  /* cover = 1 if in the CI, 0 else */&lt;br /&gt;  cover_t = ((phat - thalf) &lt; p) and ((phat + thalf) &gt; p);&lt;br /&gt;  cover_p4 = ((p4 - p4half) &lt; p) and ((p4 + p4half) &gt; p);&lt;br /&gt;  cover_s = (slower &lt; p) and (supper &gt; p);&lt;br /&gt;  /* the Clopper-Pearson interval can be easily calculated on the fly */ &lt;br /&gt;  cover_cp = (quantile('BETA', alpha/2 ,x,n-x+1) &lt; p) and &lt;br /&gt;    (quantile('BETA', 1 - alpha/2 ,x+1,n-x) &gt; p); &lt;br /&gt;&lt;br /&gt;  /* cumulate the weighted probabilities */&lt;br /&gt;  expcontrib_t = expcontrib_t + probobs * cover_t;&lt;br /&gt;  expcontrib_p4 = expcontrib_p4 + probobs * cover_p4;&lt;br /&gt;  expcontrib_s = expcontrib_s + probobs * cover_s;&lt;br /&gt;  expcontrib_cp = expcontrib_cp + probobs * cover_cp;&lt;br /&gt;  /* only save the last interation */&lt;br /&gt;  if x = N then output;&lt;br /&gt;  end;&lt;br /&gt;run;&lt;br /&gt;%mend onep;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The following macro calls the first one for a series of binomial p for a fixed N.  
Since the macro &lt;tt&gt;%do&lt;/tt&gt; loop can only iterate through integers, we have to do a little division; the &lt;tt&gt;%sysevalf&lt;/tt&gt; function will do this within the macro.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;%macro repcicov(n=, lop=, hip=, byp=, alpha= .05);&lt;br /&gt;/* need an empty data set to store the results */&lt;br /&gt;data summ; set _null_; run;&lt;br /&gt;%do stepp = %sysevalf(&amp;lop / &amp;byp, floor) %to %sysevalf(&amp;hip / &amp;byp,floor);&lt;br /&gt;  /* note that the p sent to the %onep macro is a &lt;br /&gt;                           text string like "49 * .001" */&lt;br /&gt;  %onep(n = &amp;n, p = &amp;stepp * &amp;byp, alpha = &amp;alpha);&lt;br /&gt;  /* tack on the current results to the ones finished so far */&lt;br /&gt;  /* this is a simple but inefficient way to add each binomial p into &lt;br /&gt;     the output data set */&lt;br /&gt;  data summ; set summ onep; run;&lt;br /&gt;%end;&lt;br /&gt;%mend repcicov;&lt;br /&gt;&lt;br /&gt;/* same parameters as in R */&lt;br /&gt;%repcicov(n=50, lop = .01, hip = .3, byp = .001);&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Finally, we can plot the results.  
One option shown here and not mentioned in the book is the &lt;tt&gt;mode=include&lt;/tt&gt; option to the &lt;tt&gt;symbol&lt;/tt&gt; statement, which allows the two distinct pieces of the T coverage to display correctly.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;goptions reset=all;&lt;br /&gt;legend1 label=none position=(bottom right inside)&lt;br /&gt;        mode=share across=1 frame value = (h=2);&lt;br /&gt;axis1 order = (0.85 to 1 by 0.05) minor=none &lt;br /&gt;   label = (a=90 h=2 "Coverage probability") value=(h=2);&lt;br /&gt;axis2 order = (0 to 0.3 by 0.05) minor=none &lt;br /&gt;   label = (h=2 "True binomial p") value=(h=2);&lt;br /&gt;symbol1 i = j v = none l =1 w=3 c=blue mode=include;&lt;br /&gt;symbol2 i = j v = none l =1 w=3 c=red;&lt;br /&gt;symbol3 i = j v = none l =1 w=3 c=lightgreen;&lt;br /&gt;symbol4 i = j v = none l =1 w=3 c=black;&lt;br /&gt;proc gplot data = summ;&lt;br /&gt;plot (expcontrib_t expcontrib_p4  expcontrib_s  expcontrib_cp) * p &lt;br /&gt;  / overlay legend vaxis = axis1 haxis = axis2 vref = 0.95 legend = legend1;&lt;br /&gt;label expcontrib_t = "T approximation" expcontrib_p4 = "P4 method"&lt;br /&gt;      expcontrib_s = "Score method" expcontrib_cp = "Exact (CP)";&lt;br /&gt;run; quit;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://2.bp.blogspot.com/-2OzkVp8wC6E/T_Y9mBr5BZI/AAAAAAAADos/aF2ZzvQU9m4/s1600/coverage%2BSAS.jpg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 800px; height: 516px;" src="http://2.bp.blogspot.com/-2OzkVp8wC6E/T_Y9mBr5BZI/AAAAAAAADos/aF2ZzvQU9m4/s1600/coverage%2BSAS.jpg" border="0" alt="" id="BLOGGER_PHOTO_ID_5761860496763979154" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  
&lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ieJZtph0_f0:_E4MspI2pC8:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ieJZtph0_f0:_E4MspI2pC8:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ieJZtph0_f0:_E4MspI2pC8:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ieJZtph0_f0:_E4MspI2pC8:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ieJZtph0_f0:_E4MspI2pC8:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=ieJZtph0_f0:_E4MspI2pC8:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ieJZtph0_f0:_E4MspI2pC8:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=ieJZtph0_f0:_E4MspI2pC8:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/ieJZtph0_f0" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/4396818769536960584/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=4396818769536960584" title="4 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/4396818769536960584?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/4396818769536960584?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/ieJZtph0_f0/example-937-misbehavior-of-binomial.html" title="Example 9.37: (Mis)behavior of binomial confidence intervals" /><author><name>Nick Horton</name><uri>http://www.blogger.com/profile/00242216324355342047</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="25" height="32" src="http://1.bp.blogspot.com/_Y7yEfDrPG4o/SjJwk69Z2tI/AAAAAAAAAAM/KoYswr20tck/S220/nh2.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-RmyBhgCuTqA/T_mxwWI_rdI/AAAAAAAAAKI/vVwUI87sOjs/s72-c/Rplot.jpeg" height="72" width="72" /><thr:total>4</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/07/example-937-misbehavior-of-binomial.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D04AQXg8fyp7ImA9WhJSEkk.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-2831117787343568727</id><published>2012-07-02T12:19:00.005-04:00</published><updated>2012-07-02T12:19:00.677-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-02T12:19:00.677-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="citation()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="citing R" /><category scheme="http://www.blogger.com/atom/ns#" term="citing SAS" /><title>Citing R or SAS</title><content type="html">One of us recently read a colleague's first draft of a paper, in which she had written: "All analyses were done in R 2.14.0."  We assume we're preaching to the converted here, when we say that the enormous amount of work that goes into R needs to be recognized as often as possible, and that R's creators deserve to reap some credit for their labors.  In contrast to SAS, after all, most work on R is not compensated with a paycheck.  As a reminder, the &lt;tt&gt;citation()&lt;/tt&gt; function produces the correct citation for R in general and is good to use when citing R.&lt;br /&gt;&lt;br /&gt;The project in question had used a negative binomial regression function from the MASS package, but our colleague had omitted any reference to it.  In this case, a citation would provide both credit to the authors and a useful guide to anyone wanting to replicate our approach.  It would also allow readers to consider whether changes in the package might affect the results observed.  A call to  &lt;tt&gt;citation(package="MASS")&lt;/tt&gt; will provide the preferred citation here.  (Any package name can be inserted, of course, though some authors may not have provided a full citation.)&lt;br /&gt;&lt;br /&gt;Similarly, while SAS authors are rarely identified by name and presumably get a salary from SAS, it's preferable to identify the version of the software and where it can be obtained.  In medical research this is usually done by an in-text reference.  For example: "Analyses were performed in SAS 9.3 (SAS Institute, Cary NC)."&lt;br /&gt;&lt;br /&gt;For complex analyses, it is also best to mention the SAS procedure used.  
As with the R package, this can help readers plan similar analyses, and may inform interpretation.&lt;br /&gt;&lt;br /&gt;So a multi-software analysis section might end with the following statement: "Analyses were performed in R 2.14.2 [1] using the MASS package [2] &lt;tt&gt;glm.nb()&lt;/tt&gt; function for negative binomial regression and in SAS 9.3 (SAS Institute, Cary NC) using the MCMC procedure for negative binomial mixture models."  The references to [1] and [2] would be found using the &lt;tt&gt;citation()&lt;/tt&gt; function.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=cmVO8Pe4YW8:1t6Y8_ah00g:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=cmVO8Pe4YW8:1t6Y8_ah00g:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=cmVO8Pe4YW8:1t6Y8_ah00g:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=cmVO8Pe4YW8:1t6Y8_ah00g:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=cmVO8Pe4YW8:1t6Y8_ah00g:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=cmVO8Pe4YW8:1t6Y8_ah00g:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=cmVO8Pe4YW8:1t6Y8_ah00g:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=cmVO8Pe4YW8:1t6Y8_ah00g:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/cmVO8Pe4YW8" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/2831117787343568727/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=2831117787343568727" title="3 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2831117787343568727?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2831117787343568727?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/cmVO8Pe4YW8/citing-r-or-sas.html" title="Citing R or SAS" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>3</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/07/citing-r-or-sas.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0QGQX85fyp7ImA9WhJTFk8.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-3900673155530590030</id><published>2012-06-25T09:02:00.004-04:00</published><updated>2012-06-25T09:02:00.127-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-06-25T09:02:00.127-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Levene's test" /><category scheme="http://www.blogger.com/atom/ns#" term="homoscedasticity" /><category scheme="http://www.blogger.com/atom/ns#" term="lawstat package" /><category scheme="http://www.blogger.com/atom/ns#" term="proc glm" /><category scheme="http://www.blogger.com/atom/ns#" term="levene.test() 
function" /><title>Example 9.36: Levene's test for equal variances</title><content type="html">The assumption of equal variances among the groups in analysis of variance is an expression of the assumption of homoscedasticity for linear models more generally.  For ANOVA, this assumption can be tested via &lt;a href="http://en.wikipedia.org/wiki/Levene%27s_test"&gt;Levene's test&lt;/a&gt;.  The test is a function of the residuals and means within each group, though various modifications are used, including the Brown-Forsythe test.  This uses the medians within group, rather than the mean, and is recommended when normality may be suspect.&lt;br /&gt;&lt;br /&gt;We illustrate using the HELP data set available &lt;a href="http://www.math.smith.edu/sasr/datasets.php"&gt;here&lt;/a&gt;, modeling depressive symptoms (assessed via CESD) as a function of abused substance.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;In SAS, the tests are available as an option to the &lt;tt&gt;means&lt;/tt&gt; statement in &lt;tt&gt;proc glm&lt;/tt&gt;&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data help;&lt;br /&gt;set "C:\book\help.sas7bdat";&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc glm data = help;&lt;br /&gt;class substance;&lt;br /&gt;model cesd = substance;&lt;br /&gt;means substance / hovtest=levene(type=abs) hovtest=bf;&lt;br /&gt;run;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The two requested tests are a version of Levene's test that is produced in R, below, and the aforementioned Brown-Forsythe test.  
The relevant results are shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;           Levene's Test for Homogeneity of CESD Variance&lt;br /&gt;           ANOVA of Absolute Deviations from Group Means&lt;br /&gt;&lt;br /&gt;                           Sum of        Mean&lt;br /&gt;  Source           DF     Squares      Square    F Value    Pr &gt; F&lt;br /&gt;&lt;br /&gt;  SUBSTANCE         2       272.4       136.2       2.61    0.0747&lt;br /&gt;  Error           450     23480.7     52.1793&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;     Brown and Forsythe's Test for Homogeneity of CESD Variance&lt;br /&gt;          ANOVA of Absolute Deviations from Group Medians&lt;br /&gt;&lt;br /&gt;                           Sum of        Mean&lt;br /&gt;  Source           DF     Squares      Square    F Value    Pr &gt; F&lt;br /&gt;&lt;br /&gt;  SUBSTANCE         2       266.0       133.0       2.46    0.0864&lt;br /&gt;  Error           450     24310.9     54.0243&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;There's some suggestion of a lack of homoscedasticity; it might be wise to consider methods robust to violations of this assumption.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;In R, the test can be found in the &lt;tt&gt;levene.test()&lt;/tt&gt; function in the lawstat package.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;help = read.csv("http://www.math.smith.edu/r/data/help.csv")&lt;br /&gt;library(lawstat)&lt;br /&gt;with(help, levene.test(cesd, as.factor(substance), location="mean"))&lt;br /&gt;&lt;br /&gt; classical Levene's test based on the absolute deviations from the mean &lt;br /&gt;          ( none not applied because the location is not set to median )&lt;br /&gt;&lt;br /&gt;data:  cesd &lt;br /&gt;Test Statistic = 2.6099, p-value = 0.07465&lt;br /&gt;&lt;br /&gt;with(help, levene.test(cesd, as.factor(substance),location="median"))&lt;br /&gt;&lt;br /&gt; modified robust Brown-Forsythe Levene-type test based on the absolute &lt;br /&gt;           deviations from 
the median&lt;br /&gt;&lt;br /&gt;data:  cesd &lt;br /&gt;Test Statistic = 2.462, p-value = 0.08641&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=DpK4WUnksCI:z5Q66TJ0BXs:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=DpK4WUnksCI:z5Q66TJ0BXs:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=DpK4WUnksCI:z5Q66TJ0BXs:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=DpK4WUnksCI:z5Q66TJ0BXs:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=DpK4WUnksCI:z5Q66TJ0BXs:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=DpK4WUnksCI:z5Q66TJ0BXs:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=DpK4WUnksCI:z5Q66TJ0BXs:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=DpK4WUnksCI:z5Q66TJ0BXs:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/DpK4WUnksCI" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/3900673155530590030/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=3900673155530590030" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/3900673155530590030?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/3900673155530590030?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/DpK4WUnksCI/example-936-levenes-test-for-equal.html" title="Example 9.36: Levene's test for equal variances" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/06/example-936-levenes-test-for-equal.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkECQXsyfyp7ImA9WhJTEE8.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-3220164294435446102</id><published>2012-06-18T10:11:00.005-04:00</published><updated>2012-06-18T10:11:00.597-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-06-18T10:11:00.597-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="SAS macro" /><category scheme="http://www.blogger.com/atom/ns#" term="SAS formats" /><category scheme="http://www.blogger.com/atom/ns#" term="apply()" /><category scheme="http://www.blogger.com/atom/ns#" term="sample()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="replicate()" /><category scheme="http://www.blogger.com/atom/ns#" term="proc freq" /><category scheme="http://www.blogger.com/atom/ns#" term="capture.output()" /><category scheme="http://www.blogger.com/atom/ns#" term="rand function" /><category scheme="http://www.blogger.com/atom/ns#" term="methods()" /><category scheme="http://www.blogger.com/atom/ns#" term="matrix()" /><title>Example 9.35: Discrete randomization and formatted output</title><content type="html">A colleague asked for help with randomly choosing a kid within a family.  This is for a trial in which families are recruited at well-child visits, but in each family only one of the children having a well-child visit that day can be in the study.  The idea is that after recruiting the family, the research assistant needs to choose one child, but if they make that choice themselves, the children are unlikely to be representative.  Instead, we'll allow them to make a random decision through an easily used slip that can be put into sealed envelopes.  The envisioned process is that the RA will recruit the family, determine the number of eligible children, then open the envelope to find out which child was randomly selected.&lt;br /&gt;&lt;br /&gt;One thought here would be to generate separate stacks of envelopes for each given family size, and have the research assistant open an envelope from the appropriate stack.  However, this could be logistically challenging, especially since the RAs will spend weeks away from the home office.  Instead, we'll include all plausible family sizes on each slip of paper.  It seems unlikely that more than 5 children in a family will have well-child visits on the same day.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;We'll use the SAS example to demonstrate using SAS macros to write SAS code, as well as showing a plausible use for SAS formats (section 1.4.12) and making use of &lt;tt&gt;proc print&lt;/tt&gt;. 
&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;/* the following macro will write out equal probabilities for selecting &lt;br /&gt;each integer between 1 and the argument, in the format needed for the &lt;br /&gt;rand function.  E.g., if the argument is 3, &lt;br /&gt;it will write out&lt;br /&gt;1/3,1/3,1/3&lt;br /&gt;*/&lt;br /&gt;&lt;br /&gt;%macro tbls(n);&lt;br /&gt;%do i = 1 %to &amp;n;&lt;br /&gt;1/&amp;n %if &amp;i &lt; &amp;n %then ,&lt;br /&gt;%end;&lt;br /&gt;%mend tbls;&lt;br /&gt;&lt;br /&gt;/* then we can use the %tbls macro to create the randomization&lt;br /&gt;via rand("TABLE") (section 1.10.4). */ &lt;br /&gt;data kids;&lt;br /&gt;do family = 1 to 10000;&lt;br /&gt;  nkids = 2; chosen = rand("TABLE",%tbls(2)); output;&lt;br /&gt;  nkids = 3; chosen = rand("TABLE",%tbls(3)); output;&lt;br /&gt;  nkids = 4; chosen = rand("TABLE",%tbls(4)); output;&lt;br /&gt;  nkids = 5; chosen = rand("TABLE",%tbls(5)); output;&lt;br /&gt;end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;/* check randomization */&lt;br /&gt;proc freq data = kids;&lt;br /&gt;table nkids * chosen / nocol nopercent;&lt;br /&gt;run; &lt;br /&gt;&lt;br /&gt;   nkids     chosen&lt;br /&gt;&lt;br /&gt;   Frequency|&lt;br /&gt;   Row Pct  |       1|       2|       3|       4|       5|  Total&lt;br /&gt;   ---------+--------+--------+--------+--------+--------+&lt;br /&gt;          2 |  50256 |  49744 |      0 |      0 |      0 | 100000&lt;br /&gt;            |  50.26 |  49.74 |   0.00 |   0.00 |   0.00 |&lt;br /&gt;   ---------+--------+--------+--------+--------+--------+&lt;br /&gt;          3 |  33429 |  33292 |  33279 |      0 |      0 | 100000&lt;br /&gt;            |  33.43 |  33.29 |  33.28 |   0.00 |   0.00 |&lt;br /&gt;   ---------+--------+--------+--------+--------+--------+&lt;br /&gt;          4 |  25039 |  24839 |  25245 |  24877 |      0 | 100000&lt;br /&gt;            |  25.04 |  24.84 |  25.25 |  24.88 |   0.00 |&lt;br /&gt;   ---------+--------+--------+--------+--------+--------+&lt;br 
/&gt;          5 |  19930 |  20074 |  20188 |  20036 |  19772 | 100000&lt;br /&gt;            |  19.93 |  20.07 |  20.19 |  20.04 |  19.77 |&lt;br /&gt;   ---------+--------+--------+--------+--------+--------+&lt;br /&gt;   Total      128654   127949    78712    44913    19772   400000&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Looks pretty good.  Now we need to make the output usable to the research assistants, by formatting the results into English.  We'll use the same format for each number of kids.  This saves some keystrokes now, but may possibly cause the RAs some confusion-- it means that we might refer to the "4th oldest" of 4 children, rather than the "youngest".  We could fix this using a different format for each number of children, analogous to the R version below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;proc format;&lt;br /&gt;value chosen&lt;br /&gt;1 = "oldest"&lt;br /&gt;2 = '2nd oldest'&lt;br /&gt;3 = '3rd oldest'&lt;br /&gt;4 = '4th oldest'&lt;br /&gt;5 = '5th oldest';&lt;br /&gt;run;&lt;br /&gt; &lt;br /&gt;/* now, make a text variable that concatenates (section 1.4.5) the variables &lt;br /&gt;and some explanatory text */&lt;br /&gt;data k2;&lt;br /&gt;set kids;&lt;br /&gt;if nkids eq 2 then&lt;br /&gt;  t1 = "If there are " || strip(nkids) ||" children then choose the " ||&lt;br /&gt;       strip(put(chosen,chosen.)) || " child.";&lt;br /&gt;else&lt;br /&gt;  t1 = "             " || strip(nkids) ||" ________________________ " ||&lt;br /&gt;       strip(put(chosen,chosen.));&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;/* then we print.  
Notice the options to print in plain text, shorten the &lt;br /&gt;page length and width, and remove the date and page number from the SAS output, as&lt;br /&gt;well as in the proc print statement to remove the observation number and&lt;br /&gt;show the line number, with a few other tricks */&lt;br /&gt;options nonumber nodate ps = 60 ls = 68;&lt;br /&gt;OPTIONS FORMCHAR="|----|+|---+=|-/\&lt;&gt;*";&lt;br /&gt;proc print data = k2 (obs = 3) noobs label sumlabel;&lt;br /&gt;by family;&lt;br /&gt;var t1;&lt;br /&gt;label t1 = '00'x family = "Envelope";&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;---------------------------- Envelope=1 ----------------------------&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;     If there are 2 children then choose the 2nd oldest child.&lt;br /&gt;                  3 ________________________ 3rd oldest&lt;br /&gt;                  4 ________________________ 4th oldest&lt;br /&gt;                  5 ________________________ 5th oldest&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;---------------------------- Envelope=2 ----------------------------&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;     If there are 2 children then choose the 2nd oldest child.&lt;br /&gt;                  3 ________________________ oldest&lt;br /&gt;                  4 ________________________ oldest&lt;br /&gt;                  5 ________________________ 3rd oldest&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;---------------------------- Envelope=3 ----------------------------&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;     If there are 2 children then choose the 2nd oldest child.&lt;br /&gt;                  3 ________________________ 2nd oldest&lt;br /&gt;                  4 ________________________ 3rd oldest&lt;br /&gt;                  5 ________________________ 2nd oldest&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;For R, we leave some trial code in place, to demonstrate how one might discover, test, and build R code in this setting.  
Most results have been omitted.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;sample(5, size = 1)   &lt;br /&gt;# choose a (discrete uniform) random integer between 1 and 5&lt;br /&gt;&lt;br /&gt;apply(matrix(2:5),1,sample,size=1)   &lt;br /&gt;# choose a random integer between 1 and 2, then between 1 and 3, etc., &lt;br /&gt;# using apply() to repeat the call to sample() with different maximum number&lt;br /&gt;# apply() needs a matrix or array input&lt;br /&gt;# result of this is the raw data needed for one family&lt;br /&gt;&lt;br /&gt;replicate(3,apply(matrix(2:5),1,sample,size=1))&lt;br /&gt;# replicate() is in the apply() family and just repeats the &lt;br /&gt;# function n times&lt;br /&gt;&lt;br /&gt;     [,1] [,2] [,3]&lt;br /&gt;[1,]    2    1    2&lt;br /&gt;[2,]    2    1    2&lt;br /&gt;[3,]    2    2    2&lt;br /&gt;[4,]    3    5    4&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Now we have the raw data for the envelopes.  Before formatting it for printing, let's check it to make sure it works correctly.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;test=replicate(100000, apply(matrix(2:5), 1, sample, size=1))&lt;br /&gt;apply(test, 1, summary)&lt;br /&gt;        [,1] [,2]  [,3]  [,4]&lt;br /&gt;Min.     1.0    1 1.000 1.000&lt;br /&gt;1st Qu.  1.0    1 1.000 2.000&lt;br /&gt;Median   1.0    2 2.000 3.000&lt;br /&gt;Mean     1.5    2 2.492 3.003&lt;br /&gt;3rd Qu.  2.0    3 3.000 4.000&lt;br /&gt;Max.     
2.0    3 4.000 5.000&lt;br /&gt;# this is not so helpful-- need the count or percent for each number&lt;br /&gt;# this would be the default if the data were factors, but they aren't&lt;br /&gt;# check to see if we can trick summary() into treating these integers&lt;br /&gt;# as if they were factors&lt;br /&gt;methods(summary)&lt;br /&gt;# yes, there's a summary() method for factors-- let's apply it&lt;br /&gt;# there's also apply(test,1,table) which might be better, if you remember it&lt;br /&gt;apply(test, 1, summary.factor)&lt;br /&gt;[[1]]&lt;br /&gt;    1     2 &lt;br /&gt;50025 49975 &lt;br /&gt;&lt;br /&gt;[[2]]&lt;br /&gt;    1     2     3 &lt;br /&gt;33329 33366 33305 &lt;br /&gt;&lt;br /&gt;[[3]]&lt;br /&gt;    1     2     3     4 &lt;br /&gt;25231 25134 24849 24786 &lt;br /&gt;&lt;br /&gt;[[4]]&lt;br /&gt;    1     2     3     4     5 &lt;br /&gt;19836 20068 20065 20022 20009 &lt;br /&gt;# apply(test,1,table) will give similar results, if you remember it&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Well, that's not too pretty, but it's clear that the randomization is working.  Now it's time to work on formatting the output.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;mylist=replicate(5, apply(matrix(2:5), 1, sample, size=1))&lt;br /&gt;# brief example data set&lt;br /&gt;&lt;br /&gt;# We'll need to use some formatted values (section 1.4.12), as in SAS. &lt;br /&gt;# Here, we'll make new value labels for each number of children,&lt;br /&gt;# which will make the output easier to read.  
We add in an envelope &lt;br /&gt;# number and wrap it all into a data frame.&lt;br /&gt;df = data.frame(envelope = 1:5,&lt;br /&gt;   twokids=factor(mylist[1,],1:2,labels=c("youngest","oldest")),&lt;br /&gt;  threekids=factor(mylist[2,],1:3,labels=c("youngest", "middle", "oldest")),&lt;br /&gt;  fourkids=factor(mylist[3,],1:4,labels=c("youngest", "second youngest", &lt;br /&gt;      "second oldest", "oldest")),&lt;br /&gt;  fivekids=factor(mylist[4,],1:5,labels=c("youngest", "second youngest", &lt;br /&gt;      "middle", "second oldest", "oldest"))&lt;br /&gt;)&lt;br /&gt;&lt;br /&gt;# now we need a function to take a row of the data frame and make a single slip&lt;br /&gt;# the paste() function (section 1.4.5) puts together the fixed and variable &lt;br /&gt;# content of each row, while the cat() function will print it without quotes&lt;br /&gt;slip = function(kidvec) {&lt;br /&gt;  cat(paste("------------- Envelope", kidvec[1], "------------------"))&lt;br /&gt;  cat(paste("\nIf there are", 2:5, " children, select the", kidvec[2:5],"child"))&lt;br /&gt;  cat("\n \n \n")&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;# test it on one row&lt;br /&gt;slip(df[1,])&lt;br /&gt;&lt;br /&gt;# looks good-- now we can apply() it to each row of the data frame&lt;br /&gt;apply(df, 1, slip)&lt;br /&gt;&lt;br /&gt;------------- Envelope 1 ------------------&lt;br /&gt;If there are 2  children, select the youngest child &lt;br /&gt;If there are 3  children, select the youngest child &lt;br /&gt;If there are 4  children, select the second youngest child &lt;br /&gt;If there are 5  children, select the youngest child&lt;br /&gt; &lt;br /&gt; &lt;br /&gt;------------- Envelope 2 ------------------&lt;br /&gt;If there are 2  children, select the youngest child &lt;br /&gt;If there are 3  children, select the youngest child &lt;br /&gt;If there are 4  children, select the second oldest child &lt;br /&gt;If there are 5  children, select the middle child&lt;br /&gt;&lt;br /&gt;&lt;br 
/&gt;------------- Envelope 3 ------------------&lt;br /&gt;If there are 2  children, select the youngest child &lt;br /&gt;If there are 3  children, select the youngest child &lt;br /&gt;If there are 4  children, select the youngest child &lt;br /&gt;If there are 5  children, select the second youngest child&lt;br /&gt;&lt;br /&gt;# and so forth&lt;br /&gt;&lt;br /&gt;# finally, we can save the result in a file with&lt;br /&gt;# capture.output()&lt;br /&gt;capture.output(apply(df,1,slip), file="testslip.txt")&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;&lt;br /&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=6alwbAkZIyQ:qu0ApUjyatE:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=6alwbAkZIyQ:qu0ApUjyatE:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=6alwbAkZIyQ:qu0ApUjyatE:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=6alwbAkZIyQ:qu0ApUjyatE:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=6alwbAkZIyQ:qu0ApUjyatE:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=6alwbAkZIyQ:qu0ApUjyatE:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=6alwbAkZIyQ:qu0ApUjyatE:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=6alwbAkZIyQ:qu0ApUjyatE:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/6alwbAkZIyQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/3220164294435446102/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=3220164294435446102" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/3220164294435446102?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/3220164294435446102?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/6alwbAkZIyQ/example-935-discrete-randomization-and.html" title="Example 9.35: Discrete randomization and formatted output" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/06/example-935-discrete-randomization-and.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ak4FRX04eSp7ImA9WhVbGU4.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-2523903449054764715</id><published>2012-06-05T15:59:00.003-04:00</published><updated>2012-06-05T19:28:34.331-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-06-05T19:28:34.331-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="set.seed()" /><category scheme="http://www.blogger.com/atom/ns#" term="SAS macro" /><category scheme="http://www.blogger.com/atom/ns#" term="vref optioncall symput" /><category scheme="http://www.blogger.com/atom/ns#" 
term="deparse(substitute()" /><category scheme="http://www.blogger.com/atom/ns#" term="assessing differences" /><category scheme="http://www.blogger.com/atom/ns#" term="substitute function" /><category scheme="http://www.blogger.com/atom/ns#" term="proc standard" /><category scheme="http://www.blogger.com/atom/ns#" term="paste()" /><category scheme="http://www.blogger.com/atom/ns#" term="Bland-Altman plot" /><title>Example 9.34: Bland-Altman type plot</title><content type="html">&lt;a href="http://3.bp.blogspot.com/-HVZPawe1U5U/T84-VYRxSvI/AAAAAAAADoI/yjH7N5PHFnE/s1600/baplot.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 300px;" src="http://3.bp.blogspot.com/-HVZPawe1U5U/T84-VYRxSvI/AAAAAAAADoI/yjH7N5PHFnE/s1600/baplot.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5750602311213861618" /&gt;&lt;/a&gt;&lt;br /&gt;The &lt;a href="http://en.wikipedia.org/wiki/Bland%E2%80%93Altman_plot"&gt;Bland-Altman plot&lt;/a&gt; is a visual aid for assessing differences between two ways of measuring something.  For example, one might compare two scales this way, or two devices for measuring particulate matter.  &lt;br /&gt;&lt;br /&gt;The plot simply displays the difference between the measures against their average. Rather than a statistical test, it is intended to demonstrate both typical differences between the measures and any patterns such differences may take.  The utility of the plot, as compared with linear regression or sample correlation, is that the plot is not affected by the range, while the sample correlation will typically increase with the range.  In contrast, linear regression shows the strength of the linear association but not how closely the two measures agree.  
The Bland-Altman plot allows the user to focus on differences between the measures, perhaps focusing on the clinical relevance of these differences.&lt;br /&gt;&lt;br /&gt;A peer reviewer recently asked a colleague to consider a Bland-Altman plot for two methods of assessing fatness: the familiar BMI (kg/m^2) and the actual fat mass measured by a sophisticated &lt;a href="http://en.wikipedia.org/wiki/Dual-energy_X-ray_absorptiometry"&gt;DXA&lt;/a&gt; machine.  These are obviously not measures of the same thing, so a Bland-Altman plot is not exactly appropriate.  But since the BMI is so simple to calculate and the DXA machine is so expensive, it would be nice if the BMI could be substituted for DXA fat mass.  &lt;br /&gt;&lt;br /&gt;For this purpose, we'll generate a modified Bland-Altman plot in which each measure is first standardized to have mean 0 and standard deviation 1.  The resulting plot should be assessed for pattern as usual, but typical differences must be considered on the standardized scale-- that is, differences of a unit should be considered large, and good agreement might require typical differences of 0.2 or less.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;Since this is a job we might want to repeat, we'll build a SAS macro to do it.  This will also demonstrate some useful features.  The macro accepts a data set name and the names of two variables as input.  We'll comment on interesting features in code comments.  If you're an R coder, note that SAS macro variables are merely text, not objects.  We have to manually assign "values" (i.e., numbers represented as text strings) to newly created macro variables.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;%macro baplot(datain=,x=x,y=y);&lt;br /&gt;&lt;br /&gt;/* proc standard  standardizes the variables and saves the results in the&lt;br /&gt;   same variable names in the output data set.  This means we can continue &lt;br /&gt;   using the input variable names throughout. 
*/&lt;br /&gt;proc standard data = &amp;datain out=ba_std mean=0 std=1;&lt;br /&gt;var &amp;x &amp;y;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;/* calculate differences and averages */&lt;br /&gt;data ba;&lt;br /&gt;set ba_std;&lt;br /&gt;bamean = (&amp;x + &amp;y)/2;;&lt;br /&gt;badiff = &amp;y-&amp;x;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;ods output summary=basumm;&lt;br /&gt;ods select none;&lt;br /&gt;proc means data = ba mean std;&lt;br /&gt;var badiff;&lt;br /&gt;run;&lt;br /&gt;ods select all;&lt;br /&gt;&lt;br /&gt;/* In the following, we take values calculated from a data set for the &lt;br /&gt;   confidence limits and store them in macro variables.  That's the &lt;br /&gt;   only way to use them later in code.&lt;br /&gt;   The syntax is: call symput('varname', value);&lt;br /&gt;   Note that 'bias' is purely nominal, as the standardization means that &lt;br /&gt;   the mean difference is 0. */&lt;br /&gt;data lines;&lt;br /&gt;set basumm;&lt;br /&gt;call symput('bias',badiff_mean);&lt;br /&gt;call symput('hici',badiff_mean+(1.96 * badiff_stddev));&lt;br /&gt;call symput('loci',badiff_mean-(1.96 * badiff_stddev));&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;/* We use the macro variables just created in the vref= option below;&lt;br /&gt;   vref draws reference line(s) on the vertical axis.  lvref specifies&lt;br /&gt;   a line type. */&lt;br /&gt;symbol1 i = none v = dot h = .5;&lt;br /&gt;title "Bland-Altman type plot of &amp;x and &amp;y";&lt;br /&gt;title2 "&amp;x and &amp;y standardized";&lt;br /&gt;proc gplot data=ba;&lt;br /&gt;plot badiff * bamean / vref = &amp;bias &amp;hici &amp;loci lvref=3;&lt;br /&gt;label badiff = "difference" bamean="mean";&lt;br /&gt;run;&lt;br /&gt;%mend baplot;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Here is a fake sample data set, with the plot resulting from the macro shown above.  
An analysis would suggest that, despite the correlation of 0.59 and a p-value for the linear association &lt; .0001, these two measures don't agree too well.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data fake;&lt;br /&gt;do i = 1 to 50;&lt;br /&gt;/* the "42" in the code below sets the seed for the pseudo-RNG&lt;br /&gt;   for this and later calls.  See section 1.10.9. */&lt;br /&gt;  x = normal(42);&lt;br /&gt;  y = x + normal(0);&lt;br /&gt;  output;&lt;br /&gt;  end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;%baplot(datain=fake, x=x, y=y);&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;Paralleling SAS, we'll write a small function to draw the plot, annotating within to highlight some details.  If you're primarily a SAS coder, note the syntax needed to find the name of an object submitted to a function.  In contrast, assigning values to new objects created with the function is entirely natural.  The resulting plot is shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;# set seed, for replicability&lt;br /&gt;set.seed(42)&lt;br /&gt;x = rnorm(50)&lt;br /&gt;y = x + rnorm(50) &lt;br /&gt;&lt;br /&gt;baplot = function(x,y){&lt;br /&gt;  xstd = (x - mean(x))/sd(x)&lt;br /&gt;  ystd = (y - mean(y))/sd(y)&lt;br /&gt;  &lt;br /&gt;  bamean = (xstd+ystd)/2&lt;br /&gt;  badiff = (ystd-xstd)&lt;br /&gt;  &lt;br /&gt;  plot(badiff~bamean, pch=20, xlab="mean", ylab="difference")&lt;br /&gt;# in the following, the deparse(substitute(varname)) is what retrieves the&lt;br /&gt;# name of the argument as data&lt;br /&gt;  title(main=paste("Bland-Altman plot of x and y\n",&lt;br /&gt;    deparse(substitute(x)), "and", deparse(substitute(y)),&lt;br /&gt;    "standardized"), adj=".5")&lt;br /&gt;#construct the reference lines on the fly: no need to save the values in new &lt;br /&gt;# variable names&lt;br /&gt;  abline(h = c(mean(badiff), mean(badiff)+1.96 * sd(badiff),&lt;br /&gt;    mean(badiff)-1.96 * sd(badiff)), lty=2)&lt;br /&gt;} &lt;br /&gt;&lt;br 
/&gt;baplot(x,y)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://2.bp.blogspot.com/-dNCQC1Z1piM/T85BB1LmIGI/AAAAAAAADoY/FdyUV-YDzGk/s1600/baplot_r.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 800px; height: 600px;" src="http://2.bp.blogspot.com/-dNCQC1Z1piM/T85BB1LmIGI/AAAAAAAADoY/FdyUV-YDzGk/s1600/baplot_r.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5750605273910091874" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt; We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=65rCG5dtROM:dLfJvEgkKAA:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=65rCG5dtROM:dLfJvEgkKAA:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=65rCG5dtROM:dLfJvEgkKAA:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=65rCG5dtROM:dLfJvEgkKAA:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=65rCG5dtROM:dLfJvEgkKAA:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=65rCG5dtROM:dLfJvEgkKAA:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=65rCG5dtROM:dLfJvEgkKAA:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=65rCG5dtROM:dLfJvEgkKAA:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/65rCG5dtROM" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/2523903449054764715/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=2523903449054764715" title="4 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2523903449054764715?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2523903449054764715?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/65rCG5dtROM/example-934-bland-altman-type-plot.html" title="Example 9.34: Bland-Altman type plot" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-HVZPawe1U5U/T84-VYRxSvI/AAAAAAAADoI/yjH7N5PHFnE/s72-c/baplot.png" height="72" width="72" /><thr:total>4</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/06/example-934-bland-altman-type-plot.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUMMQXwzcSp7ImA9WhVbEkQ.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-8741495080083550874</id><published>2012-05-29T09:18:00.012-04:00</published><updated>2012-05-29T09:18:00.289-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-29T09:18:00.289-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="rounding" /><category scheme="http://www.blogger.com/atom/ns#" 
term="pool()" /><category scheme="http://www.blogger.com/atom/ns#" term="mice package" /><category scheme="http://www.blogger.com/atom/ns#" term="predictive mean matching" /><category scheme="http://www.blogger.com/atom/ns#" term="mice()" /><category scheme="http://www.blogger.com/atom/ns#" term="proc mianalyze" /><category scheme="http://www.blogger.com/atom/ns#" term="proc mi" /><category scheme="http://www.blogger.com/atom/ns#" term="rand function" /><category scheme="http://www.blogger.com/atom/ns#" term="rnorm()" /><category scheme="http://www.blogger.com/atom/ns#" term="multiple imputation" /><category scheme="http://www.blogger.com/atom/ns#" term="Stuart Lipsitz" /><title>Example 9.33: Multiple imputation, rounding, and bias</title><content type="html">Nick has a &lt;a href="http://www.biostat.harvard.edu/~horton/tasround.pdf"&gt;paper&lt;/a&gt; in the American Statistician warning about bias in multiple imputation arising from rounding data imputed under a normal assumption.  One example where you might run afoul of this is if the data are truly dichotomous or count variables, but you model it as normal (either because your software is unable to model dichotomous values directly or because you prefer the theoretical soundness of multivariate normal imputation to, e.g., chained equations).  In such cases, one might impute assuming normality, then round the imputed values to plausible integers.  The paper shows theoretically the bias that can result if this process is pursued, and also that allowing the "implausible values" will eliminate the bias.  
(Of course, modeling the missing variable using a logistic regression model will be most appropriate here).&lt;br /&gt;&lt;br /&gt;In another &lt;a href="http://www.biostat.harvard.edu/~horton/tasimpute.pdf"&gt;paper&lt;/a&gt;, Nick and Stuart Lipsitz (TAS 2001) comment that the method of predictive mean matching (PMM) "ensures that imputed values are plausible, and may be more appropriate if the normality assumption is violated."  Briefly, the PMM method predicts a value from a model for both missing and observed values.  The imputation for a subject with a missing value is the observed value of the subject with the nearest predicted value (or random draw of observed values from among the subjects with the nearest predicted values).  &lt;br /&gt;&lt;br /&gt;How does this play out in practice?  Can the PMM method overcome the theoretical rounding bias while still generating only plausible imputed values?&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;We begin by simulating dichotomous data, choosing the value of &lt;tt&gt;p&lt;/tt&gt; (probability of 1) = .25, a value with a large absolute bias, according to the paper.  We set values to missing with probability 0.5, using an MCAR mechanism.  Then we use &lt;tt&gt;proc mi&lt;/tt&gt; (section 6.5, &lt;a href="http://sas-and-r.blogspot.com/2011/09/example-94-new-stuff-in-sas-93-mi-fcs.html"&gt;example 9.4&lt;/a&gt;) to impute the missing values, assuming normality.  The mean and standard error of the mean of y are calculated in &lt;tt&gt;proc summary&lt;/tt&gt; (section 2.1.1) and combined in &lt;tt&gt;proc mianalyze&lt;/tt&gt;.  Then the values are rounded manually and the analysis repeated.  Next, we impute separately with PMM.  Finally, we impute again with a logistic imputation.  
We use 5 imputations throughout, though 50 would likely be preferable.&lt;br /&gt;&lt;br /&gt;Note that a Poisson regression imputation is not yet available for &lt;tt&gt;proc mi&lt;/tt&gt;, so that the exercise is not wholly academic--if you needed to impute count values, you'd have to choose among implausible values, rounding, and PMM.  Also note our use of the &lt;tt&gt;fcs&lt;/tt&gt; imputation method, though it is not needed here with an obviously monotone missingness pattern.  Finally, note that &lt;tt&gt;proc mi&lt;/tt&gt; here requires at least two variables, for no reason we know of.  We generate a normally-distributed and uncorrelated covariate.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data testpmm;&lt;br /&gt;do i = 1 to 5000;&lt;br /&gt;  x = normal(0);&lt;br /&gt;  y = rand('BINOMIAL', .25, 1);&lt;br /&gt;  missprob = ranuni(0);&lt;br /&gt;  if missprob le .5 then y = .;&lt;br /&gt;  output;&lt;br /&gt;  end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;title "Normal imputation";&lt;br /&gt;proc mi data=testpmm out=normal nimpute=5;&lt;br /&gt;var x y;&lt;br /&gt;fcs reg;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;title2 "Implausible values";&lt;br /&gt;proc summary data = normal mean stderr;&lt;br /&gt;by _imputation_;&lt;br /&gt;var y;&lt;br /&gt;output out=outnormal mean=meany stderr=stderry;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc mianalyze data = outnormal;&lt;br /&gt;modeleffects meany;&lt;br /&gt;stderr stderry;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;title2 "Rounded";&lt;br /&gt;/* make the rounded data */&lt;br /&gt;data normalrnd;&lt;br /&gt;set normal;&lt;br /&gt;if y lt .5 then y=0;&lt;br /&gt;else y=1;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc summary data = normalrnd mean stderr;&lt;br /&gt;by _imputation_;&lt;br /&gt;var y;&lt;br /&gt;output out=outnormalrnd mean=meany stderr=stderry;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc mianalyze data = outnormalrnd;&lt;br /&gt;modeleffects meany;&lt;br /&gt;stderr stderry;&lt;br /&gt;run;&lt;br 
/&gt;&lt;br /&gt;title "regpmm imputation";&lt;br /&gt;proc mi data=testpmm out=pmm nimpute=5;&lt;br /&gt;var x y;&lt;br /&gt;fcs regpmm;&lt;br /&gt;run;&lt;br /&gt;...&lt;br /&gt;&lt;br /&gt;title "logistic imputation";&lt;br /&gt;proc mi data=testpmm out=logistic nimpute=5;&lt;br /&gt;class y;&lt;br /&gt;var x y;&lt;br /&gt;fcs logistic;&lt;br /&gt;run;&lt;br /&gt;...&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;We omit the &lt;tt&gt;summary&lt;/tt&gt; and &lt;tt&gt;mianalyze&lt;/tt&gt; procedures for the latter imputations.  Ordinarily, it would be easiest to do this kind of repetitive task with a macro, but we leave it in open code here for legibility.&lt;br /&gt;The results are shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;                          Normal imputation-- Implausible values&lt;br /&gt;&lt;br /&gt;          Parameter        Estimate      Std Error    95% Confidence Limits  &lt;br /&gt;          meany            0.249105       0.008634     0.230849     0.267362&lt;br /&gt;    &lt;br /&gt;                                Normal imputation-- Rounded &lt;br /&gt;          meany            0.265280       0.006408     0.252710     0.277850&lt;br /&gt;    &lt;br /&gt;                                       regpmm imputation      &lt;br /&gt;          meany            0.246320       0.006642     0.233204     0.259436  &lt;br /&gt;&lt;br /&gt;                                      logistic imputation     &lt;br /&gt;          meany            0.255120       0.008428     0.237449     0.272791&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;As theory suggests, rounding the normally imputed values leads to bias, while using the normal imputations does not (though it results in implausible values).  Neither PMM imputation nor direct logistic imputation appears to be biased.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;We will use the mice package written by &lt;a href="http://www.stefvanbuuren.nl/"&gt;Stef van Buuren&lt;/a&gt;, one of the key developers of chained imputation.  
Stef also has a new &lt;a href="http://www.amazon.com/gp/product/1439868247/ref=as_li_ss_tl?ie=UTF8&amp;tag=sasandrblog-20&amp;linkCode=as2&amp;camp=1789&amp;creative=390957&amp;creativeASIN=1439868247"&gt;book&lt;/a&gt; describing the package and demonstrating its use in many applied examples.  We use 5 imputations throughout, though 50 would likely be preferable.&lt;br /&gt;&lt;br /&gt;We begin by creating the data.  Note that &lt;tt&gt;mice()&lt;/tt&gt;, like &lt;tt&gt;proc mi&lt;/tt&gt;, requires at least two columns of data.  To do the logistic regression imputation, &lt;tt&gt;mice()&lt;/tt&gt; wants the missing data to be a factor, so we make a copy of the data as a data frame object as well.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;library(mice)&lt;br /&gt;n = 5000  # number of observations&lt;br /&gt;m = 5   # number of imputations (should be 25-50 in practice)&lt;br /&gt;x = rnorm(n)&lt;br /&gt;y = rbinom(n, 1, .25)   # interesting point according to Horton and Lipsitz (TAS 2004)&lt;br /&gt;unif = runif(n)&lt;br /&gt;y[unif &lt; .5] = NA    # make half of the Y's be missing&lt;br /&gt;ds = cbind(y, x)&lt;br /&gt;ds2 = data.frame(factor(y), x)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The mice package works analogously to &lt;tt&gt;proc mi&lt;/tt&gt;/&lt;tt&gt;proc mianalyze&lt;/tt&gt;.  The &lt;tt&gt;mice()&lt;/tt&gt; function performs the imputation, while the &lt;tt&gt;pool()&lt;/tt&gt; function summarizes the results across the completed data sets.  The &lt;tt&gt;method&lt;/tt&gt; option to &lt;tt&gt;mice()&lt;/tt&gt; specifies an imputation method for each column in the input object.  
Here we fit the simplest linear regression model (intercept only).&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;# normal model with implausible values&lt;br /&gt;impnorm = mice(ds, method="norm", m=m)&lt;br /&gt;summary(pool(with(impnorm, lm(y ~ 1))))&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Rounding could be done by tampering with the mids-type object that &lt;tt&gt;mice()&lt;/tt&gt; produces, but there is a more direct way to do this through the &lt;tt&gt;post=&lt;/tt&gt; option.  It accepts text strings with R commands that will be applied to the imputed values.  Here we use the &lt;tt&gt;ifelse()&lt;/tt&gt; function to make the normal values equal to 0 or 1.  The code for the predictive mean matching and logistic regression follows.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;impnormround = mice(ds, method="norm", m=m, &lt;br /&gt;   post= c("imp[[j]][,i] = ifelse(imp[[j]][,i] &lt; .5, 0, 1)",""))&lt;br /&gt;&lt;br /&gt;imppmm = mice(ds, method="pmm", m=m)&lt;br /&gt;&lt;br /&gt;implog = mice(ds2, method="logreg", m=m)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The results of &lt;tt&gt;summary(pool())&lt;/tt&gt; calls are shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;&gt; summary(pool(with(impnorm, lm(y ~ 1))))&lt;br /&gt;                 est          se     lo 95     hi 95 &lt;br /&gt;(Intercept) 0.272912 0.007008458 0.2589915 0.2868325&lt;br /&gt;&gt; summary(pool(with(impnormround, lm(y ~ 1))))&lt;br /&gt;                est         se     lo 95     hi 95 &lt;br /&gt;(Intercept) 0.28544 0.00854905 0.2676263 0.3032537&lt;br /&gt;&gt; summary(pool(with(imppmm, lm(y ~ 1))))&lt;br /&gt;                 est         se     lo 95     hi 95&lt;br /&gt;(Intercept) 0.277636 0.03180604 0.2145564 0.3407156&lt;br /&gt;&gt; summary(pool(with(implog, lm(y ~ 1))))&lt;br /&gt;                  est         se     lo 95     hi 95 &lt;br /&gt;(Intercept) 0.2652899 0.00879988 0.2480342 0.2825457&lt;br /&gt;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The message on bias is similar, though there is some hint of trouble in 
the CI for the PMM method (it seems to have a bias towards 0.5).  The default option of 3 donors may be too few (this can be tweaked by use of the &lt;tt&gt;donors = NUMBER&lt;/tt&gt; option).&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=m6jeV4DcNPw:2noytXjGh-Q:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=m6jeV4DcNPw:2noytXjGh-Q:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=m6jeV4DcNPw:2noytXjGh-Q:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=m6jeV4DcNPw:2noytXjGh-Q:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=m6jeV4DcNPw:2noytXjGh-Q:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=m6jeV4DcNPw:2noytXjGh-Q:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=m6jeV4DcNPw:2noytXjGh-Q:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=m6jeV4DcNPw:2noytXjGh-Q:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/m6jeV4DcNPw" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/8741495080083550874/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=8741495080083550874" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/8741495080083550874?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/8741495080083550874?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/m6jeV4DcNPw/example-933-multiple-imputation.html" title="Example 9.33: Multiple imputation, rounding, and bias" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/05/example-933-multiple-imputation.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ck4CQX86cSp7ImA9WhVUFkw.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-2142577804267916159</id><published>2012-05-21T10:36:00.024-04:00</published><updated>2012-05-21T10:36:00.119-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-21T10:36:00.119-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="SAS macro" /><category scheme="http://www.blogger.com/atom/ns#" term="proc multtest" /><category scheme="http://www.blogger.com/atom/ns#" term="apply()" /><category scheme="http://www.blogger.com/atom/ns#" term="ifelse()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="simulation studies" /><category scheme="http://www.blogger.com/atom/ns#" term="p.adjust()" /><category scheme="http://www.blogger.com/atom/ns#" term="proc transpose" /><category scheme="http://www.blogger.com/atom/ns#" term="matrix()" /><title>Example 9.32: Multiple testing simulation</title><content type="html">In examples &lt;a href="http://sas-and-r.blogspot.com/2012/05/example-930-addressing-multiple.html"&gt;9.30&lt;/a&gt; and &lt;a href="http://sas-and-r.blogspot.com/2012/05/example-931-exploring-multiple-testing.html"&gt;9.31&lt;/a&gt; we explored corrections for multiple testing and then extracting p-values adjusted by the Benjamini and Hochberg (or FDR) procedure.  In this post we'll develop a simulation to explore the impact of "strong" and "weak" control of the family-wise error rate offered in multiple comparison corrections.  Loosely put, weak control procedures may fail when some of the null hypotheses are actually false, in that the remaining (true) nulls may be rejected more than the nominal proportion of times.&lt;br /&gt;&lt;br /&gt;For our simulation, we'll develop flexible code to generate some p-values from false nulls and others from true nulls.  We'll assume that the true nulls have p-values distributed uniform (0,1); the false nulls will have p-values distributed uniform with a user-determined maximum.  We'll also allow the number of tests overall and the number of false nulls to be set.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;In SAS, a macro does the job.  It accepts the user parameters described above, then generates false and true nulls for each desired simulation.  With the data created, we can use &lt;tt&gt;proc multtest&lt;/tt&gt; to apply the FDR procedure, with the &lt;tt&gt;ODS&lt;/tt&gt; system saving the results.  
Note how the &lt;tt&gt;by&lt;/tt&gt; statement allows us to replicate the analysis for each simulated set of p-values without creating a separate data set for each one.  (Also note that we do not use &lt;tt&gt;proc sort&lt;/tt&gt; before that &lt;tt&gt;by&lt;/tt&gt; statement-- this can be risky, but works fine here.)&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;%macro fdr(nsims=1, ntests = 20, nfalse=10, howfalse=.01);&lt;br /&gt;ods select none;&lt;br /&gt;data test;&lt;br /&gt;do sim = 1 to &amp;nsims;&lt;br /&gt;  do i = 1 to &amp;ntests;&lt;br /&gt;    raw_p = uniform(0) * &lt;br /&gt;      ( ((i le &amp;nfalse) * &amp;howfalse ) + ((i gt &amp;nfalse) * 1 ) );&lt;br /&gt;    output;&lt;br /&gt;  end;&lt;br /&gt;end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;ods output pvalues = __pv;&lt;br /&gt;proc multtest inpvalues=test fdr;&lt;br /&gt;by sim;&lt;br /&gt;run; &lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;With the results in hand, (still within the macro) we need to do some massaging to make the results usable.  First we'll recode the rejections (assuming a 0.05 alpha level) so that non-rejections are 0 and rejections are 1/number of tests.  That way we can just sum across the results to get the proportion of rejections.  Next, we transform the data to get each simulation in a row (section 1.5.4).  (The data output from &lt;tt&gt;proc multtest&lt;/tt&gt; has nsims*ntests rows.  After transposing, there are nsims rows.)  Finally, we can sum across the rows to get the proportion of tests rejected in each simulated family of tests.  
The results are shown in a table made with &lt;tt&gt;proc freq&lt;/tt&gt;.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data __pv1;&lt;br /&gt;set __pv;&lt;br /&gt;if falsediscoveryrate lt 0.05 then fdrprop = 1/&amp;ntests;&lt;br /&gt;else fdrprop =0;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc transpose data = __pv1 (keep =sim fdrprop) out = pvals_a;&lt;br /&gt;by sim; run;&lt;br /&gt;&lt;br /&gt;data pvals;&lt;br /&gt;set pvals_a;&lt;br /&gt;prop = sum(of col1 - col&amp;ntests);&lt;br /&gt;run;&lt;br /&gt;ods select all;&lt;br /&gt;&lt;br /&gt;proc freq data = pvals; tables prop; run;&lt;br /&gt;%mend fdr;&lt;br /&gt;&lt;br /&gt;%fdr(nsims = 1000, ntests = 20, nfalse = 10, howfalse=.001);&lt;br /&gt;&lt;br /&gt;                                      Cumulative    Cumulative&lt;br /&gt;     prop    Frequency     Percent     Frequency      Percent&lt;br /&gt;     ---------------------------------------------------------&lt;br /&gt;      0.5         758       75.80           758        75.80&lt;br /&gt;     0.55         210       21.00           968        96.80&lt;br /&gt;      0.6          27        2.70           995        99.50&lt;br /&gt;     0.65           5        0.50          1000       100.00&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;So true nulls were rejected 24% of the time, which seems like a lot.  Multiple comparison procedures with "strong" control of the familywise error rate will reject them only 5% of the time.  Building this simulation as a macro facilitates exploring the effects of the multiple comparison procedures in a variety of settings. &lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;As in example 9.31, the R code is rather simpler, though perhaps a bit opaque.  To make the p-values, we make them first for all of the tests with the false nulls, then for all of the tests with the true nulls.  The &lt;tt&gt;matrix&lt;/tt&gt; function reads these in by column, by default, meaning that the first nfalse columns get the nsims*nfalse observations.  
The &lt;tt&gt;apply&lt;/tt&gt; function generates the FDR p-values for each row of the data set.  The &lt;tt&gt;t()&lt;/tt&gt; function just transposes the resulting matrix so that we get back a row for each simulation.  As in the SAS version, we'll count each rejection as 1/ntests, and non-rejections as 0; we do this with the &lt;tt&gt;ifelse()&lt;/tt&gt; statement.  Then we sum across the simulations with another call to &lt;tt&gt; apply()&lt;/tt&gt; and show the results with a simple table.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;checkfdr = function(nsims=1, ntests=100, nfalse=0, howfalse=0.001) {&lt;br /&gt;  raw_p = matrix(c(runif(nfalse * nsims) * howfalse, &lt;br /&gt;                   runif((ntests-nfalse) * nsims)), nrow=nsims)&lt;br /&gt;  fdr = t(apply(raw_p, 1, p.adjust, "fdr"))&lt;br /&gt;  reject = ifelse(fdr&lt;.05, 1/ntests,0)&lt;br /&gt;  prop = apply(reject, 1, sum)&lt;br /&gt;  prop.table(table(prop)) &lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;&gt; checkfdr(nsims=1000, ntests=20, nfalse=10, howfalse=.001)&lt;br /&gt;prop&lt;br /&gt;  0.5  0.55   0.6  0.65 &lt;br /&gt;0.755 0.210 0.032 0.003 &lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The results are reassuringly similar to those from SAS.  In this R code, it's particularly simple to try a different test-- just replace "fdr" in the &lt;tt&gt;p.adjust()&lt;/tt&gt; call.  
Here's the result with the Hochberg test, which has strong control.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;checkhoch = function(nsims=1, ntests=100, nfalse=0, howfalse=0.001) {&lt;br /&gt;   pvals = matrix(c(runif(nfalse * nsims) * howfalse, &lt;br /&gt;                    runif((ntests-nfalse) * nsims)), nrow=nsims)&lt;br /&gt;   hochberg = t(apply(pvals, 1, p.adjust,"hochberg"))&lt;br /&gt;   reject = ifelse(hochberg&lt;.05,1/ntests,0)&lt;br /&gt;   prop = apply(reject, 1, sum)&lt;br /&gt;   prop.table(table(prop)) &lt;br /&gt;}&lt;br /&gt; &lt;br /&gt;&gt; checkhoch(nsims=1000, ntests=20, nfalse=10, howfalse=.001)&lt;br /&gt;prop&lt;br /&gt;  0.5  0.55   0.6 &lt;br /&gt;0.951 0.046 0.003&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;With this procedure one or more of the true nulls is rejected an appropriate 4.9% of the time. For the most part, we feel more comfortable using multiple testing procedures with "strong control".&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;&lt;br /&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt;, &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt;, and &lt;a href="http://www.statsblogs.com/"&gt;statsblogs&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  
In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=9Kqd9zqKbeo:zIhc2IDZL5w:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=9Kqd9zqKbeo:zIhc2IDZL5w:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=9Kqd9zqKbeo:zIhc2IDZL5w:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=9Kqd9zqKbeo:zIhc2IDZL5w:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=9Kqd9zqKbeo:zIhc2IDZL5w:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=9Kqd9zqKbeo:zIhc2IDZL5w:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=9Kqd9zqKbeo:zIhc2IDZL5w:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=9Kqd9zqKbeo:zIhc2IDZL5w:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/9Kqd9zqKbeo" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/2142577804267916159/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=2142577804267916159" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2142577804267916159?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/2142577804267916159?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/9Kqd9zqKbeo/example-932-multiple-testing-simulation.html" title="Example 9.32: Multiple testing simulation" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/05/example-932-multiple-testing-simulation.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0UHRHg_eCp7ImA9WhVUEEw.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-9035056830709752060</id><published>2012-05-14T09:51:00.011-04:00</published><updated>2012-05-14T12:00:35.640-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-14T12:00:35.640-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="multiple comparisons" /><category scheme="http://www.blogger.com/atom/ns#" term="p.adjust()" /><category scheme="http://www.blogger.com/atom/ns#" term="false discovery rate" /><category scheme="http://www.blogger.com/atom/ns#" 
term="rep()" /><category scheme="http://www.blogger.com/atom/ns#" term="Benjamini and Hochberg" /><category scheme="http://www.blogger.com/atom/ns#" term="array statement" /><category scheme="http://www.blogger.com/atom/ns#" term="do loop" /><category scheme="http://www.blogger.com/atom/ns#" term="set statement options" /><category scheme="http://www.blogger.com/atom/ns#" term="rbind()" /><title>Example 9.31: Exploring multiple testing procedures</title><content type="html">In &lt;a href="http://sas-and-r.blogspot.com/2012/05/example-930-addressing-multiple.html"&gt;example 9.30&lt;/a&gt; we explored the effects of adjusting for multiple testing using the Bonferroni and Benjamini-Hochberg (or false discovery rate, FDR) procedures.  At the time we claimed that it would probably be inappropriate to extract the adjusted p-values from the FDR method from their context.  In this entry we attempt to explain our misgivings about this practice.&lt;br /&gt;&lt;br /&gt;The FDR procedure is described in &lt;a href="http://www.math.tau.ac.il/~ybenja/MyPapers/benjamini_hochberg1995.pdf"&gt;Benjamini and Hochberg&lt;/a&gt; (JRSSB, 1995) as a "step-down" procedure.  Put simply, the procedure has the following steps:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;0. Choose the familywise alpha&lt;br /&gt;1. Rank order the unadjusted p-values&lt;br /&gt;2. Beginning with the Mth of the ordered p-values p(m), &lt;br /&gt;2a.    if p(m) &lt; alpha*(m/M), then reject all tests 1 ... m, &lt;br /&gt;2b.    if not, m = m-1&lt;br /&gt;3. Repeat steps 2a and 2b until the condition is met &lt;br /&gt;               or p(1) &gt; alpha/M&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;where M is the number of tests.  The "adjusted p-value" based on this procedure is the smallest familywise alpha under which the current test would have been rejected.  To calculate this, we can modify the routine above:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;1. Rank order the unadjusted p-values&lt;br /&gt;2. 
For ordered p-values p(m) M to 1, &lt;br /&gt;2a.    candidate ap(m) = p(m) *(M/m) &lt;br /&gt;2b.    if candidate ap(m) &gt; ap(m+1) then ap(m) = ap(m+1)&lt;br /&gt;2c.    else ap(m) = candidate ap(m)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;where ap(m) refers to the adjusted p-value corresponding to the mth ordered unadjusted p-value.  It's interesting to note that the adjusted p-value for the Mth ordered test is the same as the unadjusted p-value, while the candidate adjusted p-value for the smallest test is the Bonferroni adjusted p-value.  The primary difficulty with taking these p-values (as opposed to the test results) out of context is captured in steps 2b and 2c.  They imply that the p-value for a given test may be &lt;i&gt;lowered&lt;/i&gt; by other observed p-values in the family of tests.  It's also true that the adjusted p-value depends on the number of tests included in the family, but this seems somewhat less troubling.&lt;br /&gt;&lt;br /&gt;To examine the impact of the procedure on the adjusted p-values for the individual tests, we'll compare the candidate ap(m) from step 2a against the actual ap(m).  Our sense is that to the degree these are different, the adjusted p-value should not be extracted from the context of the observed family of tests.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;Our SAS code relies heavily on the &lt;tt&gt;array&lt;/tt&gt; statement (section 1.11.5).  We loop through the p-values from largest to smallest, calculating the candidate fdr p-value as above, before arriving at the final adjusted p-value.  To compare the values conveniently, we make a new data set with two copies of the original data set, renaming first the candidate and then the adjusted p-values to have the same names.  
The &lt;tt&gt;in =&lt;/tt&gt; data set option creates a temporary variable which identifies which data set an observation was read from; here it denotes which version of the same data set (and which set of p-values) was used.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data fdr;&lt;br /&gt;array pvals [10] pval1 - pval10 &lt;br /&gt;     (.001 .001 .001 .001 .001 .03 .035 .04 .05 .05);&lt;br /&gt;array cfdrpvals [10] cfdr1 - cfdr10;&lt;br /&gt;array fdrpvals [10] fdr1 - fdr10;&lt;br /&gt;fdrpvals[10] = pvals[10];&lt;br /&gt;do i = 9 to 1 by -1;&lt;br /&gt;  cfdrpvals[i] = pvals[i] * 10/i;&lt;br /&gt;  if cfdrpvals[i] &gt; fdrpvals[i+1] then fdrpvals[i] = fdrpvals[i+1];&lt;br /&gt;  else fdrpvals[i] = cfdrpvals[i];&lt;br /&gt;  end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;data compare;&lt;br /&gt;set fdr (in = cfdr rename=(cfdr1=c1 cfdr2=c2 cfdr3=c3 cfdr4=c4 &lt;br /&gt;           cfdr5=c5 cfdr6=c6 cfdr7=c7 cfdr8=c8 cfdr9=c9)) &lt;br /&gt;    fdr (in = fdr rename=(fdr1=c1 fdr2=c2 fdr3=c3 fdr4=c4 fdr5=c5 &lt;br /&gt;           fdr6=c6 fdr7=c7 fdr8=c8 fdr9=c9));&lt;br /&gt;if cfdr then adjustment = "Candidate fdr";&lt;br /&gt;if fdr then adjustment = "Final fdr";&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc print data = compare; var adjustment c1-c9; run;&lt;br /&gt;&lt;br /&gt;adjustment       c1    c2    c3     c4    c5     c6    c7    c8    c9&lt;br /&gt;&lt;br /&gt;Candidate fdr   0.010  .005  .0033  .0025  .002  .05   .05   .05   .055&lt;br /&gt;Final fdr       0.002  .002  .0020  .0020  .002  .05   .05   .05   .050&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;(We omit the last p-value because the adjustment does not affect it.) The result shows that for many of the tests in this family, a substantially smaller p-value is obtained with the final FDR p-value than the candidate.  To this degree, the FDR p-value is dependent on the observed values of the p-values in the tests in the family, and ought not to be removed from the context of these other tests.  
We would recommend caution in displaying the FDR p-values in such settings, given readers' propensity to use them as if they were ordinary p-values, safely adjusted for multiple testing.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;Comparison of the R and SAS code may make SAS programmers weep.  The candidate values are easily calculated, and can be presented with the final p-values in one step using the &lt;tt&gt;p.adjust()&lt;/tt&gt; function.  Three lines of code, albeit incorporating multiple functions in each line.  (And it could sensibly be done in two, calculating the candidate p-values within the &lt;tt&gt;rbind()&lt;/tt&gt; function call.)   Note especially the line calculating the candidate p-values, in which vectorization allows a for loop to be avoided in a very natural fashion.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;fakeps = c(rep(.2, 5), 6, 7, 8, 10, 10)/200&lt;br /&gt;cfdr = fakeps * 10/(1:10)&lt;br /&gt;rbind(cfdr, fdr=p.adjust(fakeps, "fdr"))[,1:9]&lt;br /&gt;&lt;br /&gt;      [,1]  [,2]   [,3]   [,4]  [,5] [,6] [,7] [,8]   [,9] [,10]&lt;br /&gt;cfdr 0.010 0.005 0.0033 0.0025 0.002 0.05 0.05 0.05 0.0556  0.05&lt;br /&gt;fdr  0.002 0.002 0.0020 0.0020 0.002 0.05 0.05 0.05 0.0500  0.05&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt; and &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  
We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=oBF-Ipn97X0:Z4MI6rmBkJQ:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=oBF-Ipn97X0:Z4MI6rmBkJQ:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=oBF-Ipn97X0:Z4MI6rmBkJQ:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=oBF-Ipn97X0:Z4MI6rmBkJQ:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=oBF-Ipn97X0:Z4MI6rmBkJQ:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=oBF-Ipn97X0:Z4MI6rmBkJQ:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=oBF-Ipn97X0:Z4MI6rmBkJQ:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=oBF-Ipn97X0:Z4MI6rmBkJQ:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/oBF-Ipn97X0" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/9035056830709752060/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=9035056830709752060" title="5 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/9035056830709752060?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/9035056830709752060?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/oBF-Ipn97X0/example-931-exploring-multiple-testing.html" title="Example 9.31: Exploring multiple testing procedures" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><thr:total>5</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/05/example-931-exploring-multiple-testing.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ak4EQX8zcCp7ImA9WhVVE0Q.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-872113636495893376</id><published>2012-05-07T09:55:00.015-04:00</published><updated>2012-05-07T09:55:00.188-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-07T09:55:00.188-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="proc multtest" /><category scheme="http://www.blogger.com/atom/ns#" term="multiple comparisons" /><category scheme="http://www.blogger.com/atom/ns#" term="p.adjust() function" /><category scheme="http://www.blogger.com/atom/ns#" 
term="false discovery rate" /><category scheme="http://www.blogger.com/atom/ns#" term="matplot()" /><category scheme="http://www.blogger.com/atom/ns#" term="Benjamini-Hochberg" /><category scheme="http://www.blogger.com/atom/ns#" term="Hochberg procedure" /><category scheme="http://www.blogger.com/atom/ns#" term="Bonferroni procedure" /><title>Example 9.30: addressing multiple comparisons</title><content type="html">&lt;a href="http://3.bp.blogspot.com/-bH02R3uu-Ok/T5gNMHdEUEI/AAAAAAAAAIs/FkW7Fdi7ges/s1600/Rplot.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 480px; height: 282px;" src="http://3.bp.blogspot.com/-bH02R3uu-Ok/T5gNMHdEUEI/AAAAAAAAAIs/FkW7Fdi7ges/s1600/Rplot.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5735348627266883650" /&gt;&lt;/a&gt;&lt;br /&gt;We've been more sensitive to accounting for &lt;a href="http://en.wikipedia.org/wiki/Multiple_comparisons"&gt;multiple comparisons&lt;/a&gt; recently, in part due to &lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/21185405"&gt;work&lt;/a&gt; that Nick and colleagues published on the topic.  &lt;br /&gt;&lt;br /&gt;In this entry, we consider results from a &lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/19752409"&gt;randomized trial&lt;/a&gt; (Kypri et al., 2009) to reduce problem drinking in Australian university students.  &lt;br /&gt;Seven outcomes were pre-specified: three designated as primary and four as secondary.  No adjustment for multiple comparisons was undertaken.  The p-values were given as 0.001, 0.001 and 0.02 for the primary outcomes and .001, .22, .59 and .87 for the secondary outcomes.  &lt;br /&gt;In this entry, we detail how to adjust for multiplicity using R and SAS.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The &lt;tt&gt;p.adjust()&lt;/tt&gt; function in R calculates a variety of different approaches for multiplicity adjustments given a vector of p-values.  
These include the Bonferroni procedure (where the alpha is divided by the number of tests or equivalently the p-value is multiplied by that number, and truncated back to 1 if the result is not a probability). Other, less conservative corrections are also included (these are Holm (1979), Hochberg (1988), Hommel (1988), Benjamini and Hochberg (1995) and Benjamini and Yekutieli (2001)).  The first four methods provide strong control for the family-wise error rate and all dominate the Bonferroni procedure.  Here we compare the results from the unadjusted, Benjamini and Hochberg &lt;tt&gt;method="BH"&lt;/tt&gt; and Bonferroni procedure for the Kypri et al. study.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;pvals = c(.001, .001, .001, .02, .22, .59, .87)&lt;br /&gt;BONF = p.adjust(pvals, "bonferroni")&lt;br /&gt;BH = p.adjust(pvals, "BH")&lt;br /&gt;res = cbind(pvals, BH=round(BH, 3), BONF=round(BONF, 3))&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;This yields the following results:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;    pvals    BH  BONF&lt;br /&gt;[1,] 0.001 0.002 0.007&lt;br /&gt;[2,] 0.001 0.002 0.007&lt;br /&gt;[3,] 0.001 0.002 0.007&lt;br /&gt;[4,] 0.020 0.035 0.140&lt;br /&gt;[5,] 0.220 0.308 1.000&lt;br /&gt;[6,] 0.590 0.688 1.000&lt;br /&gt;[7,] 0.870 0.870 1.000&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The only substantive difference between the three sets of unadjusted and adjusted p-values is seen for the 4th most significant outcome, which remains statistically significant at the alpha=0.05 level for all but the Bonferroni procedure.&lt;br /&gt;&lt;br /&gt;It is straightforward to graphically display these results (as seen above):&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;matplot(res, ylab="p-values", xlab="sorted outcomes")&lt;br /&gt;abline(h=0.05, lty=2)&lt;br /&gt;matlines(res)&lt;br /&gt;legend(1, .9, legend=c("Bonferroni", "Benjamini-Hochberg", "Unadjusted"), &lt;br /&gt;  col=c(3, 2, 1), lty=c(3, 2, 1), cex=0.7)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;It bears mentioning here that the 
Benjamini-Hochberg procedure really only makes sense in the gestalt.  That is, it would probably be incorrect to take the adjusted p-values from above and remove them from the context of the 7 tests performed here.  The correct use (as with all tests) is to pre-specify the alpha level, and reject tests with p-values that are smaller.  What &lt;tt&gt;p.adjust()&lt;/tt&gt; reports is the smallest family-wise alpha error under which each of the tests would result in a rejection of the null hypothesis.  But the nature of the Benjamini-Hochberg procedure is that this value may well depend on the other observed p-values.  We will explore this further in a later entry.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;The &lt;tt&gt;multtest&lt;/tt&gt; procedure will perform a number of multiple testing procedures.  It works with raw data for ANOVA models, and can also accept a list of p-values as shown here.  (Note that "FDR" (false discovery rate) is the name that Benjamini and Hochberg give to their procedure and that this nomenclature is used by SAS.)  Various other procedures can do some adjustment through, e.g., the &lt;tt&gt;estimate&lt;/tt&gt; statement, but &lt;tt&gt;multtest&lt;/tt&gt; is the most flexible.  
A plot similar to that created in R is shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data a;&lt;br /&gt;   input Test$ Raw_P @@;&lt;br /&gt;   datalines;&lt;br /&gt;test01  0.001    test02  0.001    test03  0.001&lt;br /&gt;test04  0.02    test05  0.22    test06  0.59&lt;br /&gt;test07  0.87&lt;br /&gt;;&lt;br /&gt;&lt;br /&gt;proc multtest inpvalues=a bon fdr plots=adjusted(unpack);&lt;br /&gt;run;&lt;br /&gt;                                                     False&lt;br /&gt;                                                 Discovery&lt;br /&gt;            Test           Raw    Bonferroni          Rate&lt;br /&gt;&lt;br /&gt;               1        0.0010        0.0070        0.0023&lt;br /&gt;               2        0.0010        0.0070        0.0023&lt;br /&gt;               3        0.0010        0.0070        0.0023&lt;br /&gt;               4        0.0200        0.1400        0.0350&lt;br /&gt;               5        0.2200        1.0000        0.3080&lt;br /&gt;               6        0.5900        1.0000        0.6883&lt;br /&gt;               7        0.8700        1.0000        0.8700&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://1.bp.blogspot.com/-g_r6CSu2T-w/T5m73WDk9jI/AAAAAAAADiw/nhWkIIxFo0k/s1600/AdjByRawRank.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 640px; height: 480px;" src="http://1.bp.blogspot.com/-g_r6CSu2T-w/T5m73WDk9jI/AAAAAAAADiw/nhWkIIxFo0k/s1600/AdjByRawRank.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5735822159920821810" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators:&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  
&lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt; and &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;pre&gt;&lt;/pre&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wgI2a6YEE0s:3gknF6w-NZA:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wgI2a6YEE0s:3gknF6w-NZA:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wgI2a6YEE0s:3gknF6w-NZA:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wgI2a6YEE0s:3gknF6w-NZA:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wgI2a6YEE0s:3gknF6w-NZA:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wgI2a6YEE0s:3gknF6w-NZA:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wgI2a6YEE0s:3gknF6w-NZA:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wgI2a6YEE0s:3gknF6w-NZA:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/wgI2a6YEE0s" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/872113636495893376/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=872113636495893376" title="3 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/872113636495893376?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/872113636495893376?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/wgI2a6YEE0s/example-930-addressing-multiple.html" title="Example 9.30: addressing multiple comparisons" /><author><name>Nick Horton</name><uri>http://www.blogger.com/profile/00242216324355342047</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="25" height="32" src="http://1.bp.blogspot.com/_Y7yEfDrPG4o/SjJwk69Z2tI/AAAAAAAAAAM/KoYswr20tck/S220/nh2.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-bH02R3uu-Ok/T5gNMHdEUEI/AAAAAAAAAIs/FkW7Fdi7ges/s72-c/Rplot.png" height="72" width="72" /><thr:total>3</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/05/example-930-addressing-multiple.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DEMAQX4yfyp7ImA9WhVWF0Q.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-5375707955074537724</id><published>2012-04-30T10:34:00.000-04:00</published><updated>2012-04-30T10:34:00.097-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-30T10:34:00.097-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Richard Heiberger" /><category scheme="http://www.blogger.com/atom/ns#" 
term="lapply() function" /><category scheme="http://www.blogger.com/atom/ns#" term="ifelse()" /><category scheme="http://www.blogger.com/atom/ns#" term="apply family of functions" /><category scheme="http://www.blogger.com/atom/ns#" term="outer() function" /><category scheme="http://www.blogger.com/atom/ns#" term="R-sig-teaching" /><category scheme="http://www.blogger.com/atom/ns#" term="R Inferno" /><category scheme="http://www.blogger.com/atom/ns#" term="array statement" /><category scheme="http://www.blogger.com/atom/ns#" term="output statement" /><category scheme="http://www.blogger.com/atom/ns#" term="looping" /><category scheme="http://www.blogger.com/atom/ns#" term="Michael Weylandt" /><title>Example 9.29: the perils of for loops</title><content type="html">A recent exchange on the &lt;a href="https://stat.ethz.ch/mailman/listinfo/r-sig-teaching"&gt;R-sig-teaching&lt;/a&gt; list featured a discussion of how best to teach new students R.  The initial post included an exercise to write a function, that given a n, will draw n rows of a triangle made up of "*", noting that for a beginner, this may require two for loops.  For example, in pseudo-code:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;for i = 1 to n&lt;br /&gt;  for j = 1 to i&lt;br /&gt;    print "*"&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Unfortunately, as several folks (including &lt;a href="http://astro.temple.edu/~rmh/"&gt;Richard M. Heiberger&lt;/a&gt; and R. Michael Weylandt) noted, for loops in general are not the best way to take full advantage of R.  In this entry, we review two solutions they proposed which fit within the R philosophy.&lt;br /&gt;&lt;br /&gt;Richard's solution uses the &lt;tt&gt;outer()&lt;/tt&gt; function to generate a 5x5 matrix of logical values indicating whether the row number is greater than or equal to the column number.  Next the &lt;tt&gt;ifelse()&lt;/tt&gt; function is used to replace &lt;tt&gt;TRUE&lt;/tt&gt; with &lt;tt&gt;*&lt;/tt&gt;.  
&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;&gt; ifelse(outer(1:5, 1:5, `&gt;=`), "*", " ")  &lt;br /&gt;     [,1] [,2] [,3] [,4] [,5]&lt;br /&gt;[1,] "*"  " "  " "  " "  " " &lt;br /&gt;[2,] "*"  "*"  " "  " "  " " &lt;br /&gt;[3,] "*"  "*"  "*"  " "  " " &lt;br /&gt;[4,] "*"  "*"  "*"  "*"  " " &lt;br /&gt;[5,] "*"  "*"  "*"  "*"  "*" &lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Michael's solution uses the &lt;tt&gt;lapply()&lt;/tt&gt; function to call a function repeatedly for different values of &lt;tt&gt;n&lt;/tt&gt;.  This returns a list rather than a matrix, but accomplishes the same task.  &lt;br /&gt;&lt;pre&gt;&lt;br /&gt;&gt; lapply(1:5, function(x) cat(rep("*", x), "\n"))&lt;br /&gt;* &lt;br /&gt;* * &lt;br /&gt;* * * &lt;br /&gt;* * * * &lt;br /&gt;* * * * * &lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;While this exercise is of little practical value, it does illustrate some important points, and provides a far more efficient as well as elegant way of accomplishing the tasks.  For those interested in more, another resource is the &lt;a href="http://www.burns-stat.com/pages/Tutor/R_inferno.pdf"&gt;R Inferno&lt;/a&gt; project of &lt;a href="http://burns-stat.com/"&gt;Patrick Burns&lt;/a&gt;.  &lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;We demonstrate a SAS data step solution mainly to call out some useful features and cautions.  
In all likelihood a &lt;tt&gt;proc iml&lt;/tt&gt; matrix-based solution would be more elegant;&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data test;&lt;br /&gt;array star [5] $ star1 - star5;&lt;br /&gt;do i = 1 to 5;&lt;br /&gt;  star[i] = "*";&lt;br /&gt;  output;&lt;br /&gt;  end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc print noobs; var star1 - star5; run;&lt;br /&gt;&lt;br /&gt;             star1    star2    star3    star4    star5&lt;br /&gt;&lt;br /&gt;               *&lt;br /&gt;               *        *&lt;br /&gt;               *        *        *&lt;br /&gt;               *        *        *        *&lt;br /&gt;               *        *        *        *        *&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;In particular, note the &lt;tt&gt;$&lt;/tt&gt; in the &lt;tt&gt;array&lt;/tt&gt; statement, which allows the variables to contain characters; by default variables created by an &lt;tt&gt;array&lt;/tt&gt; statement are numeric.  In addition, note the reference to a sequentially suffixed list of variables using the single hyphen shortcut; this would help in generalizing to n rows.  Finally, note that we were able to avoid a second &lt;tt&gt;do&lt;/tt&gt; loop (SAS' primary iterative looping syntax) mainly by luck-- the most recently generated value of a variable is saved by default.  This can cause trouble, in general, but here it keeps all the previous "*"s when moving on to the next row.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt; and &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  
If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;pre&gt;&lt;/pre&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=4YZ9UJKJmeg:qS9-_-FLhAI:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=4YZ9UJKJmeg:qS9-_-FLhAI:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=4YZ9UJKJmeg:qS9-_-FLhAI:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=4YZ9UJKJmeg:qS9-_-FLhAI:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=4YZ9UJKJmeg:qS9-_-FLhAI:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=4YZ9UJKJmeg:qS9-_-FLhAI:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=4YZ9UJKJmeg:qS9-_-FLhAI:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=4YZ9UJKJmeg:qS9-_-FLhAI:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/4YZ9UJKJmeg" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/5375707955074537724/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=5375707955074537724" title="5 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/5375707955074537724?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/5375707955074537724?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/4YZ9UJKJmeg/example-929-perils-of-for-loops.html" title="Example 9.29: the perils of for loops" /><author><name>Nick Horton</name><uri>http://www.blogger.com/profile/00242216324355342047</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="25" height="32" src="http://1.bp.blogspot.com/_Y7yEfDrPG4o/SjJwk69Z2tI/AAAAAAAAAAM/KoYswr20tck/S220/nh2.jpg" /></author><thr:total>5</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/04/example-929-perils-of-for-loops.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkMAQXkzeyp7ImA9WhVWEUQ.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-5977076940527286180</id><published>2012-04-23T10:14:00.000-04:00</published><updated>2012-04-23T10:14:00.783-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-23T10:14:00.783-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="unaggregated datasets" /><category scheme="http://www.blogger.com/atom/ns#" term="aggregated datasets" /><category scheme="http://www.blogger.com/atom/ns#" term="weight statement" /><category scheme="http://www.blogger.com/atom/ns#" term="rep()" /><category 
scheme="http://www.blogger.com/atom/ns#" term="output statement" /><category scheme="http://www.blogger.com/atom/ns#" term="expand.table()" /><title>Example 9.28: creating datasets from tables</title><content type="html">&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;There are often times when it is useful to create an individual level dataset from aggregated data (such as a table).  While this can be done using the &lt;tt&gt;expand.table()&lt;/tt&gt; function within the &lt;tt&gt;epitools&lt;/tt&gt; package, it is also straightforward to do directly within R.&lt;br /&gt;&lt;br /&gt;Imagine that instead of the individual level data, we had only the 2x2 table for the association between homeless status and gender within the HELP RCT:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;&gt; HELPrct = read.csv("http://www.math.smith.edu/r/data/help.csv")&lt;br /&gt;&gt; xtabs(~ homeless + female, data=HELPrct)&lt;br /&gt;        female&lt;br /&gt;homeless   0   1&lt;br /&gt;       0 177  67&lt;br /&gt;       1 169  40&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;We can use this information to create an analytic dataset using just the four rows of a new dataset:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;&gt; female = c(0, 1, 0, 1)&lt;br /&gt;&gt; homeless = c(1, 1, 0, 0)&lt;br /&gt;&gt; count = c(169, 40, 177, 67)&lt;br /&gt;&gt; ds=data.frame(cbind(female, homeless, count))&lt;br /&gt;&gt; ds&lt;br /&gt;  female homeless count&lt;br /&gt;1      0        1   169&lt;br /&gt;2      1        1    40&lt;br /&gt;3      0        0   177&lt;br /&gt;4      1        0    67&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Next we use the &lt;tt&gt;rep()&lt;/tt&gt; function to generate a vector of indices to repeat.  
The &lt;tt&gt;index&lt;/tt&gt; object repeats each row number &lt;tt&gt;count&lt;/tt&gt; times.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;&gt; index = rep(seq_len(nrow(ds)), times=ds$count)&lt;br /&gt;&gt; newds = ds[index,]&lt;br /&gt;&gt; newds$count = NULL&lt;br /&gt;&gt; xtabs(~ homeless + female, data=newds)&lt;br /&gt;        female&lt;br /&gt;homeless   0   1&lt;br /&gt;       0 177  67&lt;br /&gt;       1 169  40&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The resulting data set is identical to the summarized input data set.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;Many SAS procedures offer a &lt;tt&gt;weight &lt;i&gt;varname&lt;/i&gt;&lt;/tt&gt; option (as a statement within the proc) which will duplicate each observation &lt;i&gt;varname&lt;/i&gt; times.  So, for example, we can make a data set such as that shown above, then use, e.g., &lt;tt&gt;proc freq&lt;/tt&gt; to produce a table.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data ds;&lt;br /&gt;female = 0; homeless = 1; count = 169; output;&lt;br /&gt;female = 1; homeless = 1; count = 40; output;&lt;br /&gt;female = 0; homeless = 0; count = 177; output;&lt;br /&gt;female = 1; homeless = 0; count = 67; output;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc freq data = ds;&lt;br /&gt;table homeless * female;&lt;br /&gt;weight count;&lt;br /&gt;run;&lt;br /&gt;                homeless     female&lt;br /&gt;&lt;br /&gt;                Frequency|&lt;br /&gt;                Percent  |&lt;br /&gt;                Row Pct  |&lt;br /&gt;                Col Pct  |       0|       1|  Total&lt;br /&gt;                ---------+--------+--------+&lt;br /&gt;                       0 |    177 |     67 |    244&lt;br /&gt;                         |  39.07 |  14.79 |  53.86&lt;br /&gt;                         |  72.54 |  27.46 |&lt;br /&gt;                         |  51.16 |  62.62 |&lt;br /&gt;                ---------+--------+--------+&lt;br /&gt;                       1 |    169 |     40 |    209&lt;br /&gt;                      
   |  37.31 |   8.83 |  46.14&lt;br /&gt;                         |  80.86 |  19.14 |&lt;br /&gt;                         |  48.84 |  37.38 |&lt;br /&gt;                ---------+--------+--------+&lt;br /&gt;                Total         346      107      453&lt;br /&gt;                            76.38    23.62   100.00&lt;br /&gt;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;However, some procedures lack this option, and/or it may be difficult to arrange your data appropriately to take advantage of it.  In such cases, it's useful to be able to expand the data manually, as we show for R above.  We demonstrate this below, assuming the &lt;tt&gt;count&lt;/tt&gt; variable can be constructed.  The explicit &lt;tt&gt;output&lt;/tt&gt; statement puts a line into the &lt;tt&gt;newds&lt;/tt&gt; data set &lt;tt&gt;count&lt;/tt&gt; times.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;data newds;&lt;br /&gt;set ds;&lt;br /&gt;do i = 1 to count;&lt;br /&gt;  output;&lt;br /&gt;  end;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;proc freq data = newds;&lt;br /&gt;table homeless * female;&lt;br /&gt;run;&lt;br /&gt;                homeless     female&lt;br /&gt;&lt;br /&gt;                Frequency|&lt;br /&gt;                Percent  |&lt;br /&gt;                Row Pct  |&lt;br /&gt;                Col Pct  |       0|       1|  Total&lt;br /&gt;                ---------+--------+--------+&lt;br /&gt;                       0 |    177 |     67 |    244&lt;br /&gt;                         |  39.07 |  14.79 |  53.86&lt;br /&gt;                         |  72.54 |  27.46 |&lt;br /&gt;                         |  51.16 |  62.62 |&lt;br /&gt;                ---------+--------+--------+&lt;br /&gt;                       1 |    169 |     40 |    209&lt;br /&gt;                         |  37.31 |   8.83 |  46.14&lt;br /&gt;                         |  80.86 |  19.14 |&lt;br /&gt;                         |  48.84 |  37.38 |&lt;br /&gt;                ---------+--------+--------+&lt;br /&gt;                Total      
   346      107      453&lt;br /&gt;                            76.38    23.62   100.00&lt;br /&gt;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt; and &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;pre&gt;&lt;/pre&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ucHM7FsvRX8:j_JiMvLNoDU:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ucHM7FsvRX8:j_JiMvLNoDU:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ucHM7FsvRX8:j_JiMvLNoDU:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ucHM7FsvRX8:j_JiMvLNoDU:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ucHM7FsvRX8:j_JiMvLNoDU:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=ucHM7FsvRX8:j_JiMvLNoDU:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=ucHM7FsvRX8:j_JiMvLNoDU:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=ucHM7FsvRX8:j_JiMvLNoDU:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/ucHM7FsvRX8" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/5977076940527286180/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=5977076940527286180" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/5977076940527286180?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/5977076940527286180?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/ucHM7FsvRX8/example-928-creating-datasets-from.html" title="Example 9.28: creating datasets from tables" /><author><name>Nick Horton</name><uri>http://www.blogger.com/profile/00242216324355342047</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="25" height="32" src="http://1.bp.blogspot.com/_Y7yEfDrPG4o/SjJwk69Z2tI/AAAAAAAAAAM/KoYswr20tck/S220/nh2.jpg" /></author><thr:total>2</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/04/example-928-creating-datasets-from.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0AEQX87fCp7ImA9WhVXFUU.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-5848966660913069347</id><published>2012-04-16T10:15:00.001-04:00</published><updated>2012-04-16T10:15:00.104-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-16T10:15:00.104-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Stein estimator" /><category scheme="http://www.blogger.com/atom/ns#" term="text()" /><category scheme="http://www.blogger.com/atom/ns#" term="axis statement" /><category scheme="http://www.blogger.com/atom/ns#" term="baseball" /><category 
scheme="http://www.blogger.com/atom/ns#" term="matrix()" /><category scheme="http://www.blogger.com/atom/ns#" term="match() function" /><category scheme="http://www.blogger.com/atom/ns#" term="matlines()" /><category scheme="http://www.blogger.com/atom/ns#" term="sports statistics" /><category scheme="http://www.blogger.com/atom/ns#" term="pointlabel option" /><category scheme="http://www.blogger.com/atom/ns#" term="James-Stein estimator" /><category scheme="http://www.blogger.com/atom/ns#" term="matplot()" /><category scheme="http://www.blogger.com/atom/ns#" term="title statement" /><category scheme="http://www.blogger.com/atom/ns#" term="shrinkage estimator" /><title>Example 9.27: Baseball and shrinkage</title><content type="html">&lt;a href="http://2.bp.blogspot.com/-Kti93ghFh2k/T4h2MJT_llI/AAAAAAAADfs/a9-v4mY2f0M/s1600/JS%2Bplot.bmp"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 387px;" src="http://2.bp.blogspot.com/-Kti93ghFh2k/T4h2MJT_llI/AAAAAAAADfs/a9-v4mY2f0M/s1600/JS%2Bplot.bmp" border="0" alt=""id="BLOGGER_PHOTO_ID_5730960476859242066" /&gt;&lt;/a&gt;&lt;br /&gt;To celebrate the beginning of the professional baseball season here in the US and Canada, we revisit a famous example of using baseball data to demonstrate statistical properties.  &lt;br /&gt;&lt;br /&gt;In 1977, Bradley Efron and Carl Morris published a paper about the James-Stein estimator-- the shrinkage estimator that has better mean squared error than the simple average.  Their prime example was the batting averages of 18 players in the 1970 season: they considered trying to estimate the players' average over the remainder of the season, based on their first 45 at-bats.  The paper is a pleasure to read, and can be downloaded &lt;a href="http://www-stat.stanford.edu/~ckirby/brad/other/Article1977.pdf"&gt;here&lt;/a&gt;.  
The data are available &lt;a href="http://www.swarthmore.edu/NatSci/peverso1/Sports%20Data/JamesSteinData/Efron-Morris%20Baseball/EfronMorrisBB.txt"&gt;here&lt;/a&gt;, on the pages of Statistician &lt;a href="http://www.swarthmore.edu/NatSci/peverso1/"&gt;Phil Everson&lt;/a&gt;, of Swarthmore College.&lt;br /&gt;&lt;br /&gt;Today we'll review plotting the data, and intend to look at some other shrinkage estimators in a later entry.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;SAS&lt;/b&gt;&lt;br /&gt;We begin by reading in the data from Everson's page.  (Note that the long address would need to be on one line, or you could use a URL shortener like TinyURL.com.)  To read the data, we use the &lt;tt&gt;infile&lt;/tt&gt; statement to indicate a tab-delimited file and to say that the data begin in row 2.  The &lt;tt&gt;informat&lt;/tt&gt; statement helps read in the variable-length last names. &lt;br /&gt;&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;filename bb url "http://www.swarthmore.edu/NatSci/peverso1/Sports%20Data/&lt;br /&gt;    JamesSteinData/Efron-Morris%20Baseball/EfronMorrisBB.txt";&lt;br /&gt;&lt;br /&gt;data bball;&lt;br /&gt;infile bb delimiter='09'x MISSOVER DSD lrecl=32767 firstobs=2 ;&lt;br /&gt;informat firstname $7. 
lastname $10.;&lt;br /&gt;input FirstName $ LastName $ AtBats Hits BattingAverage RemainingAtBats&lt;br /&gt;   RemainingAverage SeasonAtBats SeasonHits SeasonAverage;&lt;br /&gt;run;&lt;br /&gt;&lt;br /&gt;data bballjs;&lt;br /&gt;set bball;&lt;br /&gt;js = .212 * battingaverage + .788 * .265;&lt;br /&gt;&lt;br /&gt;avg = battingaverage; time = 1; &lt;br /&gt;  if lastname not in("Scott","Williams", "Rodriguez", "Unser","Swaboda","Spencer") &lt;br /&gt;    then name = lastname; else name = ''; &lt;br /&gt;output;&lt;br /&gt;avg = seasonaverage; name = ''; time = 2; output;&lt;br /&gt;avg = js; time = 3; name = ''; output;&lt;br /&gt;run;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;In the second &lt;tt&gt;data&lt;/tt&gt; step, we calculate the James-Stein estimator according to the values reported in the paper.  Then, to facilitate plotting, we convert the data to the "long" format, with three rows for each player, using the explicit &lt;tt&gt;output&lt;/tt&gt; statement.  The average in the first 45 at-bats, the average in the remainder of the season, and the James-Stein estimator are recorded in the same variable in each of the three rows, respectively. To distinguish between the rows, we assign a different value of &lt;tt&gt;time&lt;/tt&gt;: this will be used to order the values on the graphic.  We also record the last name of (most of) the players in a new variable, but only in one of the rows.  This will be plotted in the graphic-- some players' names can't be shown without plotting over the data or other players' names.&lt;br /&gt;&lt;br /&gt;Now we can generate the plot.  Many features shown here have been demonstrated in several entries.  
We call out 1) the &lt;tt&gt;h&lt;/tt&gt; option, which increases the text size in the titles and labels, 2) the &lt;tt&gt;offset&lt;/tt&gt; option, which moves the data away from the edge of the plot frame, 3) the &lt;tt&gt;value&lt;/tt&gt; option in the &lt;tt&gt;axis&lt;/tt&gt; statement, which replaces the values of "time" with descriptive labels, and 4) the handy a*b=c syntax which replicates the plot for each player.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;title h=3 "Efron and Morris example of James-Stein estimation";&lt;br /&gt;title2 h=2 "Baseball players' 1970 performance estimated from first 45 at-bats";&lt;br /&gt;axis1 offset = (4cm,1cm) minor=none label=none&lt;br /&gt;  value = (h = 2 "Avg. of first 45" "Avg. of remainder" "J-S Estimator");&lt;br /&gt;axis2 order = (.150 to .400 by .050) minor=none offset=(0.5cm,1.5cm) &lt;br /&gt;  label = (h =2 r=90 a = 270 "Batting Average");&lt;br /&gt;symbol1 i = j v = none l = 1 c = black r = 20 w=3 &lt;br /&gt;  pointlabel = (h=2 j=l position = middle "#name");&lt;br /&gt;&lt;br /&gt;proc gplot data = bballjs;&lt;br /&gt;  plot avg * time = lastname / haxis = axis1 vaxis = axis2 nolegend;&lt;br /&gt;run; quit;&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;To read the plot (shown at the top) consider approaching the nominal true probability of a hit, as represented by the average over the remainder of the season, in the center.  If you begin on the left, you see the difference associated with using the simple average of the first 45 at-bats as the estimator.  Coming from the right, you see the difference associated with using the James-Stein shrinkage estimator.  The improvement associated with the James-Stein estimator is reflected in the generally shallower slopes when coming from the right.  
With the exception of Pirates great &lt;a href="http://en.wikipedia.org/wiki/Roberto_Clemente"&gt;Roberto Clemente&lt;/a&gt; and declining third-baseman &lt;a href="http://en.wikipedia.org/wiki/Max_Alvis"&gt;Max Alvis&lt;/a&gt;, most every line has a shallower slope from the right; James' and Stein's theoretical work proved that overall the lines must be shallower from the right.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;A similar process is undertaken within R.  Once the data are loaded, and a subset of the names are blanked out (to improve the readability of the figure), the &lt;tt&gt;matplot()&lt;/tt&gt; and &lt;tt&gt;matlines()&lt;/tt&gt; functions are used to create the lines.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;bball = read.table("http://www.swarthmore.edu/NatSci/peverso1/Sports%20Data/JamesSteinData/Efron-Morris%20Baseball/EfronMorrisBB.txt",&lt;br /&gt;                   header=TRUE, stringsAsFactors=FALSE)&lt;br /&gt;bball$js = bball$BattingAverage * .212 + .788 * (0.265)&lt;br /&gt;bball$LastName[!is.na(match(bball$LastName, &lt;br /&gt;  c("Scott","Williams", "Rodriguez", "Unser","Swaboda","Spencer")))] = ""&lt;br /&gt;&lt;br /&gt;a = matrix(rep(1:3, nrow(bball)), 3, nrow(bball))&lt;br /&gt;b = matrix(c(bball$BattingAverage, bball$SeasonAverage, bball$js), &lt;br /&gt;   3, nrow(bball), byrow=TRUE)&lt;br /&gt;matplot(a, b, pch=" ", ylab="predicted average", xaxt="n", xlim=c(0.5, 3.1), ylim=c(0.13, 0.42))&lt;br /&gt;matlines(a, b)&lt;br /&gt;text(rep(0.7, nrow(bball)), bball$BattingAverage, bball$LastName, cex=0.6)&lt;br /&gt;text(1, 0.14, "First 45\nat bats", cex=0.5)&lt;br /&gt;text(2, 0.14, "Average\nof remainder", cex=0.5)&lt;br /&gt;text(3, 0.14, "J-S\nestimator", cex=0.5)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://4.bp.blogspot.com/-IqX-LweF9Qc/T4sG46mvi3I/AAAAAAAAAIM/hhav09e1j6M/s1600/Rplot.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 387px;" 
src="http://4.bp.blogspot.com/-IqX-LweF9Qc/T4sG46mvi3I/AAAAAAAAAIM/hhav09e1j6M/s1600/Rplot.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5731682525633284978" /&gt;&lt;/a&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wk0c0s72_UM:U_ScHCR95lc:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wk0c0s72_UM:U_ScHCR95lc:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wk0c0s72_UM:U_ScHCR95lc:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wk0c0s72_UM:U_ScHCR95lc:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wk0c0s72_UM:U_ScHCR95lc:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wk0c0s72_UM:U_ScHCR95lc:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wk0c0s72_UM:U_ScHCR95lc:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wk0c0s72_UM:U_ScHCR95lc:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/wk0c0s72_UM" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/5848966660913069347/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=5848966660913069347" title="4 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/5848966660913069347?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/5848966660913069347?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/wk0c0s72_UM/example-927-baseball-and-shrinkage.html" title="Example 9.27: Baseball and shrinkage" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-Kti93ghFh2k/T4h2MJT_llI/AAAAAAAADfs/a9-v4mY2f0M/s72-c/JS%2Bplot.bmp" height="72" width="72" /><thr:total>4</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/04/example-927-baseball-and-shrinkage.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkcGRX0yfyp7ImA9WhVQGUU.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-198740438431327365</id><published>2012-04-09T10:00:00.011-04:00</published><updated>2012-04-09T10:00:24.397-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-09T10:00:24.397-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="apply()" /><category scheme="http://www.blogger.com/atom/ns#" 
term="fitted()" /><category scheme="http://www.blogger.com/atom/ns#" term="Rick Wicklin Robert Allison" /><category scheme="http://www.blogger.com/atom/ns#" term="plotrix package" /><category scheme="http://www.blogger.com/atom/ns#" term="logic" /><title>Example 9.26: More circular plotting</title><content type="html">&lt;a href="http://3.bp.blogspot.com/-W4LuMEqlRAI/T3935Uj4aSI/AAAAAAAADeY/Bdx7T9dnARw/s1600/temptrig.png"&gt;&lt;img style="display:block; margin:0px auto 10px; http://www.blogger.com/img/blank.giftext-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://3.bp.blogspot.com/-W4LuMEqlRAI/T3935Uj4aSI/AAAAAAAADeY/Bdx7T9dnARw/s1600/temptrig.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5728429077694015778" /&gt;&lt;/a&gt;&lt;br /&gt;SAS's &lt;a href="http://blogs.sas.com/content/iml/author/rickwicklin/"&gt;Rick Wicklin&lt;/a&gt; &lt;a href="http://blogs.sas.com/content/iml/2012/04/05/smoothers-for-periodic-data/"&gt;showed&lt;/a&gt; a simple loess smoother for the temperature data we showed &lt;a href="http://sas-and-r.blogspot.com/2012/04/example-925-its-been-mighty-warm-winter.html"&gt;here&lt;/a&gt;.  Then he came back with a &lt;a href="http://blogs.sas.com/content/iml/2012/04/06/creating-a-periodic-smoother/"&gt;better approach&lt;/a&gt; that does away with edge effects.  Rick's smoothing was calculated and plotted on a cartesian plane.  In this entry we'll explore another option or two for smoothing, and plot the results on the same circular plot.  
&lt;br /&gt;&lt;br /&gt;Since Rick is showing SAS code, and &lt;a href="http://robslink.com/SAS/Home.htm"&gt;Robert Allison&lt;/a&gt; has done the circular plot (&lt;a href="http://robslink.com/SAS/democd55/albany_ny_circular.htm"&gt;plot&lt;/a&gt;) (&lt;a href="http://robslink.com/SAS/democd55/albany_ny_circular.sas"&gt;code&lt;/a&gt;), we'll stick to the R again for this one.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;We'll start out by getting the data and setting it up as we did earlier.  We add the year back into the matrix &lt;tt&gt;t3old&lt;/tt&gt; because it'll be needed later.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;temp1 = read.table("http://academic.udayton.edu/kissock/http/Weather/&lt;br /&gt;           gsod95-current/NYALBANY.txt")&lt;br /&gt;leap = c(0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1)&lt;br /&gt;days = rep(365,18) + leap&lt;br /&gt;monthdays = c(31,28,31,30,31,30,31,31,30,31,30,31)&lt;br /&gt;temp1$V3 = temp1$V3 - 1994&lt;br /&gt;&lt;br /&gt;yearpart = function(daytvec,yeardays,mdays=monthdays){&lt;br /&gt;  part = (sum(mdays[1:(daytvec[1]-1)],(daytvec[1] &gt; 2) * (yeardays[daytvec[3]]==366)) &lt;br /&gt;          + daytvec[2] - ((daytvec[1] == 1)*31)) / yeardays[daytvec[3]]&lt;br /&gt;  return(part)&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;temp2 = as.matrix(temp1)&lt;br /&gt;&lt;br /&gt;radians = 2* pi * apply(temp2, 1, yearpart, days, monthdays)&lt;br /&gt;&lt;br /&gt;t3old = matrix(c(temp1$V4[temp1$V4 != -99 &amp; ((temp1$V3 &lt; 18) )],&lt;br /&gt;                 radians[temp1$V4 != -99  &amp;  ((temp1$V3 &lt; 18) )],&lt;br /&gt;                 temp1$V3[temp1$V4 != -99 &amp; ((temp1$V3 &lt; 18) )]), ncol=3)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;t3now= matrix(c(temp1$V4[temp1$V4 != -99 &amp; ((temp1$V3 == 18) | &lt;br /&gt;                           (temp1$V3 == 17 &amp; temp1$V1 == 12))],&lt;br /&gt;                radians[temp1$V4 != -99 &amp; ((temp1$V3 == 18) | &lt;br /&gt;                           (temp1$V3 == 17 &amp; temp1$V1 == 12))]), 
ncol=2)&lt;br /&gt;&lt;br /&gt;library(plotrix)&lt;br /&gt;radial.plot(t3old[,1],t3old[,2],rp.type="s", point.col = 2, point.symbols=46, &lt;br /&gt;    clockwise=TRUE, start = pi/2, label.pos = (1:12)/6 * (pi),&lt;br /&gt;    radial.lim=c(-20,10,40,70,100),  labels=c("February 1","March 1",&lt;br /&gt;      "April 1","May 1","June 1","July 1","August 1","September 1",&lt;br /&gt;      "October 1","November 1","December 1","January 1"))&lt;br /&gt;&lt;br /&gt;radial.plot(t3now[,1],t3now[,2],rp.type="s", point.col = 1, point.symbols='*', &lt;br /&gt;    clockwise=TRUE, start = pi/2, add=TRUE, radial.lim=c(-20,10,40,70,100))&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;If you didn't happen to see the update on the previous entry, note that the &lt;tt&gt;radial.lim&lt;/tt&gt; option makes the axes for the added points match those for the initial plot.  Otherwise, the added points plotted lower than they appeared, making the recent winter look cooler.&lt;br /&gt;&lt;br /&gt;Rick started with a smoother, but often cyclic data can be fit well parametrically, using the sine and cosine of the cycle length as the covariates.  With the data set up in radians already, this is trivially simple.  The predicted values for the data can be retrieved with the &lt;tt&gt;fitted()&lt;/tt&gt; function (e.g., section 3.7.3), which works with many model objects.  These can then be fed into the &lt;tt&gt;radial.plot()&lt;/tt&gt; function with &lt;tt&gt;rp.type="p"&lt;/tt&gt; to make a line plot.  The result is shown at the top-- the parametric fit appears to do a good job.  
Of course, you can fit on a square plot very easily with the &lt;tt&gt;plot()&lt;/tt&gt; function, with result shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;simple = lm(t3old[,1] ~ sin(t3old[,2]) + cos(t3old[,2]))&lt;br /&gt;&lt;br /&gt;radial.plot(fitted(simple),t3old[,2],rp.type="p", clockwise=TRUE,&lt;br /&gt;            start = pi/2, add=TRUE, radial.lim=c(-20,10,40,70,100))&lt;br /&gt;&lt;br /&gt;plot(t3old[,1] ~ t3old[,2], pch='.')&lt;br /&gt;lines(t3old[,2],fitted(simple))&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://1.bp.blogspot.com/-y2UucBVVNiQ/T3934l1V4uI/AAAAAAAADd0/d5ATemhKqgk/s1600/square.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://1.bp.blogspot.com/-y2UucBVVNiQ/T3934l1V4uI/AAAAAAAADd0/d5ATemhKqgk/s1600/square.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5728429065150784226" /&gt;&lt;/a&gt;&lt;br /&gt;I didn't change the order of the data, so the line comes back to the beginning of the plot at the end of the year.&lt;br /&gt;&lt;br /&gt;Adding a smoothed fit is nearly as easy.  Just replace the &lt;tt&gt;lm()&lt;/tt&gt; call with a &lt;tt&gt;loess()&lt;/tt&gt; (section 5.2.6) call.  The new line is added on top of the old one, to see just how they differ.  
The result is shown below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;simploess = loess(t3old[,1] ~ t3old[,2])&lt;br /&gt;radial.plot(fitted(simploess),t3old[,2],rp.type="p", line.col="blue", &lt;br /&gt;          clockwise=TRUE, start = pi/2, add=TRUE, radial.lim=c(-20,10,40,70,100))&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://3.bp.blogspot.com/-_a2tAUr075o/T39340l6JLI/AAAAAAAADeA/cyHoAfH3Q30/s1600/temptrend.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://3.bp.blogspot.com/-_a2tAUr075o/T39340l6JLI/AAAAAAAADeA/cyHoAfH3Q30/s1600/temptrend.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5728429069112583346" /&gt;&lt;/a&gt;&lt;br /&gt;The parametric fit is pretty good, but misses the sharp dip seen in January, and the fit in the late fall and early spring appears to be slightly affected.&lt;br /&gt;&lt;br /&gt;But this approach stacks up all the data from 18 years.  It might be more appropriate to stretch the data across the calendar time, fit the smoothed line to that, and then wrap it around the circular plot.  To do this, we'll need to add the year back into the radians.  Finding an acceptable smoother was a challenge-- the &lt;tt&gt;smooth.spline()&lt;/tt&gt; function used &lt;a href="http://sas-and-r.blogspot.com/2012/03/example-923-demonstrating-proportional.html"&gt;here&lt;/a&gt; was adequate, but as the second plot below shows, it misses some highs and lows.  Adding the smoothed curve to the plot is as easy as before, however.  
The plot with smoothing by year is immediately below.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;radyear = t3old[,2] + (2 * pi * t3old[,3])&lt;br /&gt;better = smooth.spline(y=t3old[,1],x= radyear, all.knots=TRUE,spar=1.1)&lt;br /&gt;&lt;br /&gt;radial.plot(fitted(better),t3old[,2],rp.type="p", line.col="green", &lt;br /&gt;        clockwise=TRUE, start = pi/2, add=TRUE, radial.lim=c(-20,10,40,70,100))&lt;br /&gt;&lt;br /&gt;plot(t3old[,1] ~ radyear, pch = '.')&lt;br /&gt;lines(better)&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;a href="http://1.bp.blogspot.com/-JpZE45YOfRA/T3935J2336I/AAAAAAAADeM/M2Uf6CrRLk4/s1600/temptrend2.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://1.bp.blogspot.com/-JpZE45YOfRA/T3935J2336I/AAAAAAAADeM/M2Uf6CrRLk4/s1600/temptrend2.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5728429074820882338" /&gt;&lt;/a&gt;&lt;br /&gt;The relatively poor fit seen below makes the new (green) line at least as poor as the parametric fit.  The extra variability in the winter is reflected in distinct lines in the winter.  Rick's approach, to fit the data lumping across years, seems to be the best for fitting, though it's easier to see the heteroscedasticity in the circular plot.  But however you slice it, this winter has had an unusual number of very warm days.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://2.bp.blogspot.com/-FntwIyV3Oi0/T3934S9_cOI/AAAAAAAADds/AtS_QjZ4c24/s1600/bettersquare.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://2.bp.blogspot.com/-FntwIyV3Oi0/T3934S9_cOI/AAAAAAAADds/AtS_QjZ4c24/s1600/bettersquare.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5728429060086788322" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;&lt;br /&gt;We love aggregators!  
Aggregators are meta-blogs that collect content from blogs that have similar coverage, for the convenience of readers.  For blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt; and &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=rzNHYAp0C_o:jS8HYjalDrU:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=rzNHYAp0C_o:jS8HYjalDrU:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=rzNHYAp0C_o:jS8HYjalDrU:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=rzNHYAp0C_o:jS8HYjalDrU:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=rzNHYAp0C_o:jS8HYjalDrU:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=rzNHYAp0C_o:jS8HYjalDrU:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=rzNHYAp0C_o:jS8HYjalDrU:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=rzNHYAp0C_o:jS8HYjalDrU:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/rzNHYAp0C_o" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/198740438431327365/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=198740438431327365" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/198740438431327365?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/198740438431327365?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/rzNHYAp0C_o/example-926-more-circular-plotting.html" title="Example 9.26: More circular plotting" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-W4LuMEqlRAI/T3935Uj4aSI/AAAAAAAADeY/Bdx7T9dnARw/s72-c/temptrig.png" height="72" width="72" /><thr:total>2</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/04/example-926-more-circular-plotting.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ak4GQ3w_eSp7ImA9WhVQFUU.&quot;"><id>tag:blogger.com,1999:blog-1275149608391671670.post-4776174414004002399</id><published>2012-04-02T09:33:00.009-04:00</published><updated>2012-04-04T21:22:02.241-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-04T21:22:02.241-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="apply()" /><category scheme="http://www.blogger.com/atom/ns#" 
term="graphics" /><category scheme="http://www.blogger.com/atom/ns#" term="plotrix package" /><category scheme="http://www.blogger.com/atom/ns#" term="read from URL" /><category scheme="http://www.blogger.com/atom/ns#" term="logic" /><title>Example 9.25: It's been a mighty warm winter? (Plot on a circular axis)</title><content type="html">&lt;a href="http://1.bp.blogspot.com/-TJX5QID-DWA/T3zsQLUGM8I/AAAAAAAADdU/ebhawJFKW1c/s1600/tempcorrected.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 600px; height: 400px;" src="http://1.bp.blogspot.com/-TJX5QID-DWA/T3zsQLUGM8I/AAAAAAAADdU/ebhawJFKW1c/s1600/tempcorrected.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5727712588767572930" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;span style="font-weight:bold;"&gt;&lt;br /&gt;Updated (see below)&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;People here in the northeast US consider this to have been an unusually warm winter.  Was it?&lt;br /&gt;&lt;br /&gt;The University of Dayton and the US Environmental Protection Agency maintain an &lt;a href="http://academic.udayton.edu/kissock/http/Weather/citylistUS.htm"&gt;archive of daily average temperatures&lt;/a&gt; that's reasonably current.  In the case of Albany, NY (the most similar of their records to our homes in the Massachusetts' Pioneer Valley), the data set as of this writing includes daily records from 1995 through March 12, 2012.&lt;br /&gt;&lt;br /&gt;In this entry, we show how to use R to plot these temperatures on a circular axis, that is, where January first follows December 31st.  We'll color the current winter differently to see how it compares.  We're not aware of a tool to enable this in SAS.  It would most likely require a bit of algebra and manual plotting to make it work.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;R&lt;/b&gt;&lt;br /&gt;The work of plotting is done by the &lt;tt&gt;radial.plot()&lt;/tt&gt; function in the plotrix package.  
But there are a number of data management tasks to be employed first.  Most notably, we need to calculate the relative portion of the year that's elapsed through each day.  This is trickier than it might be, because of leap years.  We'll read the data directly via URL, which we demonstrate in &lt;a href="http://sas-and-r.blogspot.com/2011/03/example-831-choropleth-maps.html"&gt;Example 8.31&lt;/a&gt;.  That way, when the unseasonably warm weather of last week is posted, we can update the plot with trivial ease.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;library(plotrix)&lt;br /&gt;temp1 = read.table("http://academic.udayton.edu/kissock/http/&lt;br /&gt;            Weather/gsod95-current/NYALBANY.txt")&lt;br /&gt;leap = c(0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1)&lt;br /&gt;days = rep(365, 18) + leap&lt;br /&gt;monthdays = c(31,28,31,30,31,30,31,31,30,31,30,31)&lt;br /&gt;temp1$V3 = temp1$V3 - 1994&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The &lt;tt&gt;leap&lt;/tt&gt;, &lt;tt&gt;days&lt;/tt&gt;, and &lt;tt&gt;monthdays&lt;/tt&gt; vectors identify leap years, count the correct number of days in each year, and have the number of days in the month in non-leap years, respectively.  We need each of these to get the elapsed time in the year for each day.  The columns in the data set are the month, day, year, and average temperature (in Fahrenheit).  The years are renumbered, since we'll use them as indexes later.&lt;br /&gt;&lt;br /&gt;The &lt;tt&gt;yearpart()&lt;/tt&gt; function, below, counts the proportion of days elapsed.  &lt;br /&gt;&lt;pre&gt;&lt;br /&gt;yearpart = function(daytvec,yeardays,mdays=monthdays){&lt;br /&gt;  part = (sum(mdays[1:(daytvec[1]-1)],&lt;br /&gt;           (daytvec[1] &gt; 2) * (yeardays[daytvec[3]]==366)) &lt;br /&gt;          + daytvec[2] - ((daytvec[1] == 1)*31)) / yeardays[daytvec[3]]&lt;br /&gt;  return(part)&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The &lt;tt&gt;daytvec&lt;/tt&gt; argument to the function will be a row from the data set.  
The function works by first summing the days in the months that have passed (&lt;tt&gt;sum(mdays[1:(daytvec[1]-1)]&lt;/tt&gt;) adding one if it's after February in a leap year (&lt;tt&gt;(daytvec[1] &gt; 2) * (yeardays[daytvec[3]]==366)&lt;/tt&gt;).  Then the days passed so far in the current month are added.  Finally, we subtract the length of January, if it's January.  This is needed, because &lt;tt&gt;sum(1:0) = 1&lt;/tt&gt;, the result of which is that January is counted as a month that has "passed" when the &lt;tt&gt;sum()&lt;/tt&gt; function quoted above is calculated for January days.  Finally, we just divide by the number of days in the current year.&lt;br /&gt;&lt;br /&gt;The rest is fairly simple.  We calculate the radians as the portion of the year passed * 2 * pi, using the &lt;tt&gt;apply()&lt;/tt&gt; function to repeat across the rows of the data set.  Then we make matrices with time before and time since this winter started, admittedly with some ugly logical expressions (section 1.14.11), and use the &lt;tt&gt;radial.plot()&lt;/tt&gt; function to make the plots.  
The options to the function are fairly self-explanatory.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;temp2 = as.matrix(temp1)&lt;br /&gt;radians = 2* pi * apply(temp2,1,yearpart,days,monthdays)&lt;br /&gt;&lt;br /&gt;t3old = matrix(c(temp1$V4[temp1$V4 != -99 &amp; ((temp1$V3 &lt; 18) | (temp1$V2 &lt; 12))],&lt;br /&gt;          radians[temp1$V4 != -99  &amp;  ((temp1$V3 &lt; 18) | (temp1$V2 &lt; 2))]),ncol=2)&lt;br /&gt;&lt;br /&gt;t3now= matrix(c(temp1$V4[temp1$V4 != -99 &amp; &lt;br /&gt;          ((temp1$V3 == 18) | (temp1$V3 == 17 &amp; temp1$V1 == 12))],&lt;br /&gt;          radians[temp1$V4 != -99 &amp; ((temp1$V3 == 18) | &lt;br /&gt;          (temp1$V3 == 17 &amp; temp1$V1 == 12))]),ncol=2)&lt;br /&gt;# from plottrix library&lt;br /&gt;radial.plot(t3old[,1],t3old[,2],rp.type="s", point.col = 2, point.symbols=46,&lt;br /&gt;            clockwise=TRUE, start = pi/2, label.pos = (1:12)/6 * (pi), &lt;br /&gt;            labels=c("February 1","March 1","April 1","May 1","June 1",&lt;br /&gt;            "July 1","August 1","September 1","October 1","November 1",&lt;br /&gt;            "December 1","January 1"), radial.lim=c(-20,10,40,70,100))&lt;br /&gt;&lt;br /&gt;radial.plot(t3now[,1],t3now[,2],rp.type="s", point.col = 1, point.symbols='*', &lt;br /&gt;            clockwise=TRUE, start = pi/2, add=TRUE, radial.lim=c(-20,10,40,70,100))&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;The result is shown at the top.  The dots (&lt;tt&gt;point.symbols&lt;/tt&gt; is like &lt;tt&gt;pch&lt;/tt&gt;, so 20 is a point; see section 5.2.2) show the older data, while the asterisks are the current winter.  An alternate plot can be created with the &lt;tt&gt;rp.type="p"&lt;/tt&gt; option, which makes a line plot.  
The result is shown below, but the lines connecting the dots get most of the ink and are not what we care about today.&lt;br /&gt;&lt;a href="http://2.bp.blogspot.com/-I5zs6td0r3U/T3XygTYBSAI/AAAAAAAADcI/Ts6hQk4Rqlo/s1600/circplot2.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 327px;" src="http://2.bp.blogspot.com/-I5zs6td0r3U/T3XygTYBSAI/AAAAAAAADcI/Ts6hQk4Rqlo/s400/circplot2.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5725749138042210306" /&gt;&lt;/a&gt;&lt;br /&gt;Either plot demonstrates clearly that a typical average temperature in Albany is about 60 to 80 in August and about 10 to 35 in January, the coldest month.  &lt;br /&gt;&lt;br /&gt;&lt;b&gt;Update&lt;/b&gt;&lt;br /&gt;The top figure shows that it has in fact been quite a warm winter-- most of the black asterisks are near the outside of the range of red dots.  Updating with more recent weeks will likely increase this impression.  In the first edition of this post, the &lt;tt&gt;radial.lim&lt;/tt&gt; option was omitted, which resulted in different axes in the original and "add" calls to &lt;tt&gt;radial.plot&lt;/tt&gt;.  This made the winter look much cooler.  Many thanks to Robert Allison for noticing the problem in the main plot.  Robert has made many hundreds of beautiful graphics in SAS, which can be found &lt;a href="http://robslink.com/SAS/Home.htm"&gt;here&lt;/a&gt;.  He also has a &lt;a href="http://www.amazon.com/gp/product/1607649896/ref=as_li_ss_tl?ie=UTF8&amp;tag=sasandrblog-20&amp;linkCode=as2&amp;camp=1789&amp;creative=390957&amp;creativeASIN=1607649896"&gt;book&lt;/a&gt;.  
Robert also created a version of the plot above in SAS, which you can find &lt;a href="http://robslink.com/SAS/democd55/albany_ny_circular.htm"&gt;here&lt;/a&gt;, with code &lt;a href="http://robslink.com/SAS/democd55/albany_ny_circular.sas"&gt;here&lt;/a&gt;.  Both SAS and R (not to mention a host of other environments) are sufficiently general and flexible that you can do whatever you want to do-- but varying amounts of expertise might be required.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An unrelated note about aggregators&lt;/b&gt;&lt;br /&gt;We love aggregators!  Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences.  &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt; is aggregated by &lt;a href="http://www.r-bloggers.com/"&gt;R-bloggers&lt;/a&gt; and &lt;a href="http://proc-x.com/"&gt;PROC-X&lt;/a&gt; with our permission, and by at least 2 other aggregating services which have never contacted us.  If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at &lt;a href="http://sas-and-r.blogspot.com/"&gt;SAS and R&lt;/a&gt;.  We answer comments there and offer direct subscriptions if you like our content.  In addition, no one is allowed to profit by this work under our &lt;a href="http://creativecommons.org/licenses/by-nc-sa/3.0/"&gt;license&lt;/a&gt;; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wmqxif5110c:ej2aOXeFRGA:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wmqxif5110c:ej2aOXeFRGA:J3aVl1i_38o"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=J3aVl1i_38o" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wmqxif5110c:ej2aOXeFRGA:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wmqxif5110c:ej2aOXeFRGA:63t7Ie-LG7Y"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?d=63t7Ie-LG7Y" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wmqxif5110c:ej2aOXeFRGA:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wmqxif5110c:ej2aOXeFRGA:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/SASandR?a=wmqxif5110c:ej2aOXeFRGA:gIN9vFwOqvQ"&gt;&lt;img src="http://feeds.feedburner.com/~ff/SASandR?i=wmqxif5110c:ej2aOXeFRGA:gIN9vFwOqvQ" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SASandR/~4/wmqxif5110c" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://sas-and-r.blogspot.com/feeds/4776174414004002399/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=1275149608391671670&amp;postID=4776174414004002399" title="10 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/4776174414004002399?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/1275149608391671670/posts/default/4776174414004002399?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SASandR/~3/wmqxif5110c/example-925-its-been-mighty-warm-winter.html" title="Example 9.25: It's been a mighty warm winter? (Plot on a circular axis)" /><author><name>Ken Kleinman</name><uri>http://www.blogger.com/profile/09525118721291529157</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="28" height="32" src="http://3.bp.blogspot.com/_zLwIdu2sLKM/SjKezvboI7I/AAAAAAAACOY/GiDRa1D4MVA/S220/kleinman.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-TJX5QID-DWA/T3zsQLUGM8I/AAAAAAAADdU/ebhawJFKW1c/s72-c/tempcorrected.png" height="72" width="72" /><thr:total>10</thr:total><feedburner:origLink>http://sas-and-r.blogspot.com/2012/04/example-925-its-been-mighty-warm-winter.html</feedburner:origLink></entry></feed>
