<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/atom10full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" gd:etag="W/&quot;C0QDSXs6eip7ImA9WhRbEUw.&quot;"><id>tag:blogger.com,1999:blog-29815492</id><updated>2012-02-01T10:49:38.512-05:00</updated><category term="PROC NPAR1WAY" /><category term="quantile computing" /><category term="PROC GLMSELECT" /><category term="PROC SCORE" /><category term="SVD" /><category term="HPGLIMMIX" /><category term="Boost Algorithms" /><category term="Hash Object" /><category term="Tensor" /><category term="PROC FASTCLUS" /><category term="Gap Statistic" /><category term="PROC MEANS" /><category term="PROC CORR" /><category term="PROC EXPAND" /><category term="PROC REG" /><category term="SAS" /><category term="PROC PLS" /><category term="Index" /><category term="K/N Algorithm" /><category term="PROC SQL" /><category term="PROC GLIMMIX" /><category term="PROC DISTANCE" /><category term="PROC DISCRIM" /><category term="HOSVD" /><category term="AUC" /><category term="Bayesian" /><category term="PROC ORTHOREG" /><category term="PROC MIXED" /><category term="Data Mining" /><category term="Gini Index" /><category term="PROC GPLOT" /><category term="Moore-Penrose pseudoinverse" /><category term="GRAPH" /><category term="PROC STDIZE" /><category term="random number" /><category term="Data Manipulation" /><category term="PROC GENDMO" /><category term="Statistical Graphics" /><category term="PROC PRINCOMP" /><category term="Filter" /><category term="Format" /><category term="Array" /><category term="KNN" /><category term="Nearest Neighbor" /><category 
term="multi-threading" /><category term="K-means Clustering" /><category term="PROC FACTOR" /><category term="PROC APPEND" /><category term="PROC FORMAT" /><category term="kernel" /><category term="PROC HPMIXED" /><category term="PROC STANDARD" /><category term="Macro Programming" /><category term="PROC CANDISC" /><category term="Random Split" /><category term="PROC GLMMOD" /><category term="predictive modeling" /><category term="PCA" /><category term="R" /><category term="PROC UNIVARIATE" /><title>SAS Programming for Data Mining Applications</title><subtitle type="html">© Copyright 2006-2011 /
SAS® and all other SAS Institute Inc. product or service names are registered trademarks or trademarks of SAS Institute Inc. in the USA and other countries. ® indicates USA registration. Other brand and product names are trademarks of their respective companies.</subtitle><link rel="http://schemas.google.com/g/2005#feed" type="application/atom+xml" href="http://www.sas-programming.com/feeds/posts/default" /><link rel="alternate" type="text/html" href="http://www.sas-programming.com/" /><link rel="next" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default?start-index=26&amp;max-results=25&amp;redirect=false&amp;v=2" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><generator version="7.00" uri="http://www.blogger.com">Blogger</generator><openSearch:totalResults>52</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/atom+xml" href="http://feeds.feedburner.com/SasProgramming" /><feedburner:info uri="sasprogramming" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><entry gd:etag="W/&quot;DkIAQ38ycCp7ImA9WhRbEEs.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-3028162046104849377</id><published>2012-01-31T21:45:00.001-05:00</published><updated>2012-01-31T21:49:02.198-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-01-31T21:49:02.198-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC PRINCOMP" /><category scheme="http://www.blogger.com/atom/ns#" term="multi-threading" /><category 
scheme="http://www.blogger.com/atom/ns#" term="PCA" /><title>Multi-Threaded Principle Component Analysis</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/8sPIHEdqu2clcEi_NVkerajCRQQ/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/8sPIHEdqu2clcEi_NVkerajCRQQ/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/8sPIHEdqu2clcEi_NVkerajCRQQ/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/8sPIHEdqu2clcEi_NVkerajCRQQ/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-cJHtENUrbws/Tyil0xyStRI/AAAAAAAAAbU/RCpoC5M-WGQ/s1600/PCA+Multithreading1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="377" src="http://4.bp.blogspot.com/-cJHtENUrbws/Tyil0xyStRI/AAAAAAAAAbU/RCpoC5M-WGQ/s640/PCA+Multithreading1.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-oph0w9rWk1Q/Tyil3GfeRdI/AAAAAAAAAbc/Kl11OfZgrkE/s1600/PCA+Multithreading2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="225" src="http://3.bp.blogspot.com/-oph0w9rWk1Q/Tyil3GfeRdI/AAAAAAAAAbc/Kl11OfZgrkE/s640/PCA+Multithreading2.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
SAS used to not support multithreading in PCA; then I figured out that its server version supports this functionality, see &lt;a href="http://listserv.uga.edu/cgi-bin/wa?A2=ind1009d&amp;amp;L=sas-l&amp;amp;D=0&amp;amp;P=4318" target="_blank"&gt;here&lt;/a&gt;. Today, I found that this multithreading capability is finally available in PC SAS v9.22.&lt;br /&gt;
&lt;br /&gt;
The figure above indicates that all 4 threads in my PC are utilized. FYI, my PC uses an Intel 2-core, 4-thread CPU. This multi-threading capability directly helps any work relying on SVD due to the direct relationship between SVD and PCA, see &lt;a href="http://www.sas-programming.com/2010/03/macro-for-svd.html" target="_blank"&gt;here&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
Notice that in order to observe the effect of multi-threading by comparing Real Time and CPU Time, I/O should not be a bottleneck; that is why, in the code, all outputs, either to screen or to data sets, are suppressed.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

/* Ask SAS to write full timing statistics to the log so that
   real time can be compared with CPU time. */
options fullstimer;

/* Simulated input: one row per id (1 to 5E3), each row holding
   800 numbers drawn with RANUNI(0). The inner loop counter is
   dropped from the output. */
data _junky;
   length id x: 8;
   array x{800};
   drop col;
   do id = 1 to 5E3;
      do col = 1 to dim(x);
         x[col] = ranuni(0);
      end;
      output;
   end;
run;

/* PCA over every variable whose name starts with x; NOPRINT
   suppresses the displayed output. */
proc princomp data=_junky noprint;
   var x:;
run;

&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-3028162046104849377?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/GUoIlOc0y8U" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/3028162046104849377/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=3028162046104849377" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/3028162046104849377?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/3028162046104849377?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/GUoIlOc0y8U/multi-threaded-principle-component.html" title="Multi-Threaded Principle Component Analysis" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-cJHtENUrbws/Tyil0xyStRI/AAAAAAAAAbU/RCpoC5M-WGQ/s72-c/PCA+Multithreading1.png" height="72" width="72" /><thr:total>1</thr:total><feedburner:origLink>http://www.sas-programming.com/2012/01/multi-threaded-principle-component.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0MDSH44cCp7ImA9WhRUGUU.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-5189434209973827446</id><published>2012-01-30T11:32:00.001-05:00</published><updated>2012-01-30T22:44:39.038-05:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2012-01-30T22:44:39.038-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Statistical Graphics" /><category scheme="http://www.blogger.com/atom/ns#" term="random number" /><title>Random Number Seeds: NOT only the first one matters!</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/vDgBhiunYJOcNzArarbiPLwrlzA/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/vDgBhiunYJOcNzArarbiPLwrlzA/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/vDgBhiunYJOcNzArarbiPLwrlzA/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/vDgBhiunYJOcNzArarbiPLwrlzA/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-tAPHsHdsVxs/Tydjmj6yijI/AAAAAAAAAbM/OlA2hfloC0w/s1600/SGScatter+%282%29.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://2.bp.blogspot.com/-tAPHsHdsVxs/Tydjmj6yijI/AAAAAAAAAbM/OlA2hfloC0w/s400/SGScatter+%282%29.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Today, Rick (blog @ &lt;a href="http://blogs.sas.com/content/iml/" target="_blank"&gt;here&lt;/a&gt;) wrote an&amp;nbsp;&lt;a href="http://blogs.sas.com/content/iml/2012/01/30/random-number-seeds-only-the-first-seed-matters/?utm_source=feedburner&amp;amp;utm_medium=feed&amp;amp;utm_campaign=Feed%3A+TheDoLoop+%28The+DO+Loop%29" target="_blank"&gt;article&lt;/a&gt; about random number seed in SAS to be used in random number functions in DATA Step. Rick noticed when multiple random number functions are called using different seeds, only the first one matters. &lt;br /&gt;
&lt;br /&gt;
This is so true. In fact, the SAS manual also has a comprehensive write-up on this issue,&amp;nbsp;namely, how to control seeds for each iteration of the random number generation process and how to generate multiple statistically independent streams of random numbers, see &lt;a href="http://support.sas.com/documentation/cdl/en/lefunctionsref/63354/HTML/default/viewer.htm#p026ygl6toz3tgn14lt4iu6cl5bb.htm#p04slf135yfs1sn1vrvrvso072d3" target="_blank"&gt;here&lt;/a&gt;. In fact, sasCommunity.org also has an article about this issue, see &lt;a href="http://www.sascommunity.org/wiki/How_the_SAS_Random_Number_Generators_Work" target="_blank"&gt;here&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
To&amp;nbsp;echo Rick's post,&amp;nbsp; there is a way to control the seed so that NOT only the first one matters: use CALL routines, which in theory will generate computationally independent random number sequences. But if the two seeds are too close, the generated sequences may not be statistically independent. Again, refer to the SAS manual for details.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

/* Generate 1000 rows of normal random numbers. x1-x3 come from
   CALL RANNOR with three separate seed variables; x4 and x5 come
   from the RANNOR function called with seed2 and seed3.
   NOTE(review): CALL RANNOR presumably updates its seed argument on
   every call, so the function calls below see the seed values left
   by the preceding CALL statements - confirm against the SAS docs.
   The loop counter i and the seed variables are not dropped, so they
   are written to the output data set as well. */
data normal;
   seed1 = 11111;
   seed2 = 22222;
   seed3 = 383333;
   do i = 1 to 1000;
      call rannor(seed1, x1);
      call rannor(seed2, x2);   
      call rannor(seed3, x3);
   x4=rannor(seed2);
   x5=rannor(seed3);
      output;
   end;
run;

/* Scatter-plot matrix of x1-x5, drawn with marker size 1. */
proc sgscatter data=normal;
   matrix x1-x5/ markerattrs = (size = 1);
run; 
 
&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-5189434209973827446?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/E2VhkyIM6Rk" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/5189434209973827446/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=5189434209973827446" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5189434209973827446?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5189434209973827446?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/E2VhkyIM6Rk/today-rick-blog-here-wrote-article.html" title="Random Number Seeds: NOT only the first one matters!" 
/><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-tAPHsHdsVxs/Tydjmj6yijI/AAAAAAAAAbM/OlA2hfloC0w/s72-c/SGScatter+%282%29.png" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2012/01/today-rick-blog-here-wrote-article.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0MESX04fCp7ImA9WhRSF04.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-8395996439289391529</id><published>2011-11-16T15:12:00.001-05:00</published><updated>2011-11-19T16:50:08.334-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-11-19T16:50:08.334-05:00</app:edited><title>Using PROC CANCORR to solve large scale PLS problem</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/OIAF-beNO2EBfQO6E5hUVHZkGcM/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/OIAF-beNO2EBfQO6E5hUVHZkGcM/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/OIAF-beNO2EBfQO6E5hUVHZkGcM/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/OIAF-beNO2EBfQO6E5hUVHZkGcM/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-ZwI8gYhmLHw/TsggxflCfPI/AAAAAAAAAa4/R0wnudgvm6w/s1600/PLS.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="300" src="http://1.bp.blogspot.com/-ZwI8gYhmLHw/TsggxflCfPI/AAAAAAAAAa4/R0wnudgvm6w/s400/PLS.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Partial Least Squares (PLS) is a powerful tool for discriminant analysis with a large number of predictors [1].&lt;br /&gt;
&lt;br /&gt;
PLS extracts latent factors that maximize the covariance between independent variables and dependent variables. This process is equivalent to Generalized Eigenvalue Decomposition of the following formula [2]:&lt;br /&gt;
$$X'HXw =\phi X'Xw $$. For PLS, $$H=Y'Y$$. Note that Canonical Correlation Analysis (CCA) leads to the same generalized eigenvalue decomposition problem; specifically, for CCA, $$H=Y'(YY')^{-1}Y$$.&lt;br /&gt;
&lt;br /&gt;
In SAS, PROC PLS implements 2 forms of PLS, namely the original NIPALS [3] and SIMPLS [4]. When there is only one dependent variable, the two algorithms generate the same output. PLS is a computationally very demanding algorithm. While powerful, when the dimension of the problem at hand becomes very large, PROC PLS will encounter issues such as insufficient memory and very long computing time. 

There is a remedy when only one dependent variable Y is present. In this case, CCA and PLS differ only up to a fixed scale parameter. Therefore we can use PROC CANCORR, which is very scalable and multithreaded, to solve the PLS problem. The obtained weights and loadings will not be the same but the difference is only up to a fixed scale parameter.&lt;br /&gt;
&lt;br /&gt;
In the following log, we demonstrate the behavior of PROC PLS and PROC CANCORR on a server with 4GB accessible memory when the number of independent variables is 5000 and the sample size is 100K. PROC PLS reported insufficient memory and stopped computing in 45 seconds after exhausting all accessible memory, while PROC CANCORR continued and finished computation in slightly more than 7 minutes. Both procedures used up the 3.92GB of memory available to SAS from the system. Also note that PROC CANCORR used more than 33 minutes of CPU time, indicating its very good scale-up capability in a multi-core environment.&lt;br /&gt;
&lt;br /&gt;
References:&lt;br /&gt;
[1] Barker, M and Rayens, W (2003), "Partial Least Squares for Discrimination", &lt;i&gt;Journal of&amp;nbsp;&lt;/i&gt; &lt;i&gt;Chemometrics&lt;/i&gt;, 17, 166-173&lt;br /&gt;
&lt;br /&gt;
&lt;span class="style1"&gt;[2] Sun, L; Ji, S; Yu, S; and Ye, J (2009), 
 "&lt;a href="http://www.public.asu.edu/%7Elsun27/Publications/IJCAI_2009.pdf"&gt;On the Equivalence Between Canonical Correlation Analysis and  Orthonormalized Partial Least Squares&lt;/a&gt;", In &lt;i&gt;Proceedings of the 21st 
 International Joint Conference on Artificial Intelligence&lt;/i&gt; (&lt;b&gt;IJCAI 2009)&lt;/b&gt;.&lt;/span&gt;&lt;br /&gt;
&lt;br /&gt;
[3] Wold, H. (1966), “Estimation of Principal Components and Related Models by Iterative Least Squares,” in P.&amp;nbsp;R. Krishnaiah, ed., &lt;i&gt;Multivariate Analysis&lt;/i&gt;, New York: Academic Press. &lt;br /&gt;
&lt;br /&gt;
[4] de&amp;nbsp;Jong, S. (1993), “SIMPLS: An Alternative Approach to Partial Least Squares Regression,” &lt;i&gt;Chemometrics and Intelligent Laboratory Systems&lt;/i&gt;, 18, 251–263. &lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
NOTE: PROCEDURE PRINTTO used (Total process time):
      real time           0.00 seconds
      user cpu time       0.00 seconds
      system cpu time     0.00 seconds
      Memory                            3920274k
      OS Memory                         3926280k
      Timestamp            11/16/2011  2:11:22 PM
      Page Faults                       0
      Page Reclaims                     9
      Page Swaps                        0
      Voluntary Context Switches        1
      Involuntary Context Switches      1
      Block Input Operations            0
      Block Output Operations           0
      

16         options fullstimer;
17         data x;
18              length id y x: 8;
19           array x{5000};
20              do id=1 to 1E5;
21              y=rannor(0);
22              do j=1 to dim(x);
23              x[j]=rannor(0);
24           end;
25           output;
26           drop j;
27           end;
28         run;

NOTE: The data set WORK.X has 100000 observations and 5002 variables.
NOTE: Compressing data set WORK.X increased size by 0.06 percent. 
      Compressed is 100066 pages; un-compressed would require 100009 pages.
NOTE: DATA statement used (Total process time):
      real time           1:07.58
      user cpu time       1:02.41
      system cpu time     5.16 seconds
      Memory                            3920274k
      OS Memory                         3926280k
      Timestamp            11/16/2011  2:12:29 PM
      Page Faults                       0
      Page Reclaims                     510
      Page Swaps                        0
      Voluntary Context Switches        63
      Involuntary Context Switches      107
      Block Input Operations            0
      Block Output Operations           0
      

29         
30         
31         proc pls data=x method=simpls noprint;
32              model y =x1-x5000;
33         run;

ERROR: The SAS System stopped processing this step because of insufficient memory.
NOTE: There were 100000 observations read from the data set WORK.X.
NOTE: PROCEDURE PLS used (Total process time):
      real time           45.89 seconds
      user cpu time       40.11 seconds
      system cpu time     5.77 seconds
      Memory                            3920284k
      OS Memory                         3926280k
      Timestamp            11/16/2011  2:13:15 PM
      Page Faults                       0
      Page Reclaims                     978075
      Page Swaps                        0
      Voluntary Context Switches        91
      Involuntary Context Switches      162
      Block Input Operations            0
      Block Output Operations           0
      
35         proc cancorr data=x noprint;
36              var y;
37           with x1-x5000;
38         run;

NOTE: PROCEDURE CANCORR used (Total process time):
      real time           7:02.85
      user cpu time       33:13.85
      system cpu time     4.40 seconds
      Memory                            3920284k
      OS Memory                         3926280k
      Timestamp            11/16/2011  2:20:18 PM
      Page Faults                       0
      Page Reclaims                     126339
      Page Swaps                        0
      Voluntary Context Switches        2096
      Involuntary Context Switches      83359
      Block Input Operations            0
      Block Output Operations           0
      

39    
40         proc printto; run;

&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-8395996439289391529?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/5ixEz_Y5cX8" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/8395996439289391529/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=8395996439289391529" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/8395996439289391529?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/8395996439289391529?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/5ixEz_Y5cX8/using-proc-cancorr-to-solve-large-scale.html" title="Using PROC CANCORR to solve large scale PLS problem" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-ZwI8gYhmLHw/TsggxflCfPI/AAAAAAAAAa4/R0wnudgvm6w/s72-c/PLS.png" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/11/using-proc-cancorr-to-solve-large-scale.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUQNQn07fip7ImA9WhRTFUo.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-2415426124549242306</id><published>2011-10-06T14:21:00.006-04:00</published><updated>2011-11-06T06:03:13.306-05:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2011-11-06T06:03:13.306-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC CORR" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC SCORE" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC REG" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><title>Obtain Trace of the Projection Matrix in a Linear Regression</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/RV64_AdUly3H7ZUkcDyZbgPhlnQ/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/RV64_AdUly3H7ZUkcDyZbgPhlnQ/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/RV64_AdUly3H7ZUkcDyZbgPhlnQ/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/RV64_AdUly3H7ZUkcDyZbgPhlnQ/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Recently, I am working on coding in SAS for a set of regularized regressions and need to compute trace of the projection matrix:&lt;br /&gt;
$$ S=X(X'X + \lambda I)^{-1}X' $$.&lt;br /&gt;
&lt;br /&gt;
Wikipedia has a well written introduction to Trace @&lt;a href="http://en.wikipedia.org/wiki/Trace_%28linear_algebra%29"&gt; here&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
To obtain the inverse of matrix (X'X + \lambda I) in SAS/STAT, there are multiple ways:&lt;br /&gt;
1. Build the SSCP matrix first, then inverse it using the following methods:&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.1. Using SVD based method, shown @ &lt;a href="http://www.sas-programming.com/2009/07/svd-in-sas-without-iml.html"&gt;here&lt;/a&gt;&amp;nbsp;but not demonstrated in this post;&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.2. Using PROC REG, shown @ &lt;a href="http://www.sas-programming.com/2009/08/matrix-inversion-in-sas-stat.html"&gt;here&lt;/a&gt;&amp;nbsp;, and also demonstrated in the code below;&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.3. Using SWEEP operator, shown @ &lt;a href="http://www.sas-programming.com/2011/08/rolling-analysis-of-time-series.html"&gt;here&lt;/a&gt; under Method 0 and Method 1;&lt;br /&gt;
2. Since (X'X + \lambda I)&amp;nbsp;is the SSCP for a Ridge Regression with ridge parameter=\lambda, it is handy to directly use the ODS OUTPUT InvXPX= statement to obtain the inverse matrix, when X is augmented with a diagonal matrix of \lambda*I_{p, p}. SAS code is&amp;nbsp;demonstrated below;&lt;br /&gt;
&lt;br /&gt;
Trace of the projection matrix S is a key concept in modern regression analysis. For example, the effective degrees of freedom of the model in a regularized linear regression is trace(S), see [1] for details. As another example, in approximating leave-one-out cross-validation using GCV, trace(S)/N is a key component (formula 7.52 of [1]). &lt;br /&gt;
&lt;br /&gt;
Check the reference and the cited works therein for more information.&lt;br /&gt;
&lt;div style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none;"&gt;
Ref:&lt;/div&gt;
&lt;div style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none;"&gt;
1. Hastie et al., The Elements of Statistical Learning, 2nd Ed.&lt;/div&gt;
&lt;div style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* Use the SocEcon example data created in
   Example A.1: A TYPE=CORR Data Set Produced by PROC CORR
   on page 8153 of the SAS/STAT 9.22 User's Guide.
   Five numeric variables are read per row with list input;
   12 rows are supplied inline.
*/
data SocEcon;
input Pop School Employ Services House;
datalines;
5700 12.8 2500 270 25000
1000 10.9 600 10 10000
3400 8.8 1000 10 9000
3800 13.6 1700 140 25000
4000 12.8 1600 140 25000
8200 8.3 2600 60 12000
1200 11.4 400 10 16000
9100 11.5 3300 60 14000
9900 12.5 3400 180 18000
9600 13.7 3600 390 25000
9600 9.6 3300 80 12000
9400 11.4 4000 100 13000
;
run;

/* Macro parameters: name of the dependent variable, list of
   covariates, and the regularization parameter lambda. */
%let depvar=HOUSE;
%let covars=pop school employ services;
%let lambda=1;
/* Below is the way to obtain trace(S), where S is the projection matrix in a (regularized) linear regression.
   For further information, check pp.68, pp.153 of Elements of Statistical Learning, 2nd Ed.
*/

/* For details about TYPE=SSCP special SAS data, consult:
  Appendix A: Special SAS Data Sets, pp.8159 of SAS/STAT 9.22 User's Guide.
  PROC CORR is run with the SSCP option on the covariates; the OUT= data
  set xtx keeps only the rows whose _TYPE_ is 'SSCP', and NOPRINT
  suppresses the displayed output.
*/
proc corr data=SocEcon sscp out=xtx(where=(_TYPE_='SSCP'))  noprint;
     var &amp;amp;covars;
run;


/* Build the linear system that PROC REG solves for the inverse:
   each row of xtx gets five extra columns i1-i5 used as
   right-hand sides, and its own diagonal element is incremented.
   _n is declared before i1-i5 exist, so it spans only the numeric
   variables read from xtx; _n_ (the row number) indexes the
   diagonal element of the current row.
   NOTE(review): the diagonal increment is the literal 1 while the
   right-hand side uses the lambda macro variable; the two agree only
   for lambda=1 - confirm which of them should carry lambda. */
data xtx2;
   set xtx;
   array _n{*} _numeric_;
   array _i{*} i1-i5 (5*0);
   do j=1 to 5;
      /* right-hand side: lambda on the diagonal position, 0 elsewhere */
      if j=_n_ then _i[_n_]=&amp;amp;lambda;
      else _i[j]=0;
   end;
   _n[_n_]=_n[_n_]+1;
   drop j _TYPE_  _NAME_;
run;

/* Obtain the inverse of (XTX+\lambda*I).
   Each of i1-i5 is regressed on Intercept and the covariates, using
   the rows of xtx2 as observations. Intercept is listed explicitly
   as a regressor and NOINT removes the implicit intercept term.
   OUTEST= writes the parameter estimates to S0 (marked TYPE=SSCP),
   dropping i1-i5, _MODEL_, _DEPVAR_ and _RMSE_; SINGULAR=1E-17
   overrides the default singularity criterion.
   NOTE(review): for a square, nonsingular system the estimates should
   equal the columns of the inverse matrix - confirm.
*/
proc reg data=xtx2  
         outest=S0(type=SSCP
                   drop=i1-i5 _MODEL_  _DEPVAR_  _RMSE_)
         singular=1E-17;
     model i1-i5 = Intercept &amp;amp;covars / noint   noprint;
run;quit;

/* Relabel every row of S0 through _NAME_ as X_ followed by the row
   number (_n_). LENGTH is issued after SET on purpose of keeping the
   original statement order.
   NOTE(review): if S0 already carries _NAME_, the LENGTH statement
   presumably does not change its length - confirm. */
data S0;
     set S0; 
  length _NAME_ $8;
  _NAME_=cats('X_', _n_);
run;

/* Score SocEcon with the rows of S0 (TYPE=PARMS) over the covariates
   and keep only the variables prefixed X_ in XS0.
   NOTE(review): presumably one score variable per S0 row, named by
   _NAME_ - confirm. */
proc score data=SocEcon  score=S0  out=XS0(keep=X_:)  type=parms;
     var &amp;amp;covars;
run;     

/* Multiply each X_ score column element-wise by the matching
   design column (intercept, pop, school, employ, services) and
   add the products up within the row; only rowSum is kept.
   NOTE(review): data set X is not created in the code shown in
   this post - confirm where it comes from. */
data XS0X;
   merge XS0  X;
   keep rowSum;
   array _x1{*} X_:;
   array _x0{*} intercept pop school employ services;
   do k = 1 to dim(_x1);
      _x1[k] = _x1[k] * _x0[k];
   end;
   rowSum = sum(of _x1[*]);
run;

/* Total of rowSum over all rows, written to data set trace as
   variable Trace. */
proc means data=XS0X noprint;
   var rowSum;
   output out=trace sum(rowSum)=Trace;
run;
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;img border="0" height="361" src="http://1.bp.blogspot.com/-ry9Wz4E6IIU/To5D1DErv3I/AAAAAAAAAaY/X2MPxQmkOzE/s640/trace.png" width="640" /&gt;&lt;br /&gt;
Verify the result using R:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
&amp;gt; socecon&amp;lt;-read.csv('c:/socecon.csv', header=T)
&amp;gt; x&amp;lt;-as.matrix(cbind(1, socecon[,-5]))
&amp;gt; xtx&amp;lt;-t(x)%*%x
&amp;gt; phi&amp;lt;-xtx+diag(rep(1, 5))
&amp;gt; 
&amp;gt; # method 1. 
&amp;gt; S&amp;lt;-x%*%solve(phi)*x
&amp;gt; sum(S)
[1] 4.077865
&amp;gt; # method 2. 
&amp;gt; S&amp;lt;-(x%*%solve(phi))%*%t(x)
&amp;gt; sum(diag(S))
[1] 4.077865
&amp;gt;
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
Of course, besides Method 1.2 shown above, we can also use Method 2 mentioned earlier and obtain the same inverse matrix:&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

/* Augmented-data form of ridge regression: pass the real observations
   through with Intercept=1, then append one pseudo-observation per
   coefficient whose only non-zero entry is sqrt(lambda) in column j.
   The crossproducts of the augmented design are then X'X + lambda*I.
   NOTE(review): the response variable is not touched here, so on the
   pseudo-observations it keeps the value of the last real observation;
   that does not affect the X part of the inverse, but it should be 0 if
   the ridge coefficients themselves are needed -- confirm. */
data SocEcon2 /view=SocEcon2;
     set x  end=eof;
     array x{5} Intercept &amp;amp;covars (5*0);
     Intercept=1;
     output;
     if eof then do;
        /* the array still holds the last observation: clear it first so
           that each pseudo-observation is zero outside column j */
        do j=1 to dim(x);
           x[j]=0;
        end;
        do j=1 to dim(x);
           x[j]=sqrt(&amp;amp;lambda);
           output;
           x[j]=0;
        end;
     end;
     drop j;
run;

/* Fit the no-intercept model on the augmented view only to capture the
   ODS table InvXPX (requested by the I option) in data set S1.
   ODS SELECT NONE suppresses all displayed output while the table is
   captured; ODS SELECT ALL restores it afterwards. */
ods select none;
ods output InvXPX=S1;
proc reg data=SocEcon2  singular=1E-17;
     model y = Intercept &amp;amp;covars /noint  i;
run; quit;
ods select all;

&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
As a side note, it is also of interest to compare the computational&amp;nbsp;performance of the two illustrated methods. The SVD-based and SWEEP-operator-based approaches are omitted.&lt;br /&gt;
&lt;br /&gt;
Using a SAS data set of 1001 covariates (including Intercept term) and 1E6 observations, total 7.6GB on a windows PC, the test was done on two dramatically different machines: one WindowsXP&amp;nbsp;laptop with mediocre HDD and a 2-core T7300 CPU; the other one is a high-end Server running Linux64 with Disk Array and fast CPUs totaled 16 cores. &lt;br /&gt;
&lt;br /&gt;
On the PC:&lt;br /&gt;
-&amp;gt; Using method 1.2, the breakdown of the time used is listed below:&lt;br /&gt;
PROC CORR:&amp;nbsp; real time: 25:47.72; CPU time: 15:26.15&lt;br /&gt;
DATA step on XTX: real time: 2.28 sec; CPU time: 0.07 sec&lt;br /&gt;
PROC REG :&amp;nbsp; real time: 15.45 sec; CPU time: 27.31 sec;&lt;br /&gt;
Total real time: 26:05.45&lt;br /&gt;
&lt;br /&gt;
-&amp;gt; Using method&amp;nbsp; 2, the time used is:&lt;br /&gt;
PROC REG: real time: 48:28.44; CPU time: 1:16:46.41&lt;br /&gt;
&lt;br /&gt;
On the server:&lt;br /&gt;
&amp;nbsp;-&amp;gt; Using method 1.2, the breakdown of the time used is listed below:&lt;br /&gt;
PROC CORR: real time: 5:40.61; CPU time: 5:40.58&lt;br /&gt;
DATA step on XTX: real time: 0.05 sec; CPU time: 0.05 sec&lt;br /&gt;
PROC REG: real time 1.71 sec; CPU time: 5.27 sec&lt;br /&gt;
Total real time: 5:42.37&lt;br /&gt;
&lt;br /&gt;
-&amp;gt; Using method 2, the time used is:&lt;br /&gt;
PROC REG: real time: 6:01.46; CPU time: 19:13.49&lt;br /&gt;
&lt;br /&gt;
The performance is summarized below:&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://4.bp.blogspot.com/-TVmXAeKSvAw/TrZiqEgpFnI/AAAAAAAAAak/fp7Z56hXkiI/s1600/summary.png" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="161" src="http://4.bp.blogspot.com/-TVmXAeKSvAw/TrZiqEgpFnI/AAAAAAAAAak/fp7Z56hXkiI/s640/summary.png" width="640" /&gt;&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-2415426124549242306?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/YlSd0NVVhqA" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/2415426124549242306/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=2415426124549242306" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2415426124549242306?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2415426124549242306?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/YlSd0NVVhqA/obtain-trace-of-projection-matrix-in.html" title="Obtain Trace of the Projection Matrix in a Linear Regression" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-ry9Wz4E6IIU/To5D1DErv3I/AAAAAAAAAaY/X2MPxQmkOzE/s72-c/trace.png" height="72" width="72" 
/><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/10/obtain-trace-of-projection-matrix-in.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DEcMQXYzeSp7ImA9WhdQGEw.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-2224946647232293217</id><published>2011-08-18T15:18:00.007-04:00</published><updated>2011-08-20T00:54:40.881-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-08-20T00:54:40.881-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC HPMIXED" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GENDMO" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC MIXED" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GLMSELECT" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC REG" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GLIMMIX" /><title>Benchmark Regression Procedures using OLS Regression</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/GvfFIbdx2k5Cff7dNKMWQeQhLIs/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/GvfFIbdx2k5Cff7dNKMWQeQhLIs/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/GvfFIbdx2k5Cff7dNKMWQeQhLIs/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/GvfFIbdx2k5Cff7dNKMWQeQhLIs/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Rick Wicklin discussed in his &lt;i&gt;&lt;b&gt;&lt;a href="http://blogs.sas.com/content/iml/2011/08/17/solving-linear-systems-which-technique-is-fastest/?utm_source=feedburner&amp;amp;utm_medium=feed&amp;amp;utm_campaign=Feed%3A+TheDoLoop+%28The+DO+Loop%29"&gt;blog&lt;/a&gt;&lt;/b&gt;&lt;/i&gt; the performance in solving a linear system using SOLVE() function and INV() function from IML. &lt;br /&gt;
&lt;br /&gt;
Since regression analysis is an integral part of SAS applications and there are many SAS procedures in SAS/STAT that are capable of conducting various regression analyses, it would be interesting to benchmark their relative performance using OLS regression, the most fundamental regression analysis of all.&lt;br /&gt;
&lt;br /&gt;
The analysis will compare REG, GLMSELECT, GENMOD, MIXED, GLIMMIX,&amp;nbsp;GLM, ORTHOREG, HPMIXED and TRANSREG on 10 OLS regressions with 100 to 1000 variables, incremental at 100, and with the number of observations twice the number of variables to avoid possible numerical issues. HPMIXED uses sparse matrix techniques and will be put into great disadvantage in this comparison using large dense matrices. A macro wraps them together:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

%macro wrap;
/* Benchmark driver.
   For i = 1..10 a random data set _TEMP with 100*i regressors x1-x(100*i),
   a response y and twice as many observations as regressors is generated,
   loaded into memory with SASFILE, and fitted once by every procedure in
   PROCNAMES and finally by TRANSREG. The SAS log, including the timing
   NOTEs of each procedure, is redirected to c:\testlog.txt.

   All work macro variables are declared %LOCAL so the macro cannot
   overwrite same-named variables of the calling environment. */
%local i j nobs nproc proc procnames;

proc printto log='c:\testlog.txt';run;

%let procnames=GLM REG GLMSELECT ORTHOREG MIXED GLIMMIX GENMOD ;
%let nproc=%sysfunc(countW(&amp;amp;procnames));
%put Test &amp;amp;nproc PROCEDURES;
%do i=1 %to 10;
    /* despite its name, NOBS is the number of regressors;
       the data set gets 2*NOBS observations */
    %let nobs=%eval(&amp;amp;i*100);

    /* keep the data-generation step out of the timing log */
    options nonotes;
    data _temp;
         array x{&amp;amp;nobs};
         do i=1 to 2*&amp;amp;nobs;
            do j=1 to &amp;amp;nobs;
               x[j]=rannor(0);
            end;
            y=rannor(0);
            drop i j;
            output;
         end;
    run;
    options notes;

    sasfile _temp load;
    ods select none;

    %do j=1 %to &amp;amp;nproc;
        %let proc=%scan(&amp;amp;procnames, &amp;amp;j);
        %put &amp;amp;proc;
        proc &amp;amp;proc data=_temp;
             model y = x1-x&amp;amp;nobs;
        run;
    %end;
    %put TRANSREG;
    proc transreg data=_temp;
         model identity(y) = identity(x1-x&amp;amp;nobs);
    run;
    sasfile _temp close;
    ods select all;
%end;

/* send the log back to its default destination */
proc printto; run;
%mend;
%wrap;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
After running all iterations, the SAS log is parsed to obtain procedure names and corresponding real time and CPU time. The following SAS code does this job:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

/* Parse the benchmark log into one row per timed procedure.
   Output variables:
     id        - iteration number (1, 2, ...)
     size      - problem size, id*100
     procedure - procedure name taken from the "NOTE: PROCEDURE xxx used" line
     realtime2 - real time in seconds
     cputime2  - CPU time in seconds
   Changes relative to the earlier version:
     * a new iteration starts whenever the first benchmarked procedure shows
       up again, whatever that procedure is (previously hard-wired to REG,
       which mis-numbered procedures that run before REG);
     * only genuine "NOTE: PROCEDURE" lines are treated as procedure
       headers, so %PUT text containing the word PROCEDURES is ignored;
     * times without a trailing "seconds" no longer lose their last
       character, and both m:ss and h:mm:ss are converted to seconds. */
data proc_compare;
     infile "c:\testlog.txt";
     input;
     length procedure first_proc $12 timetxt $24;
     retain procedure first_proc ' ' realtime2 .;
     keep id size procedure cputime2 realtime2;

     if index(_infile_, 'NOTE: PROCEDURE')&amp;gt;0 then do;
        procedure=scan(_infile_, 3);
        if procedure ne 'PRINTTO' then do;
           if first_proc=' ' then first_proc=procedure;
           /* sum statement: id starts at 0 and is retained */
           if procedure=first_proc then id+1;
        end;
     end;

     _pos=index(_infile_, 'real time');
     if _pos&amp;gt;0 then do;
        /* everything after the 9-character keyword, unit and blanks removed */
        timetxt=compress(tranwrd(substrn(_infile_, _pos+9), 'seconds', ' '));
        link to_seconds;
        realtime2=_secs;
     end;

     _pos=index(_infile_, 'cpu time');
     if _pos&amp;gt;0 then do;
        timetxt=compress(tranwrd(substrn(_infile_, _pos+8), 'seconds', ' '));
        link to_seconds;
        cputime2=_secs;
        size=id*100;
        /* the cpu time line closes a timing note: emit the record */
        if procedure ne 'PRINTTO' then output;
     end;
     return;

to_seconds:
     /* convert s, m:s or h:m:s (in TIMETXT) to seconds (in _SECS) */
     if missing(timetxt) then _secs=.;
     else do;
        _secs=0;
        do _k=1 to countw(timetxt, ':');
           _secs=_secs*60+input(scan(timetxt, _k, ':'), ?? best32.);
        end;
     end;
return;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
We then visualize the results using the following code:&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

/* One panel per procedure: CPU time against problem size. */
title "Benchmark Regression PROCs using OLS";
proc sgpanel data=proc_compare;
     panelby procedure / rows=2;
     series x=size y=cputime2 / lineattrs=(thickness=2);
     colaxis grid;
     rowaxis grid;
     label cputime2="CPU Time (sec)"
           size="Problem Size";
run;
title;

/* Overlay of the three fastest procedures in a single plot. */
title "Closer Look on REG vs. GLM vs. GLMSELECT";
proc sgplot data=proc_compare  uniform=group;
     where procedure in ("GLMSELECT", "REG", "GLM");
     series x=size y=cputime2 / group=procedure curvelabel lineattrs=(thickness=2);
     xaxis grid;
     yaxis grid;
     label cputime2="CPU Time (sec)"
           size="# of Variables";
run;
title;

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-17zeM19r-4A/Tk2x0XiHshI/AAAAAAAAAZ8/uNsS0dfQpXE/s1600/BENCHMARK1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://2.bp.blogspot.com/-17zeM19r-4A/Tk2x0XiHshI/AAAAAAAAAZ8/uNsS0dfQpXE/s640/BENCHMARK1.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-ctHhjAHbaxU/Tk2x1RXKJ3I/AAAAAAAAAaA/H5_kNtiYFRg/s1600/BENCHMARK2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://2.bp.blogspot.com/-ctHhjAHbaxU/Tk2x1RXKJ3I/AAAAAAAAAaA/H5_kNtiYFRg/s640/BENCHMARK2.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-5EitNToKyww/Tk2x1kMoZnI/AAAAAAAAAaE/u2WLZJg2sS4/s1600/REG+GLM+GLMSELECT.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="480" src="http://2.bp.blogspot.com/-5EitNToKyww/Tk2x1kMoZnI/AAAAAAAAAaE/u2WLZJg2sS4/s640/REG+GLM+GLMSELECT.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
It is found that PROC&amp;nbsp;GLM and GLMSELECT beat all other procedures with large margin while HPMIXED is the slowest followed by GLIMMIX. Surprisingly, REG is slower than both GLM and GLMSELECT even though it utilized multi-threading technique while GLMSELECT does not:&lt;br /&gt;
&lt;br /&gt;
************ Partial LOG of the last iteration ********&lt;br /&gt;
NOTE: PROCEDURE REG used (Total process time):&lt;br /&gt;
real time 6.79 seconds&lt;br /&gt;
cpu time 9.36 seconds&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
NOTE: There were 2000 observations read from the data set WORK._TEMP.&lt;br /&gt;
NOTE: PROCEDURE GLMSELECT used (Total process time):&lt;br /&gt;
real time 3.06 seconds&lt;br /&gt;
cpu time 2.96 seconds&lt;br /&gt;
********************************************************&lt;br /&gt;
&lt;br /&gt;
The performance gap between REG and GLM/GLMSELECT is getting larger when the number of variables increases to be more than 700.&lt;br /&gt;
&lt;br /&gt;
Both REG and GLMSELECT are developed by the same group of developers in SAS, as far as I know.&lt;br /&gt;
&lt;br /&gt;
********************* PS : ****************************&lt;br /&gt;
Rick and Charlie pointed out that real time is a fairer measure, with which I agree.&lt;br /&gt;
&lt;br /&gt;
The reading of real computing time has large variance from run to run because the testing environment is not very clean and there are many background window programs running. Below is part of the log file of another run with 2000 variables and 4000 records:&lt;br /&gt;
&lt;br /&gt;
NOTE: PROCEDURE REG used (Total process time):&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; real time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2.26 seconds&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cpu time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 7.76 seconds&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
NOTE: PROCEDURE GLM used (Total process time):&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; real time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3.57 seconds&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cpu time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 4.58 seconds&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
NOTE: There were 2000 observations read from the data set WORK._TEMP.&lt;br /&gt;
NOTE: PROCEDURE GLMSELECT used (Total process time):&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; real time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3.50 seconds&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cpu time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3.44 seconds&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;br /&gt;
We see that REG has lower real time comparing to GLM/GLMSELECT, even though cpu time is about twice the average of GLM/GLMSELECT. In a case where BY-processing is used, GLMSELECT will use multi-threading as specified in PERFORMANCE statement, and the gap in real time between REG and GLMSELECT will be eliminated. In a collaborating environment, more CPU time also means competing for more resources. Below we show the real time of the same run as in above CPU Time figure.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-lOa-UJMV2lM/Tk800aak2pI/AAAAAAAAAaI/YOIC6IIAOk4/s1600/realtime.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="590" src="http://3.bp.blogspot.com/-lOa-UJMV2lM/Tk800aak2pI/AAAAAAAAAaI/YOIC6IIAOk4/s640/realtime.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Note that the CPU time difference and pattern are pretty consistent. Below are the mean CPU time and its 90% C.I. over 100 runs using REG /GLM /GLMSELECT on problems of different sizes. &lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-zVctPASRRhs/Tk81OSz8NvI/AAAAAAAAAaM/6Q9y9XA346o/s1600/Iteration.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="480" src="http://1.bp.blogspot.com/-zVctPASRRhs/Tk81OSz8NvI/AAAAAAAAAaM/6Q9y9XA346o/s640/Iteration.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-2224946647232293217?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/PSNCKHRnfKU" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/2224946647232293217/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=2224946647232293217" title="5 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2224946647232293217?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2224946647232293217?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/PSNCKHRnfKU/benchmark-regression-procedures-using.html" title="Benchmark Regression Procedures using OLS Regression" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-17zeM19r-4A/Tk2x0XiHshI/AAAAAAAAAZ8/uNsS0dfQpXE/s72-c/BENCHMARK1.png" height="72" width="72" /><thr:total>5</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/08/benchmark-regression-procedures-using.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CE8MQ3g6eyp7ImA9WhRREU0.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-2743524723880802672</id><published>2011-08-10T07:38:00.001-04:00</published><updated>2011-11-23T21:48:02.613-05:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2011-11-23T21:48:02.613-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC EXPAND" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC REG" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Manipulation" /><title>Rolling Window Regression of Time Series</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/EacvIvV0xfT2zOBl8agZ6EipEr4/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/EacvIvV0xfT2zOBl8agZ6EipEr4/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/EacvIvV0xfT2zOBl8agZ6EipEr4/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/EacvIvV0xfT2zOBl8agZ6EipEr4/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;More often than not, we encounter a problem where an OLS over a rolling time window is required, see [1], [2], [3], [4], [5], [6], [7], for a few examples. &lt;br /&gt;
&lt;br /&gt;
One solution is to resort to SAS MACRO, but it is extremely inefficient and can't handle large dataset in reality, [8]. This method is shown below as Method[4]. It couldn't finish the test in allowable time using the sample data below. &lt;br /&gt;
&lt;br /&gt;
The other common solution is&amp;nbsp;to use the BY-processing capability of PROC REG after re-shaping the data into appropriate format, see [9], [10]. This method is demonstrated as Method[3] below. While certainly much better than above one, it is still not the fastest and requires more memory.&lt;br /&gt;
&lt;br /&gt;
The third solution comes to play by recognizing that in OLS, what you need is the SSCP and you can easily build up the SSCP over rolling time window by resorting to PROC EXPAND. This is demonstrated as Method[2] below. This approach will further improve the speed but still requires large amount of memory if the data is big and many rolling windows are generated. &lt;br /&gt;
&lt;br /&gt;
Since what we need to do is to build the SSCP matrix and obtain the coefficient estimates based on the information in the SSCP, we can certainly code this in a DATA Step using the ADJUST operator, which provides a solution that is both fast and low in memory occupancy. See [11] for an introduction to the ADJUST operator. To make this even faster, a modification of the ADJUST operator, the SWEEP operator, can be used. For an introduction to the SWEEP operator, see [11], [12]. In the code below, Method[0] implements the ADJUST operator, while Method[1] implements the SWEEP operator. &lt;br /&gt;
&lt;br /&gt;
The experiment results are shown below: &lt;br /&gt;
&lt;br /&gt;
&lt;span style="background-color: #999999;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;span style="color: #666666;"&gt;&lt;b&gt;&lt;span style="color: black;"&gt;&amp;nbsp;&amp;nbsp; Real Time&amp;nbsp;&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; CPU Time &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; | Memory&lt;/span&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/b&gt;&lt;/span&gt;&lt;/span&gt;&lt;br /&gt;
=====================================================&lt;br /&gt;
&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Method 0 |&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.01 (seconds)&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.01 (seconds)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp; 611K &lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Method 1 |&amp;nbsp;&amp;nbsp; &amp;nbsp;0.25 (seconds)&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.24 (seconds)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp; 432K &lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Method 2 |&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;1.61 (seconds)&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.94 (seconds)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; | 50381K &lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Method 3 |&amp;nbsp; 80.54 (seconds)&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp; 79.61 (seconds)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |&amp;nbsp; 2322K &lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white; color: red;"&gt;Method 4 |&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Failed&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Failed&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |&amp;nbsp;&amp;nbsp;&amp;nbsp; Failed &lt;/span&gt;&lt;br /&gt;
=====================================================&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Reference: &lt;/b&gt;&lt;br /&gt;
[1]MYSAS.NET, &lt;a href="http://www.mysas.net/forum/viewtopic.php?f=4&amp;amp;t=8070"&gt;http://www.mysas.net/forum/viewtopic.php?f=4&amp;amp;t=8070&lt;/a&gt; &lt;br /&gt;
[2]MYSAS.NET, &lt;a href="http://www.mysas.net/forum/viewtopic.php?f=4&amp;amp;t=7898"&gt;http://www.mysas.net/forum/viewtopic.php?f=4&amp;amp;t=7898&lt;/a&gt; &lt;br /&gt;
[3]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0604D&amp;amp;L=sas-l&amp;amp;P=R32485"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0604D&amp;amp;L=sas-l&amp;amp;P=R32485&lt;/a&gt; &lt;br /&gt;
[4]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0704C&amp;amp;L=sas-l&amp;amp;P=R3305"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0704C&amp;amp;L=sas-l&amp;amp;P=R3305&lt;/a&gt; &lt;br /&gt;
[5]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0802C&amp;amp;L=sas-l&amp;amp;P=R9746"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0802C&amp;amp;L=sas-l&amp;amp;P=R9746&lt;/a&gt; &lt;br /&gt;
[6]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0801C&amp;amp;L=sas-l&amp;amp;P=R14671"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0801C&amp;amp;L=sas-l&amp;amp;P=R14671&lt;/a&gt; &lt;br /&gt;
[7]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0810A&amp;amp;L=sas-l&amp;amp;P=R19135"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0810A&amp;amp;L=sas-l&amp;amp;P=R19135&lt;/a&gt; &lt;br /&gt;
[8]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0802C&amp;amp;L=sas-l&amp;amp;P=R13489"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0802C&amp;amp;L=sas-l&amp;amp;P=R13489&lt;/a&gt; &lt;br /&gt;
[9]Michael D Boldin, "Programming Rolling Regressions in SAS", Proceedings of NESUG, 2007 &lt;br /&gt;
[10]SAS-L, &lt;a href="http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0604D&amp;amp;L=sas-l&amp;amp;D=0&amp;amp;P=56926"&gt;http://www.listserv.uga.edu/cgi-bin/wa?A2=ind0604D&amp;amp;L=sas-l&amp;amp;D=0&amp;amp;P=56926&lt;/a&gt; &lt;br /&gt;
[11]J. H. Goodnight, "The Sweep Operator: Its Importance in Statistical Computing", SAS Tech Report R-106, 1978 &lt;br /&gt;
[12]Kenneth Lange, "Numerical Analysis for Statisticians", Springer, 1998 &lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* CAUTION: KILL deletes every member of the WORK library. */
proc datasets library=work kill; run;


/* FULLSTIMER makes the log report extended resource usage for each step. */
options fullstimer;
/* Simulated series: 500000 sequential observations of
   y = 1.5 + 0.5*x1 + epsilon, with epsilon = 0.5 * a RANNOR draw.
   The statement for a second regressor x2 is commented out. */
data test;
     do seq=1 to 500000;
          x1=rannor(9347957);
          *x2=rannor(876769)+0.1*x1;
          epsilon=rannor(938647)*0.5;
          y = 1.5 + 0.5*x1 +epsilon;
          output;
     end;
run;

/* Method 0.*/
/* Rolling 20-observation OLS of y on x1 done in a single DATA step.
   _x       : running 3x3 crossproduct matrix of (1, x1, y) for the window
   _tempval : ring buffer holding, for each of the last 20 observations,
              x1, y, x1**2, x1*y and y**2
   _a       : working copy of _x that is reduced by the ADJUST routine;
              afterwards _a[1,3] is the intercept and _a[2,3] the slope.
   The observation that leaves the window (obs _n_-20) occupies ring slot m,
   the very slot the new observation is written to, so its contribution is
   deducted BEFORE the slot is overwritten. FILLER runs after the deduction
   and re-mirrors the lower triangle, so the matrix handed to ADJUST is
   always symmetric and consistent. */
sasfile test load;
data res0;
     set test;
     array _x{3,3} _temporary_ ;
     array _a{3,3} _temporary_ ;
     array _tempval{5, 20} _temporary_ ;

     m=mod(_n_-1, 20)+1;

     /* slot m still holds observation _n_-20: remove it from the sums */
     if _n_&amp;gt;20 then link deduct;

     _tempval[1, m]=x1;
     _tempval[2, m]=y;
     _tempval[3, m]=x1**2;
     _tempval[4, m]=x1*y;
     _tempval[5, m]=y**2;

     link filler;

     if _n_&amp;gt;=20 then do;
        do i=1 to dim(_a, 1);
           do j=1 to dim(_a, 2);
              _a[i, j]=_x[i, j];
           end;
        end;

        /* pivot on the first two diagonal elements only;
           the last column then holds the solution */
        do k=1 to dim(_a, 1)-1;
           link adjust;
        end;
        Intercept=_a[1,3]; beta=_a[2,3];
        keep seq   intercept  beta;
        output;
     end;

     return;

deduct:
     _x[1,2]+(-_tempval[1, m]);
     _x[1,3]+(-_tempval[2, m]);
     _x[2,2]+(-_tempval[3, m]);
     _x[2,3]+(-_tempval[4, m]);
     _x[3,3]+(-_tempval[5, m]);
return;

filler:
     /* add the current observation to the upper triangle ... */
     _x[1,1]=20; _x[1,2]+x1; _x[1,3]+y;
     _x[2,2]+_tempval[3,m];  _x[2,3]+_tempval[4,m]; _x[3,3]+_tempval[5,m];
     /* ... and mirror it into the lower triangle */
     _x[2,1]=_x[1,2]; _x[3,1]=_x[1,3]; _x[3,2]=_x[2,3];
return;

adjust:
     /* scale pivot row k, then eliminate column k from every other row */
     B=_a[k, k];
     do j=1 to dim(_a, 2);
        _a[k, j]=_a[k, j]/B;
     end;
     do i=1 to dim(_a, 1);
        if i ^=k then do;
           B=_a[i, k];
           do j=1 to dim(_a, 2);
              _a[i, j]=_a[i, j]-B*_a[k, j];
           end;
        end;
     end;
return;

run;
sasfile test close;



/* Method 1.*/
/* Rolling 20-observation simple regression of y on x1 from running sums.
   _x[1..4] : window sums of x1, x1*y, x1**2 and y
   _a       : ring buffer with x1 (row 1) and y (row 2) of the last 20 obs
   The observation leaving the window (obs _n_-20) sits in ring slot m, the
   slot the current observation is about to overwrite, so it is deducted
   from the sums BEFORE the new values are stored. */

sasfile test load;
data rest0;
     set test;
     array _x{4} _temporary_;
     array _a{2,20}  _temporary_;

     m=mod(_n_-1, 20)+1;
     if _n_&amp;gt;20 then link deduct;

     _a[1, m]=x1; _a[2,m]=y;
     link filler;

     if _n_&amp;gt;=20 then do;
        /* closed-form slope and intercept for a window of 20 */
        beta=(_x[2]-_x[1]*_x[4]/20)/(_x[3]-_x[1]**2/20);
        intercept=_x[4]/20 - beta*_x[1]/20;
        keep  seq   intercept  beta  ;
        output;
     end;
     return;

filler:
     /* add the current observation to the running sums */
     _x[1]+x1;
     _x[2]+x1*y;
     _x[3]+x1**2;
     _x[4]+y;
return;

deduct:
     /* remove the observation currently stored in slot m */
     _x[1]=_x[1]-_a[1,m];
     _x[2]=_x[2]-_a[1,m]*_a[2,m];
     _x[3]=_x[3]-_a[1,m]**2;
     _x[4]=_x[4]-_a[2,m];
return;
run;
sasfile test close;



/* Method 2.*/
/* Rolling regression through PROC EXPAND: moving sums of x1, y, x1*y and
   x1**2 plus moving averages of x1 and y are computed over a window of
   WINDOW observations, and slope/intercept are derived from them in a
   final DATA step. */

%macro wrap;
%let window=20;
/* DIFF equals WINDOW (window minus zero); it is the span passed to the
   moving-window transformations below */
%let diff=%eval(&amp;amp;window-0);
/* view adding the crossproduct x1*y without materializing a copy of TEST */
data testv/view=testv;
     set test;
       xy=x1*y;  
run;

/* METHOD=NONE: no interpolation, only the TRANSFORMOUT= operations run */
proc expand data=testv  method=none  out=summary(keep=seq sumxy  sumx1  sumy  ussx1  may  max);
       convert  x1=sumx1/transformout=(movsum &amp;amp;diff);
       convert  xy=sumxy/transformout=(movsum &amp;amp;diff);
       convert  x1=ussx1/transformout=(movuss &amp;amp;diff);
       convert  y =sumy /transformout=(movsum &amp;amp;diff);
       convert  y =may / transformout=(movave &amp;amp;diff);
       convert  x1 =max / transformout=(movave &amp;amp;diff);  
run;

/* FIRSTOBS= skips the leading observations before the first full window */
data result1;
     set summary(firstobs=&amp;amp;window);
       beta = (sumxy - sumx1*sumy/&amp;amp;window)/(ussx1 - sumx1/&amp;amp;window.*sumx1);  
       alpha= may - beta*max;
       keep seq  beta  alpha; 
run;
%mend;

/* Time the whole macro: record start and end timestamps and print them. */
%let t0=%sysfunc(datetime(), datetime24.);
*options nosource nonotes;
%wrap;
options source notes;
%let t1=%sysfunc(datetime(), datetime24.);
%put Start @ &amp;amp;t0;
%put End   @ &amp;amp;t1;

 

/* Method 3.*/
%let t0=%sysfunc(datetime(), datetime.);

/* Method 3 input view: from observation 20 onwards, every incoming row
   emits the 20 rows currently held in the ring buffer, all tagged with
   the seq of the incoming row, so PROC REG can fit one model per seq. */
data test2v/view=test2v;
  set test;
  array _win{2, 20} _temporary_ (20*0 20*0);
  /* slot of the ring buffer that the current row replaces */
  pos=mod(_n_-1, 20)+1;
  _win[1, pos]=x1;
  _win[2, pos]=y;
  if _n_ ge 20 then do col=1 to dim(_win, 2);
    x=_win[1, col];
    y=_win[2, col];
    output;
  end;
  keep seq x y;
run;

/* one regression per window; printed output is switched off */
ods select none;
proc reg data=test2v outest=res2(keep=seq x intercept);
  by seq;
  model y = x;
run;
quit;
ods select all;

%let t1=%sysfunc(datetime(), datetime.);
%put Start @ &amp;amp;t0;
%put End   @ &amp;amp;t1;


/* Method 4. */
%macro wrap;
/* Method 4 (brute force): run PROC REG once per 20-row window by slicing
   data set test with firstobs/obs, and stack the estimates in res3. */
options nonotes;
ods select none;
%do last=20 %to 500000;
  %let first=%eval(&amp;amp;last-19);
  proc reg data=test(firstobs=&amp;amp;first obs=&amp;amp;last) outest=_xres(keep=x1 intercept);
    model y = x1;
  run;
  quit;
  %if &amp;amp;last ne 20 %then %do;
    /* later windows are appended to the accumulated estimates */
    proc append base=res3 data=_xres;
    run;
  %end;
  %else %do;
    /* the first window (re)creates res3 */
    data res3;
      set _xres;
    run;
  %end;
%end;

ods select all;
/* label each estimate with the row number that closes its window */
data res3;
  set res3;
  time=19+_n_;
run;
options notes;
%mend;

%let t0=%sysfunc(datetime(), datetime.);
%wrap;
%let t1=%sysfunc(datetime(), datetime.);
%put Start @ &amp;amp;t0;
%put End   @ &amp;amp;t1;
&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-2743524723880802672?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/U17omOEqbEs" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/2743524723880802672/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=2743524723880802672" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2743524723880802672?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2743524723880802672?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/U17omOEqbEs/rolling-analysis-of-time-series.html" title="Rolling Window Regression of Time Series" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>1</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/08/rolling-analysis-of-time-series.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkUMSHs-eCp7ImA9WhdUEEo.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-660326104865544458</id><published>2011-06-21T00:25:00.005-04:00</published><updated>2011-09-26T16:51:29.550-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-09-26T16:51:29.550-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="quantile computing" /><category scheme="http://www.blogger.com/atom/ns#" 
term="PROC STDIZE" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC MEANS" /><title>Numerical variables profiling in very large data set</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/A2IRkQa8rLx80wyBtpUpQ87OeYA/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/A2IRkQa8rLx80wyBtpUpQ87OeYA/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/A2IRkQa8rLx80wyBtpUpQ87OeYA/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/A2IRkQa8rLx80wyBtpUpQ87OeYA/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Profiling numerical variables is an integral part of data analytics, which generally consists of obtaining standard descriptive statistics such as quantiles, first central moments as well as missing ratio.&lt;br /&gt;
&lt;br /&gt;
It is easily obtainable by using PROC MEANS (or PROC SUMMARY). But when we face very large data with many variables, we will hit a memory wall and very long processing times using the default options in PROC MEANS. The most time-consuming and memory-intensive descriptive statistic is the quantile calculation. The default method uses order statistics, which is the most accurate but also the most memory-intensive and time-consuming one. &lt;br /&gt;
&lt;br /&gt;
There are two methods available to handle this scenario, and both use the one-pass method of Jain R. and Chlamtac I. [1]. This method is able to obtain fairly accurate estimates of quantiles between P25 and P75; for quantiles outside this range the estimates are rougher, although given a very large sample they may be acceptable. Another problem is that this estimator is sensitive to the distribution of the underlying data [2]. There are newer methods available that are insensitive to the distribution, such as [3].&lt;br /&gt;
&lt;br /&gt;
In PROC MEANS, we can specify QMETHOD=P2 or QMETHOD=HIST to call this one-pass method. Or we can use PROC STDIZE, specifying PCTLMTD=ONEPASS to invoke the P2 method to calculate quantiles. Each has its pros and cons.&lt;br /&gt;
&lt;br /&gt;
PROC MEANS:&lt;br /&gt;
PRO: Multithreaded / rich set of descriptive statistics;&lt;br /&gt;
CON: Only pre-defined quantiles, output data set is not user friendly, in need of further manipulation;&lt;br /&gt;
&lt;br /&gt;
PROC STDIZE:&lt;br /&gt;
PRO: quantiles of 0 to 100 are available, statistics output data is in a user friendly format;&lt;br /&gt;
CON: No multithreading, lack of higher central moments statistics, need to suppress data output explicitly;&lt;br /&gt;
&lt;br /&gt;
Of course, due to the way the output statistics are organized from PROC STDIZE OUTSTAT=, it can be parallelized very easily, and by examining the actual CPU time, PROC STDIZE is more efficient than PROC MEANS. Besides, higher central moments can be calculated from the obtained first 2 central moments, and this is done in the small output data set.&lt;br /&gt;
&lt;br /&gt;
Below the difference between default method and the two new approaches is illustrated.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
options fullstimer;

/* Benchmark input: 1e5 rows holding id plus x1-x100, each value a
   RANNOR draw scaled by 1.2 (seed argument 0). */
data test;
  length id 8;
  array x{100};
  drop col;
  do id=1 to 1e5;
    do col=1 to dim(x);
      x[col]=rannor(0)*1.2;
    end;
    output;
  end;
run;

/* Baseline: PROC MEANS with QMETHOD=OS */
proc means data=test noprint q1 qmethod=os;
  var x1-x100;
  output out=_mean
         mean=mean1-mean100
         std=std_x1-std_x100
         q1=q1_x1-q1_x100
         p95=p95_x1-p95_x100;
run;

/* Same statistics with QMETHOD=P2 and 211 markers */
proc means data=test noprint q1 qmethod=p2 qmarkers=211;
  var x1-x100;
  output out=_mean
         mean=mean1-mean100
         std=std_x1-std_x100
         q1=q1_x1-q1_x100
         p95=p95_x1-p95_x100;
run;

/* PROC STDIZE with PCTLMTD=ONEPASS: out=_null_ discards the standardized
   data, only the OUTSTAT= statistics are kept */
ods select none;
proc stdize data=test
            out=_null_
            outstat=_stat
            pctlmtd=onepass
            nmarkers=211
            pctlpts=1 5  10  25 50  75  90  95   99;
  var x1-x100;
run;
ods select all;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
Take a look at the log:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-W-0RZ6RyMMQ/TgAc53aK0YI/AAAAAAAAAZI/iFpmcklpNhg/s1600/P2-quantiles.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://3.bp.blogspot.com/-W-0RZ6RyMMQ/TgAc53aK0YI/AAAAAAAAAZI/iFpmcklpNhg/s640/P2-quantiles.png" width="350" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Due to the smaller sample size, the biggest gain is on the memory side. The default approach of PROC MEANS used 643MB of memory, while with QMETHOD=P2 the memory usage was reduced to only 7.3MB. The most memory-efficient is PROC STDIZE with PCTLMTD=ONEPASS, where only 0.68MB of memory was used. &lt;br /&gt;
&lt;br /&gt;
We also examine the difference on quantile estimates using the Ordered Statistics (OS) method and P2 method in PROC MEANS. There are observable differences but not that significant:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-taEtFe3DPMU/TgAfPK-i1EI/AAAAAAAAAZM/S2EB1-DhJT4/s1600/P2-quantiles_diff.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://4.bp.blogspot.com/-taEtFe3DPMU/TgAfPK-i1EI/AAAAAAAAAZM/S2EB1-DhJT4/s640/P2-quantiles_diff.png" width="300" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
Reference:&lt;br /&gt;
[1]. Jain R. and Chlamtac I. (1985), "The Algorithm for Dynamic Calculation of Quantiles and Histograms Without Storing Observations," Communications of the ACM, 28(10), 1076–1085. &lt;br /&gt;
[2]. SAS/STAT(R) 9.2 User's Guide, Second Edition&lt;br /&gt;
[3]. Alsabti, Khaled; Ranka, Sanjay; and Singh, Vineet (1997), "A One-Pass Algorithm for Accurately Estimating Quantiles for Disk-Resident Data". L.C. Smith College of Engineering and Computer Science - Former Departments, Centers, Institutes and Projects. Paper 4. &lt;a href="http://surface.syr.edu/lcsmith_other/4"&gt;http://surface.syr.edu/lcsmith_other/4&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-660326104865544458?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/dfZtXEEs4Vg" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/660326104865544458/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=660326104865544458" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/660326104865544458?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/660326104865544458?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/dfZtXEEs4Vg/numerical-variables-profiling-in-very.html" title="Numerical variables profiling in very large data set" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-W-0RZ6RyMMQ/TgAc53aK0YI/AAAAAAAAAZI/iFpmcklpNhg/s72-c/P2-quantiles.png" height="72" width="72" 
/><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/06/numerical-variables-profiling-in-very.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0AAQXk9fSp7ImA9WhdSGUs.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-8354118659515520074</id><published>2011-06-07T09:25:00.027-04:00</published><updated>2011-07-29T13:49:00.765-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-07-29T13:49:00.765-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC HPMIXED" /><category scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GLIMMIX" /><category scheme="http://www.blogger.com/atom/ns#" term="HPGLIMMIX" /><title>%HPGLIMMIX macro on large scale HMM</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/e3aJ0Fqh7R-yADIwvvFUBfHqBiQ/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/e3aJ0Fqh7R-yADIwvvFUBfHqBiQ/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/e3aJ0Fqh7R-yADIwvvFUBfHqBiQ/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/e3aJ0Fqh7R-yADIwvvFUBfHqBiQ/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;div align="left" class="MsoNormal" style="mso-pagination: widow-orphan; text-align: left;"&gt;PROC GLIMMIX is good tool for generalized linear mixed model (GLMM), when the scale is small to medium. When facing a large scale GLMM, such as modeling all ZIPs nested in Counties nested in all 51 States in US, a 64-bit machine with extremely large memory is required and the computing may last for months! In a strictly nested hierarchical model, the variance covariance matrix is very sparse, and taking advantage of this property can accelerate computing by many folds. &lt;br /&gt;
&lt;br /&gt;
The %HPGLIMMIX SAS macro is made for large scale Hierarchical Mixed Models. As an example, a sample data using Gamma Regression is shown below, with all ZIPs in AK, AL, AR, AZ with 2-level hierarchies: State and ZIP within State, a total of 4 blocks with at most 693 columns per block. The reason not all ZIPs and all states are used is simply that PROC GLIMMIX blows up on the machine. &lt;br /&gt;
&lt;br /&gt;
Comparing the estimates and std errors from both runs, they are the same, but the running times are drastically different: 71sec using %HPGLIMMIX vs. 35min39sec using GLIMMIX. &lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

1715
1716  options nomprint nomlogic;
1717  %hpglimmix(data=temp2,
1718             stmts=%str(
1719                    class zip  zip_state;
1720                    model y = x ;
1721                    random int zip/subject=zip_state;
1722                      ),
1723             error=gamma,
1724             link=LOG,
1725             options=NOTEST);
NOTEST

       The HPGLIMMIX Macro

Data Set           : WORK.TEMP2
Error Distribution : GAMMA
Link Function      : LOG
Response Variable  : Y


Job Starts at : 06JUN2011:15:51:19
    HPGLIMMIX Iteration History

Iteration    Convergence criterion
    1            0.0081058432  13 sec
    2            0.0004213646  13 sec
    3            2.7137935E-7  13 sec
    4            3.1854799E-9  12 sec

Output from final Proc HPMixed run:
Job Ends at : 06JUN2011:15:52:30
1726  options nomprint nomlogic;
1727
1728  proc glimmix data=temp2;
1729       class zip  zip_state;
1730       model y = x /s  dist=gamma;
1731       random int zip /subject=zip_state;
1732  run;



NOTE: Convergence criterion (PCONV=1.11022E-8) satisfied.
NOTE: PROCEDURE GLIMMIX used (Total process time):
      real time           35:38.93
      cpu time            34:30.90

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
GLIMMIX output:&lt;/div&gt;&lt;div align="left" class="MsoNormal" style="mso-pagination: widow-orphan; text-align: left;"&gt;&lt;span lang="EN-US" style="font-family: 宋体; font-size: 12pt;"&gt;&lt;/span&gt;&lt;/div&gt;&lt;div style="background: #ebebeb; border-bottom: #999999 1pt dashed; border-left: #999999 1pt dashed; border-right: #999999 1pt dashed; border-top: #999999 1pt dashed; mso-border-alt: dashed #999999 .5pt; mso-element: para-border-div; padding-bottom: 3pt; padding-left: 3pt; padding-right: 3pt; padding-top: 3pt;"&gt;&lt;div align="left" class="MsoNormal" style="background: #ebebeb; border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none; mso-border-alt: dashed #999999 .5pt; mso-line-height-alt: 8.75pt; mso-padding-alt: 3.0pt 3.0pt 3.0pt 3.0pt; mso-pagination: widow-orphan; padding-bottom: 0cm; padding-left: 0cm; padding-right: 0cm; padding-top: 0cm; tab-stops: 45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt; text-align: left;"&gt;&lt;span lang="EN-US" style="color: #000001; font-family: 宋体; font-size: 12pt;"&gt;&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Covariance Parameter Estimates&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Standard&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cov Parm&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Subject&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Estimate&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Error&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Intercept&amp;nbsp;&amp;nbsp;&amp;nbsp; zip_state&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000152&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000125&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; zip&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; zip_state&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000066&amp;nbsp;&amp;nbsp;&amp;nbsp; 3.105E-6&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Residual&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000405&amp;nbsp;&amp;nbsp;&amp;nbsp; 2.281E-6&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Solutions for Fixed Effects&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Standard&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Effect&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Estimate&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Error&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;DF&amp;nbsp;&amp;nbsp;&amp;nbsp; t Value&amp;nbsp;&amp;nbsp;&amp;nbsp; Pr &amp;gt; |t|&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp; Intercept&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 6.5873&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.006180&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3&amp;nbsp;&amp;nbsp;&amp;nbsp; 1065.95&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;lt;.0001&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp; x&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.003436&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000218&amp;nbsp;&amp;nbsp;&amp;nbsp; 62634&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 15.79&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;lt;.0001&lt;/span&gt;&lt;span lang="EN-US" style="color: #000001; font-family: &amp;quot;Lucida Console&amp;quot;; font-size: 7.5pt;"&gt;&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div align="left" class="MsoNormal" style="mso-pagination: widow-orphan; text-align: left;"&gt;&lt;span lang="EN-US" style="font-family: 宋体; font-size: 12pt;"&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
%HPGLIMMIX output:&lt;/span&gt;&lt;/div&gt;&lt;div style="background: #ebebeb; border-bottom: #999999 1pt dashed; border-left: #999999 1pt dashed; border-right: #999999 1pt dashed; border-top: #999999 1pt dashed; mso-border-alt: dashed #999999 .5pt; mso-element: para-border-div; padding-bottom: 3pt; padding-left: 3pt; padding-right: 3pt; padding-top: 3pt;"&gt;&lt;div align="left" class="MsoNormal" style="background: #ebebeb; border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none; mso-border-alt: dashed #999999 .5pt; mso-line-height-alt: 8.75pt; mso-padding-alt: 3.0pt 3.0pt 3.0pt 3.0pt; mso-pagination: widow-orphan; padding-bottom: 0cm; padding-left: 0cm; padding-right: 0cm; padding-top: 0cm; tab-stops: 45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt; text-align: left;"&gt;&lt;span lang="EN-US" style="color: #000001; font-family: 宋体; font-size: 12pt;"&gt;&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; _cov data from&amp;nbsp; HPGLIMMIX&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Obs&amp;nbsp;&amp;nbsp;&amp;nbsp; CovParm&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Subject&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Estimate&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 1&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Intercept&amp;nbsp;&amp;nbsp;&amp;nbsp; zip_state&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000152&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; zip&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; zip_state&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000066&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Residual&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000405&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; _soln data from HPGLIMMIX&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Obs&amp;nbsp;&amp;nbsp;&amp;nbsp; Effect&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Estimate&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; StdErr&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 1&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Intercept&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 6.5873&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.006180&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; x&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.003436&amp;nbsp;&amp;nbsp;&amp;nbsp; 0.000218&lt;/span&gt;&lt;span lang="EN-US" style="color: #000001; font-family: &amp;quot;Lucida Console&amp;quot;; font-size: 7.5pt;"&gt;&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div align="left" class="MsoNormal" style="mso-pagination: widow-orphan; text-align: left;"&gt;&lt;span lang="EN-US" style="font-family: 宋体; font-size: 12pt;"&gt;&lt;img height="1" src="file:///C:/DOCUME~1/MikeX/LOCALS~1/Temp/msohtml1/01/clip_image001.gif" width="1" /&gt;&lt;/span&gt;&lt;/div&gt;&lt;div class="MsoNormal"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-8354118659515520074?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/jnm4OPM160s" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/8354118659515520074/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=8354118659515520074" title="4 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/8354118659515520074?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/8354118659515520074?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/jnm4OPM160s/hpglimmix-sas-macro.html" title="%HPGLIMMIX macro on large scale HMM" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>4</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/06/hpglimmix-sas-macro.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DEYHSHs9eyp7ImA9WhZWGU4.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-7500982122418019594</id><published>2011-04-10T18:57:00.008-04:00</published><updated>2011-05-20T20:02:19.563-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-05-20T20:02:19.563-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC DISCRIM" /><category scheme="http://www.blogger.com/atom/ns#" term="Moore-Penrose pseudoinverse" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Manipulation" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><title>Regularized Discriminant Analysis</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/E5E-0VE-5m-ZNtc8CxACYa5G4oE/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/E5E-0VE-5m-ZNtc8CxACYa5G4oE/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/E5E-0VE-5m-ZNtc8CxACYa5G4oE/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/E5E-0VE-5m-ZNtc8CxACYa5G4oE/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Demo SAS implementation of Regularized (Linear) Discriminant Analysis of J. Friedman (1989)[1]. A simpler introduction can be found at [2]. Regularized QDA follows similarly.&lt;br /&gt;
&lt;br /&gt;
To save coding, I called R within SAS to finish the computation. For details on how to call R within SAS, check &lt;a href="http://www.sas-programming.com/2010/04/conduct-r-analysis-within-sas.html"&gt;here&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-C0I93CbVbGw/TaI13zRVwlI/AAAAAAAAAYU/UnUOkw2DC48/s1600/RDA.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="488" src="http://4.bp.blogspot.com/-C0I93CbVbGw/TaI13zRVwlI/AAAAAAAAAYU/UnUOkw2DC48/s640/RDA.png" width="640" /&gt;&lt;/a&gt;&lt;a href="http://4.bp.blogspot.com/-JSvECabBASU/TdcA0Wvql6I/AAAAAAAAAY4/9COUM1J8WTY/s1600/RDA2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="488" src="http://4.bp.blogspot.com/-JSvECabBASU/TdcA0Wvql6I/AAAAAAAAAY4/9COUM1J8WTY/s640/RDA2.png" style="cursor: move;" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;


&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
Reference:&lt;br /&gt;
1. Friedman, J. (1989). Regularized discriminant analysis, Journal of the American Statistical Association 84: 165-175.&lt;br /&gt;
2. Friedman, J; Hastie, T; Tibshirani, R (2008). The Elements of Statistical Learning, section 4.3.1&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.amazon.com/Elements-Statistical-Learning-Prediction-Statistics/dp/0387848576?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition (Springer Series in Statistics)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0387848576&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0387848576" style="border: none !important; margin: 0px !important; padding: 0px !important;" width="1" /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-7500982122418019594?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/H5AE7AesYwc" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/7500982122418019594/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=7500982122418019594" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/7500982122418019594?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/7500982122418019594?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/H5AE7AesYwc/regularized-discriminant-analysis.html" title="Regularized Discriminant Analysis" /><author><name>Liang  
Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-C0I93CbVbGw/TaI13zRVwlI/AAAAAAAAAYU/UnUOkw2DC48/s72-c/RDA.png" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/04/regularized-discriminant-analysis.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0AFQ384cCp7ImA9WhdQEUs.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-194600123265301116</id><published>2011-04-01T09:11:00.009-04:00</published><updated>2011-08-12T11:08:32.138-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-08-12T11:08:32.138-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC APPEND" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GLMSELECT" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC STANDARD" /><title>ElasticNet in SAS</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/A6ZF4RrzKJAbgmxZRmUg3U65POA/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/A6ZF4RrzKJAbgmxZRmUg3U65POA/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/A6ZF4RrzKJAbgmxZRmUg3U65POA/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/A6ZF4RrzKJAbgmxZRmUg3U65POA/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-15408I97SUo/TaIv9lY76tI/AAAAAAAAAYQ/p18MWxnHdYI/s1600/ELASTIC-NET-IN-SAS.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="483" src="http://3.bp.blogspot.com/-15408I97SUo/TaIv9lY76tI/AAAAAAAAAYQ/p18MWxnHdYI/s640/ELASTIC-NET-IN-SAS.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Try out Elastic Net [1] in normal linear regression, using Naive algorithm. Exploring possibilities for GLM Elastic Net in SAS.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;pre style="background-color: #ebebeb; border-bottom: rgb(153,153,153) 1px dashed; border-left: rgb(153,153,153) 1px dashed; border-right: rgb(153,153,153) 1px dashed; border-top: rgb(153,153,153) 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;


&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
1. Zou, H and Hastie, T (2005). Regularization and variable Selection via the Elastic Net, Journal Of The Royal Statistical Society Series B.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-194600123265301116?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/DKF3DAsCHOs" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/194600123265301116/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=194600123265301116" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/194600123265301116?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/194600123265301116?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/DKF3DAsCHOs/elasticnet-in-sas.html" title="ElasticNet in SAS" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-15408I97SUo/TaIv9lY76tI/AAAAAAAAAYQ/p18MWxnHdYI/s72-c/ELASTIC-NET-IN-SAS.png" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/04/elasticnet-in-sas.html</feedburner:origLink></entry><entry 
gd:etag="W/&quot;CkYFRn4zfSp7ImA9Wx9UE0U.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-4705248398300447375</id><published>2011-02-01T10:47:00.038-05:00</published><updated>2011-02-10T18:21:57.085-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-02-10T18:21:57.085-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC GPLOT" /><category scheme="http://www.blogger.com/atom/ns#" term="Bayesian" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Manipulation" /><category scheme="http://www.blogger.com/atom/ns#" term="GRAPH" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC UNIVARIATE" /><title>Bayesian Computation with SAS (2)</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/RdpNKJImQbgBVRwLayJpey0YjYM/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/RdpNKJImQbgBVRwLayJpey0YjYM/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/RdpNKJImQbgBVRwLayJpey0YjYM/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/RdpNKJImQbgBVRwLayJpey0YjYM/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;In Chapter 2 of the book "&lt;a href="http://bayes.bgsu.edu/bcwr/"&gt;Bayesian Computation with R&lt;/a&gt;" by &lt;a href="http://bayes.bgsu.edu/"&gt;Jim Albert&lt;/a&gt;, the philosoph of Bayesian statistics is introduced using an example where a parameter regarding proportion of heavy sleeping college students, i.e. those have more than 8 hours sleep a day, is studied. According to Bayes'Rule, the posterior is proportional to the product of prior probability and likelihood given prior, where the proportion factor is the likelihood of data over the whole parameter space. As we can see, the key point here is the specification of prior probability.&lt;br /&gt;
&lt;br /&gt;
In order to illustrate this idea, Jim introduced three types of prior probability:&lt;br /&gt;
1. Prior probability over a discrete grid of choice, given a weight which serves as hyper-parameter;&lt;br /&gt;
2. Prior probability given by a chosen probability distribution. In most cases, the chosen distribution is a so-called conjugate prior because the posterior belongs to the same family as the prior. In the book, a beta distribution is chosen, and it is conjugate in the sense that the posterior is also a beta distribution, only with different parameters;&lt;br /&gt;
3. A dense grid of prior probability choices which is called histogram prior;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
Using a discrete grid of prior choices, we first generate a list of prior probabilities, in data set PRIOR, that we believe the true proportion of heavy sleepers should be, and assign a weight, i.e. the magnitude of credibility, for each choice in a data set P, which serves as hyper-parameter. We then consolidate the two data sets into one and plot the prior against its corresponding weight.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Chapter 2 of Bayesian Computation with SAS */
/* In this chapter, Jim applies Bayes' rule to learn the posterior
   probability, using an example that studies the proportion of
   students who sleep more than 8 hours a day.
   Bayes' rule:
     Posterior(p|data) \propto Prior(p) * likelihood(data|p)
*/
/* ERRORS= (plural) is the system option that limits how many observations
   receive a complete error message; the singular spelling is not an
   option name. */
options fullstimer formchar='|----|+|---+=|-/\&amp;lt;&amp;gt;*'  errors=3;
/***************************************************/
/* Unnormalized prior weights, one per grid point, read as a free-form list */
data prior;
     input prior @@;
datalines;
2 4 8 8 4 2 1 1 1 1
;
run;

/* Grid of candidate proportions: 0.05, 0.15, ..., 0.95 */
data p;
     do p=0.05 to 0.95 by 0.1;
        output;
     end;
run;

/* Attach the weights to the grid and rescale them to sum to one.
   Pass 1 reads PRIOR to accumulate the total weight; pass 2 merges P with
   PRIOR row by row and divides each weight by that total. */
data p;
     retain total_wt 0;
     do until (done_sum);
        set prior end=done_sum;
        total_wt+prior;
     end;
     do until (done_merge);
        merge p  prior  end=done_merge;
        prior=prior/total_wt;
        output;
     end;
     drop total_wt;
run;

/* Needle plot of the normalized prior weight at each grid point */
goptions reset=all;
symbol  value=none  interpol=needle  line=1  color=black;
axis1   order=(0 to 0.3 by 0.1)  minor=none  label=(angle=90  'Prior Probability');
axis2   order=(0 to 1 by 0.2)    minor=none  label=('p');
proc gplot data=p;
      plot prior*p / haxis=axis2  vaxis=axis1;
run;
quit;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fXUNI9dI/AAAAAAAAAXM/_Fl_SCGyDc0/s1600/chapter2_1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fXUNI9dI/AAAAAAAAAXM/_Fl_SCGyDc0/s400/chapter2_1.png" width="307" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
Next, we calculate the posterior probability given the prior and the likelihood conditional on the prior. In the code below, we observe 11 heavy sleepers out of a total sample of 27. The first DATA STEP calculates the likelihood conditional on each prior choice; the posterior is output in the data set P2, where the normalization factor is the sum of the conditional likelihoods, scaled by the maximum likelihood for numerical stability. You can observe the shape of the posterior against the hyper-parameter using GPLOT. The INTERPOL=NEEDLE option in the SYMBOL statement mimics the histogram-style lines of the PLOT function in R.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
%let n0=16;   /* failures: students who are not heavy sleepers */
%let n1=11;   /* successes: heavy sleepers                      */

/* Add the binomial log-likelihood and the unnormalized log-posterior to
   every grid point in P.  The kernel p**n1 * (1-p)**n0 is evaluated on the
   log scale; the maximum used for rescaling is taken by the PROC MEANS
   step that follows. */
data p;
     set p;
     if p&amp;gt;0 &amp;amp; p&amp;lt;1 then
        log_like=log(p)*&amp;amp;n1 + log(1-p)*&amp;amp;n0;
     else
        /* LOG is undefined at p=0 and p=1.  The likelihood is zero at p=0
           when successes were observed and at p=1 when failures were
           observed, so a large negative stand-in is used in both cases.
           The outer parentheses make -999 scale both boundary terms. */
        log_like=-999*((p=0)*(&amp;amp;n1&amp;gt;0) + (p=1)*(&amp;amp;n0&amp;gt;0));
     log_posterior=log(prior) + log_like;
run;

/* Largest log-posterior over the grid, stored as MAX in _MAX */
proc means data=p  noprint;
     var log_posterior;
     output out=_max  max=max;
run;

/* Normalize the posterior.  MAX is subtracted before EXP is applied;
   pass 1 over P sums the rescaled values, pass 2 divides each one by
   that sum and writes the result. */
data p2;
     set _max;
     sum_post=0;
     do until (pass1_done);
        set p  end=pass1_done;
        sum_post+exp(log_posterior-max);
     end;
     do until (pass2_done);
        set p  end=pass2_done;
        posterior=exp(log_posterior-max)/sum_post;
        output;
     end;
     keep p prior  posterior;
run;

/* Needle plot of the posterior probability at each grid point */
goptions reset=all;
symbol interpol=needle  value=none;
axis1  label=(angle=90  'Posterior Probability');
axis2  label=('p');
proc gplot data=p2;
      plot posterior*p / haxis=axis2  vaxis=axis1;
run;
quit;

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TU_fhKE0pZI/AAAAAAAAAXQ/C9X7QTCGVVA/s1600/chapter2_2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TU_fhKE0pZI/AAAAAAAAAXQ/C9X7QTCGVVA/s400/chapter2_2.png" width="307" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
After trying the discrete grid of priors, the beta conjugate prior is introduced. Jim mentioned that by matching the believed 50% and 90% percentiles, the parameters of the prior beta distribution can be obtained by a trial-and-error approach. We note that the believed percentiles indicate that the beta distribution is skewed toward 0, so that parameter alpha is less than parameter beta, and we can use a binary search to nail down the values, which I found to be approximately 3.4 and 7.5. But I will use the {3.4, 7.4} vector as in the book. Since the beta distribution is a conjugate prior for a proportion, the computation is very easy. The curves for the prior, likelihood and posterior are visualized.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* beta prior */

%let a=3.4;   /* shape parameters of the beta prior */
%let b=7.4;
%let s=11;    /* successes: heavy sleepers           */
%let f=16;    /* failures                            */

/* Evaluate the prior, likelihood and posterior densities at
   p = 1/500, 2/500, ..., 1.
   Seen as a function of p, the binomial likelihood p**s * (1-p)**f is a
   Beta(s+1, f+1) kernel, so the likelihood curve uses those shapes.  The
   conjugate posterior is Beta(a+s, b+f). */
data p;
     do i=1 to 500;
        p=i/500;
        prior=pdf('beta', p, &amp;amp;a, &amp;amp;b);
        like=pdf('beta', p, &amp;amp;s+1, &amp;amp;f+1);
        post=pdf('beta', p, &amp;amp;a+&amp;amp;s, &amp;amp;b+&amp;amp;f);
        output;
     end;
     drop i;
run;

/* Overlay the three density curves on one set of axes.  The plot requests
   are listed as posterior, prior, likelihood; SYMBOL1-SYMBOL3 define a red,
   a blue and a green line for them. */
goptions reset=all;
symbol1 interpol=j  color=red    width=1  value=none;
symbol2 interpol=j  color=blue   width=2  value=none;
symbol3 interpol=j  color=green  width=3  value=none;
axis1   label=(angle=90  'Density')  order=(0 to 5 by 1)    minor=none;
axis2   label=('p')                  order=(0 to 1 by 0.2)  minor=none;
legend1 label=none   position=(top right inside)  mode=share;
proc gplot data=p;
      plot post*p   prior*p  like*p  /overlay
                                      vaxis=axis1
                                      haxis=axis2
                                      legend=legend1;
run;quit;

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TU_fl83c-yI/AAAAAAAAAXU/L2ZJd4zwgAY/s1600/chapter2_3.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://4.bp.blogspot.com/_slrAR0IXTL0/TU_fl83c-yI/AAAAAAAAAXU/L2ZJd4zwgAY/s400/chapter2_3.png" width="307" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Since we obtain closed form for posterior distribution, we can conduct inference based on exact calculation; alternatively, we can use simulated random sample.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Draw 1000 values from the Beta(a+s, b+f) posterior with a fixed seed */
data rs;
      call streaminit(123456);
      do i=1 to 1000;
         x=rand('beta', &amp;amp;a+&amp;amp;s, &amp;amp;b+&amp;amp;f);
         output;
      end;
run;

/* Bin the draws at midpoints 0, 0.05, ..., 0.8 and save the bins in HIST */
title "Histogram of ps";
proc univariate data=rs  noprint;
     var x;
     histogram x / midpoints=0 to 0.8 by 0.05
                   cfill=white  cbarline=blue
                   outhistogram=hist;
run;
title;

/* Bar chart of the saved bin counts against the bin midpoints */
axis1 label=(angle=90 "Frequency");
title "Histogram of ps";
proc gbarline data=hist;
      bar _midpt_ / discrete  sumvar=_count_  space=0  axis=axis1;
run;
title;

/* hist prior: bin midpoints 0.05, 0.15, ..., 0.95 */
data midpt;
     do pt=0.05 to 0.95 by 0.1;
        output;
     end;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fqmqV0HI/AAAAAAAAAXY/HZEFgLHTSHM/s1600/chapter2_4.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fqmqV0HI/AAAAAAAAAXY/HZEFgLHTSHM/s400/chapter2_4.png" width="307" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
In the book, Jim also mentioned using the so-called histogram prior. The histogram prior is a dense version of the discrete prior, with the distance between discrete choices filled in with dense choices of the same value, separated only by a small distance. It provides a more refined description of prior belief than the discrete prior. We plot the prior and posterior as well as a random sample from the posterior to give a visual impression. Note that we use PROC STDIZE method=SUM to normalize the prior variable.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Prior weights for the ten bins; METHOD=SUM rescales them in place */
data prior;
      input prior @@;
datalines;
2 4 8 8 4 2 1 1 1 1
;
run;
proc stdize data=prior  method=sum  out=prior; var prior; run;

/* Store the row counts of MIDPT and PRIOR in macro variables NMID and
   NPRIOR.  NOBS= is filled in when the step is compiled, so the SET
   statements never need to execute ("if 0"); this also works when an
   input data set is empty.  SYMPUTX converts the number itself and trims
   the blanks that SYMPUT's implicit conversion would leave. */
data _null_;
     if 0 then set midpt nobs=ntotal1;
     if 0 then set prior nobs=ntotal2;
     call symputx('nmid', ntotal1);
     call symputx('nprior', ntotal2);
     stop;
run;

/* Build the histogram prior on the grid p = 1/500, 2/500, ..., 1.
   The bin midpoints and prior weights are first loaded into temporary
   arrays; each grid value then receives the weight of the bin it falls in. */
data histprior;
      array _midpt{&amp;amp;nmid} _temporary_;
      array _prior{&amp;amp;nprior} _temporary_;
      drop k  lo;

      if _n_=1 then do;
         /* Load the midpoints; LO is the spacing between the first two */
         k=1;
         do until (midpt_done);
            set midpt  end=midpt_done;
            _midpt[k]=pt;
            if k=2 then lo=_midpt[k]-_midpt[k-1];
            k+1;
         end;

         /* Shift every midpoint down by half the spacing (rounded to
            five decimals) and echo the result to the log */
         do k=1 to &amp;amp;nmid;
            _midpt[k]=round(_midpt[k]-lo/2, 0.00001);
            put _midpt[k]=;
         end;

         /* Load the prior weights, echoing each one to the log; the index
            stops advancing once it reaches the array size */
         k=1;
         do until (prior_done);
            set prior  end=prior_done;
            _prior[k]=prior;
            put k=  _prior[k]=;
            if k&amp;lt;&amp;amp;nprior then k=k+1;
         end;
      end;

      /* SK counts how many shifted midpoints lie at or below p (capped at
         the number of bins) and is used as the index of the weight */
      do p=1/500 to 1 by 1/500;
         sk=0;
         do k=1 to &amp;amp;nmid;
            if p&amp;gt;=_midpt[k] then sk=min(&amp;amp;nmid, sk+1);
         end;
         histprior=_prior[sk];
         output;
      end;
      stop;
run;


/* Plot the histogram prior over the grid */
goptions reset=all;
symbol interpol=hiloj;
axis1  order=(0 to 0.3 by 0.05)  label=(angle=90  "Prior Density");
axis2  order=(0 to 1   by 0.2)   label=("p");
proc gplot data=histprior;
      plot histprior*p / haxis=axis2  vaxis=axis1;
run;
quit;

/* Unnormalized posterior on the grid: Beta(s+1, f+1) density times the
   histogram prior */
data histposterior;
     set histprior;
     like=pdf('BETA', p, &amp;amp;s+1, &amp;amp;f+1);
     post=histprior*like;
run;


/* Echo the success count to the log, then plot the unnormalized posterior */
%put &amp;amp;s;
goptions reset=all;
symbol interpol=hiloj;
axis1 order=(0 to 1 by 0.2)  label=(angle=90  "Posterior Density");
proc gplot data=histposterior;
      plot post*p / vaxis=axis1;
run;
quit;

/* Total of POST before rescaling */
proc means data=histposterior  sum;
     var post;
run;

/* Rescale POST with METHOD=SUM into DATA1 and print its total again */
proc stdize data=histposterior  out=data1  method=sum;
      var post;
run;
proc means data=data1  sum;
      var post;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fwMIA0cI/AAAAAAAAAXc/RjKhhskbPDw/s1600/chapter2_5.png" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="320" src="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fwMIA0cI/AAAAAAAAAXc/RjKhhskbPDw/s320/chapter2_5.png" width="245" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TU_fyvce8UI/AAAAAAAAAXg/-vulwJpkauM/s1600/chapter2_6.png" imageanchor="1" style="clear: right; float: right; margin-bottom: 1em; margin-left: 1em;"&gt;&lt;img border="0" height="320" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TU_fyvce8UI/AAAAAAAAAXg/-vulwJpkauM/s320/chapter2_6.png" width="245" /&gt;&lt;/a&gt; &lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_f0Yb6fHI/AAAAAAAAAXk/ZiLD44-8Ako/s1600/chapter2_7.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;/a&gt;&lt;/div&gt;In R, the SAMPLE function is able to do a URS sampling with given probability. SAS's PROC SURVEYSELECT seems can't do this with method=URS. But URS sampling with given probability is very easy to code in SAS DATA STEP with the help of FORMAT. &lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Control data set for the numeric format URSP., which turns a uniform
   random number into a grid value p.  DATA1 holds the rescaled posterior
   POST; consecutive ranges are laid end to end so that each p owns a
   range whose width equals its POST value.
     - START is retained and begins at 0, so the first grid point also
       receives a range.
     - HLO is kept so that the final record is written as the OTHER range
       rather than as an ordinary range. */
data fmt;
     set data1   end=eof;
     retain fmtname 'ursp'  type 'n';
     retain start 0;
     keep  fmtname  start  end  label  hlo;

     end=start+post;
     label=p;
     output;
     start=end;               /* the next range begins where this one ends */

     if eof then do;
        /* catch-all for values not covered by any range; its label is missing */
        hlo='O';
        label=.;
        output;
     end;
run;

/* Remove records that repeat a (start, end) pair, then load the format */
proc sort data=fmt  out=fmt  nodupkey;
     by start end;
run;

proc format cntlin=fmt;
run;

/* 1000 draws: format a uniform random number with URSP. and read the
   resulting label back as a number */
data  samp;
      do i=1 to 1000;
         p=input(put(ranuni(78686), ursp.), best.);
         output;
      end;
run;

/* Histogram of the sampled p values, counts on the vertical axis;
   the bins are also saved in SAMP_HIST */
title "Histogram of ps";
proc univariate data=samp noprint;
     histogram  p / midpoints=0.15 to 0.65 by 0.05
                    vscale=count  vaxislabel="Frequency"
                    outhist=samp_hist;
run;
title;
title;
&lt;/code&gt;&lt;/pre&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_f0Yb6fHI/AAAAAAAAAXk/ZiLD44-8Ako/s1600/chapter2_7.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;br /&gt;
&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_f0Yb6fHI/AAAAAAAAAXk/ZiLD44-8Ako/s1600/chapter2_7.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_f0Yb6fHI/AAAAAAAAAXk/ZiLD44-8Ako/s400/chapter2_7.png" width="307" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TU_gHQs8feI/AAAAAAAAAXo/8AsQJdTsK9E/s1600/chapter2_8.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;br /&gt;
&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Once we obtain posterior distribution of a variable, we can easily predict the value of some statistics given new samples. In the book, depending on the choice of weighting density, two types of predictions are discussed: 1. prior prediction; 2. posterior prediction. The calculation is straight-forward given formula.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* prediction */
/* Unnormalized prior weights, one per grid point */
data prior;
     input prior @@;
datalines;
2 4 8 8 4 2 1 1 1 1
;
run;

/* Grid of candidate proportions: 0.05, 0.15, ..., 0.95 */
data p;
     do p=0.05 to 0.95 by 0.1;
        output;
     end;
run;

/* Pass 1 totals the weights in PRIOR; pass 2 merges them onto the grid
   row by row and divides each by the total */
data p;
     retain total_wt 0;
     do until (done_sum);
        set prior end=done_sum;
        total_wt+prior;
     end;
     do until (done_merge);
        merge p  prior  end=done_merge;
        prior=prior/total_wt;
        output;
     end;
     drop total_wt;
run;

/* prediction using discrete prior */
%let m=20;   /* number of trials in the future sample */

/* Predictive probability of ys successes in m trials under the discrete
   prior: the sum over grid points of prior * Binomial(ys; p, m).
   POINT= rereads every row of P for each value of ys, and NOBS= supplies
   the number of rows so the inner loop follows the size of the grid.
   Each partial sum is echoed to the log. */
data pred;
     do ys=0 to &amp;amp;m;
        pred=0;
        do i=1 to ngrid;
           set p   point=i  nobs=ngrid;
           pred = pred + prior * pdf('BINOMIAL', ys, p, &amp;amp;m);
           put p=  prior=  pred=  ;
        end;
        output;
     end;
     stop;   /* required: a POINT= read never reaches end of file */
run;

/* prediction using beta(a, b) prior */
%let m=20;
%let a=3.4;
%let b=7.4;

/* Closed-form predictive probability for each ys = 0, ..., m:
   C(m, ys) * B(a+ys, b+m-ys) / B(a, b), with the ratio of beta functions
   computed on the log scale */
data pred;
     do ys=0 to &amp;amp;m;
        pred=comb(&amp;amp;m, ys)
             *exp(logbeta(&amp;amp;a + ys, &amp;amp;b+&amp;amp;m -ys) - logbeta(&amp;amp;a, &amp;amp;b));
        output;
     end;
run;

/* prediction by simulation */
%let n=1000;   /* number of simulated future samples */

/* For each replicate draw p from the Beta(a, b) prior, then draw the
   number of successes ys from Binomial(p, m).  The stream is seeded, as in
   the posterior-sampling step, so the simulation can be reproduced. */
data pred;
     call streaminit(123456);
     do i=1 to &amp;amp;n;
        p=rand('BETA', &amp;amp;a, &amp;amp;b);
        ys=rand('BINOMIAL', p, &amp;amp;m);
        output;
     end;
     drop i;
run;

/* Frequency table of the simulated ys values, saved in Y_FREQ */
proc freq data=pred  noprint;
        table ys / out=y_freq;
run;
/*
proc gbarline data=y_freq;
        bar ys /discrete sumvar=percent;
run;quit;
*/
/* Divide PERCENT by 100 and label ys as 'y' for the plot */
data y_freq;
        set y_freq;
        label ys='y';
        percent=percent/100;
run;

/* Needle plot of the predictive probabilities */
goptions reset=all  border;
symbol  interpol=needle;
axis1  order=(0 to 0.14 by 0.02)  label=(angle=90  "Predictive Probability");
axis2  order=(0 to 16)  minor=none  offset=(15 points, 15 points)  label=("y");
proc gplot data=y_freq;
        plot  percent * ys / haxis=axis2  vaxis=axis1;
run;
quit;

/* summarize discrete outcome with given coverage probability */
/* input dsn is y_freq from PROC FREQ. The following can be used as gauge.
data y_freq;
        input y percent;
datalines;
0  0.013
1  0.032
2  0.065
3  0.103
4  0.102
5  0.115
6  0.114
7  0.115
8  0.095
9  0.083
10 0.058
11 0.036
12 0.029
13 0.014
14 0.015
15 0.006
16 0.005
;
run;
*/
%let covprob=0.9;
/* Order the outcomes from least to most probable */
proc sort data=y_freq  out=y_freq2;
        by  percent;
run;

/* Accumulate probability in that order.  SELECT is 0 while the running
   total is still below 1 minus the coverage probability and 1 afterwards. */
data y_freq2;
        set y_freq2;
        retain cumprob  0;
        cumprob+percent;
        select=not (cumprob&amp;lt;(1-&amp;amp;covprob));
run;

/* Total probability of the selected outcomes */
proc means data=y_freq2  sum;
        var percent;
        where select=1;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TU_gHQs8feI/AAAAAAAAAXo/8AsQJdTsK9E/s1600/chapter2_8.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://4.bp.blogspot.com/_slrAR0IXTL0/TU_gHQs8feI/AAAAAAAAAXo/8AsQJdTsK9E/s400/chapter2_8.png" width="307" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;In Summary:&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;0. There are different choices for specifying prior belief that can be incorporated into Bayesian computation:&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&amp;nbsp;&amp;nbsp; a.) A discrete grid of possible values, each value with some probability&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&amp;nbsp;&amp;nbsp; b.) An appropriate distribution, where the parameters are determined from existing information&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&amp;nbsp;&amp;nbsp; c.) A dense grid of possible values that span the whole range&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;1. We demonstrated calculation under these three scenarios and their impact on posterior distribution of statistics of interests;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;2. We can conduct Sampling With Replacement with given sampling probability by using FORMAT and uniform random variable generator&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;3. We demonstrated that prediction in a Bayesian framework is straightforward, but the result depends on whether we use the prior distribution or the posterior distribution&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;a href="http://www.amazon.com/Bayesian-Computation-R-Use/dp/0387922970?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="Bayesian Computation with R (Use R)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0387922970&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0387922970" style="border: medium none; margin: 0px; padding: 0px ! important;" width="1" /&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: left;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-4705248398300447375?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/1wEPb0UVKbE" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/4705248398300447375/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=4705248398300447375" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/4705248398300447375?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/4705248398300447375?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/1wEPb0UVKbE/bayesian-computation-with-sas-2.html" title="Bayesian Computation with SAS (2)" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/_slrAR0IXTL0/TU_fXUNI9dI/AAAAAAAAAXM/_Fl_SCGyDc0/s72-c/chapter2_1.png" height="72" width="72" /><thr:total>2</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/02/bayesian-computation-with-sas-2.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkEMRXc6fSp7ImA9Wx9UEEU.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-6025913615236309735</id><published>2011-01-04T07:18:00.021-05:00</published><updated>2011-02-07T07:11:24.915-05:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2011-02-07T07:11:24.915-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC GPLOT" /><category scheme="http://www.blogger.com/atom/ns#" term="Bayesian" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC REG" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Manipulation" /><category scheme="http://www.blogger.com/atom/ns#" term="GRAPH" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC UNIVARIATE" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC MEANS" /><title>Bayesian Computation with SAS (1).</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/6MqfFmL2uI5nQClc7tJu26QmDX8/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/6MqfFmL2uI5nQClc7tJu26QmDX8/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/6MqfFmL2uI5nQClc7tJu26QmDX8/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/6MqfFmL2uI5nQClc7tJu26QmDX8/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;The book "&lt;a href="http://bayes.bgsu.edu/bcwr/"&gt;Bayesian Computation with R&lt;/a&gt;" by &lt;a href="http://bayes.bgsu.edu/"&gt;Jim Albert&lt;/a&gt; is an easy to read entry level book on applied Bayesian Statistics. While the book was written for R users, it is not difficult to translate the languages between R and SAS and I believe it is a good way to show Bayesian capability of SAS. In the next several months, I am going to translate all R code in the book into SAS. Readers are encouraged to buy this book to understand what's behind the code. The posts here will only spend minimum effects to explain the statistics underlying while most resource will still be denoted to&amp;nbsp;SAS coding.&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;This post will cover Chapter 1.&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;We first follow section 1.2.2, using PROC IMPORT to read in a tab-delimited file, which corresponds to read.table(file, sep='\t', header=T)&amp;nbsp;&amp;nbsp;in R. We also designate PDF as the ODS output destination.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* ---- Chapter 1 of Bayesian Computation with R, translated to SAS ---- */

/* Project root and its data sub-folder; the paths below are built from these. */
%let folder=C:\Documents\SAS for Bayesian Compu with R;
%let datafolder=&amp;amp;folder\data;

libname data "&amp;amp;datafolder";

/* Print missing numerics as '.', draw table borders with plain ASCII
   characters, limit detailed data-error messages to 2 observations and
   write full timing statistics to the log. */
options missing='.'
        formchar="|----|+|---+=|-/\&amp;lt;&amp;gt;*"
        errors=2
        fullstimer;

/* ===== Section 1.2.2 ===== */
/* Read the tab-delimited student data into WORK.STUDENT */
proc import datafile="&amp;amp;datafolder/studentdata.txt"
            out=student
            dbms=dlm
            replace;
    delimiter='09'x;   /* tab character; redundant if DBMS=TAB is used instead of DBMS=DLM */
    getnames=YES;      /* first row holds the names, like header=TRUE in R's read.table() */
    datarow=2;         /* data start on row 2; redundant with GETNAMES=YES */
    guessingrows=2;    /* use only the first 2 rows to guess the column types */
run;

/* Open a PDF destination for the output that follows */
ods pdf file="&amp;amp;folder/Chapter1.pdf";

/* Show the variable names and the first observation.
   SPLIT="*" declares "*" as the character that forces a line break
   inside column headings. */
proc print data=student(obs=1) split="*";
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;SAS output looks like:&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBo0vqOsI/AAAAAAAAAVo/i7CBF565L1E/s1600/section122.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="171" n4="true" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBo0vqOsI/AAAAAAAAAVo/i7CBF565L1E/s640/section122.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;Section 1.2.3, summarize frequency tables and use Barplot to visualize the counts. &lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;PROC FREQ directly corresponds to table() function in R, but provides much richer functionality. PROC GBARLINE is used to answer what barplot() does in R. The LABEL= statement in SAS is a very handy tool for annotation purpose for PRINT or GRAPH. &lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;PROC MEANS &lt;i&gt;&lt;u&gt;roughly&lt;/u&gt;&lt;/i&gt; corresponds to the summary() function in R and is used to obtain summary statistics for quantitative variables. You can specify MAXDEC= to control the number of decimals shown in the printout.&lt;br /&gt;
Histogram can be obtained in SAS via either PROC UNIVARIATE or newly introduced PROC SGPLOT. PROC UNIVARIATE provides more control though. As shown below, you can specify desired mid-points while you can't do so in SGPLOT. On the other hand, SGPLOT is very convenient if you want to overlay density curve, either a normal fit or kernel one.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;

/*@@@@ section 1.2.3 @@@*/
/*--&amp;gt; To summarize and graph a single Batch: one-way frequency table of Drink */
proc freq data=student;
    table Drink;
run;

/*--&amp;gt; Barplot on single variable: save the counts to data set T, then plot them */
proc freq data=student noprint;
    table Drink / out=t;
run;
/* RESET=ALL also cancels TITLE statements that are already in effect, so the
   title has to be set after the GOPTIONS statement; set before it, the
   chart would come out untitled. */
goptions reset=all noborder;
title "Original idea, summarize and plot";
proc gbarline data=t;
    bar Drink / discrete space=4 sumvar=count;
run;quit;
title;

/*--&amp;gt; Actually, in SAS, it is better to directly use 
    PROC GBARLINE on the raw data and let it do the counting */
goptions reset=all noborder;
title "In SAS, PROC GBARLINE directly summarize";
proc gbarline data=student;
    bar Drink / discrete space=3;
run;quit;
title;

/* Add hours_of_sleep as the difference WakeUp - ToSleep */
data student;
    set student;
    hours_of_sleep=WakeUp-ToSleep;
    /* Label can be useful for annotation purpose */
    label hours_of_sleep="Hours Of Sleep";
run;

proc means data=student
           /* Request same statistics as in the book */
           min q1 median q3 max nmiss
           /* MAXDEC= specifies keeping 2 decimals */
           maxdec=2;
    var hours_of_sleep;
run;

/*--&amp;gt; Histogram using PROC UNIVARIATE, with explicitly chosen bin midpoints */
proc univariate data=student noprint;
    var hours_of_sleep;
    histogram / midpoints=2.5 to 12.5 by 1;
run;

/* new SGPLOT is a handy way to draw histogram and density:
   count-scaled histogram with a fitted normal curve on top */
proc sgplot data=student;
    histogram hours_of_sleep / scale=count;
    density   hours_of_sleep / scale=count type=normal;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;Compare SAS outputs:&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="border: medium none; clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBqcNdcDI/AAAAAAAAAVs/KbmqBMhB6Uo/s1600/section123.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="265" n4="true" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBqcNdcDI/AAAAAAAAAVs/KbmqBMhB6Uo/s400/section123.PNG" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="border: medium none;"&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="border: medium none; clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBrRdxQUI/AAAAAAAAAVw/1ID57AVhQr8/s1600/section123a.PNG" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="320" n4="true" src="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBrRdxQUI/AAAAAAAAAVw/1ID57AVhQr8/s320/section123a.PNG" width="290" /&gt;&lt;/a&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBsQQJ3II/AAAAAAAAAV0/0bCWTbkOkSo/s1600/section123b.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="320" n4="true" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBsQQJ3II/AAAAAAAAAV0/0bCWTbkOkSo/s320/section123b.PNG" width="236" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;img border="0" height="175" n4="true" src="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBtbSrjKI/AAAAAAAAAV4/0Uh6g4YTRoE/s400/section124.PNG" width="400" /&gt;&lt;/div&gt;&lt;div class="separator" style="border: medium none; clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBu2iYgLI/AAAAAAAAAWA/9LirnJrNcK0/s1600/section124b.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="228" n4="true" src="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBu2iYgLI/AAAAAAAAAWA/9LirnJrNcK0/s320/section124b.PNG" width="320" /&gt;&lt;/a&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TSNBudSqIYI/AAAAAAAAAV8/zBF7VguiXxY/s1600/section124a.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="241" n4="true" src="http://4.bp.blogspot.com/_slrAR0IXTL0/TSNBudSqIYI/AAAAAAAAAV8/zBF7VguiXxY/s320/section124a.PNG" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="border: medium none;"&gt;&lt;/div&gt;Boxplot to visualize distribution of&amp;nbsp; quantitative variables by some classification variable. Here, formula such as var1*var2 style corresponds to var1~var2 style in R.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;

/* ===== Section 1.2.4: compare batches ===== */

/* Work on a copy of the data ordered by the grouping variable Gender */
proc sort data=student
          out=studentsorted;
    by Gender;
run;

/* One box of hours_of_sleep for each Gender level */
proc boxplot data=studentsorted;
    plot hours_of_sleep*Gender;
run;

/* Min, quartiles, max and missing count of Haircut within each Gender,
   printed with two decimals */
proc means data=student
           nway maxdec=2
           min q1 median q3 max nmiss;
    var Haircut;
    class Gender;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;SAS output looks like:&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBvlAimbI/AAAAAAAAAWE/j5MuI1ovSKc/s1600/section125.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="240" n4="true" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBvlAimbI/AAAAAAAAAWE/j5MuI1ovSKc/s320/section125.PNG" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TSNBwq9BcGI/AAAAAAAAAWI/6WdWNZSbKb0/s1600/section125a.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="130" n4="true" src="http://4.bp.blogspot.com/_slrAR0IXTL0/TSNBwq9BcGI/AAAAAAAAAWI/6WdWNZSbKb0/s400/section125a.PNG" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;R has a built-in &lt;b&gt;&lt;i&gt;&lt;u&gt;jitter&lt;/u&gt;&lt;/i&gt;&lt;/b&gt; function; in SAS, however,&amp;nbsp;we have to&amp;nbsp;write our own. This version does not apply the fuzz step yet and is for demonstration purposes only.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;

%macro jitter(var, newname, data=last, factor=1, amount=);
/* Add uniform random noise to a numeric variable, following the logic of
   the JITTER function in R (no fuzz applied yet).

   Parameters:
     var      numeric variable to disturb
     newname  name of the new variable that receives the jittered values
     data=    one-level name of a WORK data set; it is rewritten in place
              with the extra variable (default: last)
     factor=  multiplier for the automatically derived amount
     amount=  half-width of the noise interval
                a non-zero value : used as given
                0                : factor * z / 50
                blank            : factor / 5 * d, where d is the smallest
                                   gap between distinct non-missing values;
                                   with a single distinct value v it is
                                   factor / 50 * abs(v), or factor / 50
                                   when v is 0
              z is the range of the variable, or abs(min) when the range
              is 0, or 1 when that is 0 as well.

   Result: newname = var + U, U uniform on (-amount, amount).
   Scratch data sets _util, _xuti and _duti (plus a random numeric suffix)
   are created in WORK and deleted at the end.
*/
%local fid keepvars;

/* A blank DATA= falls back to the default name. Testing the length is
   required here: a comparison with quoted blanks never matches, because
   the macro language treats quote marks as ordinary text. */
%if %length(&amp;amp;data)=0 %then %let data=last;

/* Random numeric suffix for the scratch data sets */
data _null_;
     fid=round(ranuni(0)*10000, 1);
     call symput('fid', compress(fid));
run;

/* Range, minimum and maximum of the variable */
proc means data=&amp;amp;data  noprint;
     var &amp;amp;var;
     output  out=_util&amp;amp;fid  
             range(&amp;amp;var)=range  
             min(&amp;amp;var)=min  
             max(&amp;amp;var)=max;
run;
data _util&amp;amp;fid;
     set _util&amp;amp;fid;
     /* z as in R: the range, else abs(min), else 1 */
     if range^=0 then z=range; else z=abs(min);
     if z=0 then z=1;
run;

%if %length(&amp;amp;amount)=0 %then %do;
    /* amount not given: derive it from the smallest gap between the
       sorted distinct non-missing values */
    proc sort data=&amp;amp;data.(keep=&amp;amp;var  where=(&amp;amp;var^=.))   
              out=_xuti&amp;amp;fid  nodupkey;
         by &amp;amp;var;
    run;
    data _duti&amp;amp;fid;
         set _xuti&amp;amp;fid  nobs=ntotal  end=eof;
         /* _x[1] = previous value, _x[2] = smallest gap seen so far */
         array _x{2} _temporary_;
         keep amount;
         if ntotal=1 then do;
            /* only one distinct value: no gap exists */
            if &amp;amp;var=0 then amount=&amp;amp;factor/50;
            else amount=&amp;amp;factor/50*abs(&amp;amp;var);
            output;
            stop;
         end;
         else do;
            if _n_=1 then do;
               _x[1]=&amp;amp;var; _x[2]=constant('BIG');
            end;
            else do;
               _x[2]=min(_x[2], &amp;amp;var - _x[1]);
               _x[1]=&amp;amp;var;
               if eof then do;
                  amount=&amp;amp;factor/5*abs(_x[2]);
                  output;
               end;
            end;
         end;
    run;
%end;
%else %if %sysevalf(&amp;amp;amount=0) %then %do;
    /* amount=0: scale by z; %sysevalf also accepts decimal expressions */
    data _duti&amp;amp;fid;
         set _util&amp;amp;fid;
         amount=&amp;amp;factor*z/50;
         keep amount;
         output;
    run;
%end;
%else %do;
    /* amount given explicitly */
    data _duti&amp;amp;fid;
         amount=&amp;amp;amount;
         keep amount;
         output;
    run;
%end;

/* Names of the existing columns, so that helper variables are not kept */
proc sql noprint;
     select name into :keepvars separated by ' '
     from   sashelp.vcolumn
     where  libname='WORK' 
       and  memname=%upcase("&amp;amp;data") 
       and  memtype="DATA"
     ;
quit;

/* Rewrite the data set with the jittered copy of the variable */
data &amp;amp;data;
     array _x{1} _temporary_;
     if _n_=1 then do;
        set _duti&amp;amp;fid;
        _x[1]=amount;
     end;
     set &amp;amp;data;
     &amp;amp;newname=&amp;amp;var + ranuni(0)*(2*_x[1])-_x[1];
     label &amp;amp;newname="jitter(&amp;amp;var)";
     keep &amp;amp;keepvars  &amp;amp;newname;
run;

/* _xuti exists only when amount was blank; it is listed here unconditionally */
proc datasets library=work nolist;
     delete _duti&amp;amp;fid _xuti&amp;amp;fid _util&amp;amp;fid;
run;quit;
%mend; 
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;Now we can apply &lt;b&gt;&lt;i&gt;&lt;u&gt;jitter&lt;/u&gt;&lt;/i&gt;&lt;/b&gt; function to the two variables studied in the book and check them out. For graphics, we use &lt;b&gt;&lt;i&gt;&lt;u&gt;axis&lt;/u&gt;&lt;/i&gt;&lt;/b&gt; statement to define how we want the two axes to appear in print out, and use &lt;b&gt;&lt;i&gt;&lt;u&gt;symbol&lt;/u&gt;&lt;/i&gt;&lt;/b&gt; statement to define the appearance of symbols for data. Here we ask for black circles, not connected (interpol=none).&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;

/* Add jittered copies of both variables to WORK.STUDENT:
   xhos = jitter(hours_of_sleep), xts = jitter(ToSleep) */
%jitter(hours_of_sleep, xhos, data=student, factor=1, amount=); 
%jitter(ToSleep, xts, data=student, factor=1, amount=);

/* RESET=ALL goes first: written after BORDER it would reset that option
   again, so the border would be lost. */
goptions reset=all border;
axis1  label=("jitter(ToSleep)")    
       major=(height=2  number=5)   minor=none;
axis2  label=(angle=90  "jitter(Hours of Sleep)")     
       major=(height=2  number=5)  minor=none;
/* black circles, not connected */
symbol1 value=circle  interpol=none  color=black;
proc gplot data=student;
      plot xhos*xts/haxis=axis1  vaxis=axis2;
run;quit;      
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TSNBxala-DI/AAAAAAAAAWM/onK7h89hRro/s1600/section126a.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="464" n4="true" src="http://4.bp.blogspot.com/_slrAR0IXTL0/TSNBxala-DI/AAAAAAAAAWM/onK7h89hRro/s640/section126a.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;In order to overlay a fitted regression line, we have multiple ways to do it in SAS, and the Statistical Plot procedures are particularly handy to generate this type of figures. Compare the code and corresponding figures (in the same order of LINEPRINTER, ODS GRAPHICS, Manually Drawn, SGPLOT). For manually drawn figures, since we ask for connected dots in order to obtain a line overlay on circles, we have to sort the data by X-axis variable to make the line as straight as possible. The manually drawn figure is also the one closest to the R figure in the book.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;


/* Classic line-printer plot drawn by PROC REG itself: predicted values
   are marked 'X', the jittered observations 'o'. */
title "Line Printer Style";
proc reg data=student lineprinter;
    model Hours_of_Sleep=ToSleep / noprint;
    var   xhos xts;
    plot  pred.*xts='X'  xhos*xts / overlay symbol='o';
run;
quit;
title;

/* ODS Graphics version: show only the fit plot of the regression and
   store the predicted values as P in WORK.STUDENT. */
ods select none;          /* suppress the regular PROC REG output */
ods graphics on;
title "ODS Graphics";
proc reg data=student;
    ods select FitPlot;
    model  Hours_of_Sleep=ToSleep;
    var    xhos xts;
    output out=student pred=p;
run;
quit;
title;
ods graphics off;
ods select all;

/* Following standard R approach, tedious in SAS: overlay the jittered
   points and the predicted values P as two plots. The data are sorted by
   ToSleep because the second plot joins its points in data order. */
proc sort data=student  out=studentsorted;
      by ToSleep;
run;
/* RESET=ALL goes first: written after BORDER it would reset that option again. */
goptions reset=all border;
axis1  label=("jitter(ToSleep)")      order=(-2 to 6 by 2)  major=(height=2  number=5)   minor=none;
axis2  label=(angle=90  "jitter(Hours of Sleep)")    offset=(, 2 pct)  major=(height=2  number=5)  minor=none;
/* symbol1: unconnected circles for the points; symbol2: plain line for P */
symbol1 value=circle  interpol=none  color=black;
symbol2 value=none  interpol=line  color=black;
proc gplot data=studentsorted;
      plot xhos*xts  p*xts/overlay  haxis=axis1  vaxis=axis2;   
run;quit;      

/* Using the built-in REGRESSION capability of GPLOT by specifying the
   INTERPOL=R&amp;lt;&amp;gt;&amp;lt;0&amp;gt;&amp;lt;&amp;gt; option in the SYMBOL statement. 
      In the first &amp;lt;&amp;gt;, you identify the type of regression: 
         L=linear, Q=quadratic, C=cubic
      In the second &amp;lt;&amp;gt;, you specify if intercept is omitted;
      In the third &amp;lt;&amp;gt;, you specify type of CI:
         CLM=CI of mean predicted value;
         CLI=CI of individual points;
   RLCLM95 below therefore asks for a linear fit with 95% confidence
   limits for the mean. The two AXIS definitions are prepared but not
   used: the HAXIS=/VAXIS= options in the PLOT statement are commented out.
 */
/* RESET=ALL goes first: written after BORDER it would reset that option again. */
goptions reset=all border;
axis1  label=("jitter(ToSleep)")      order=(-2 to 6 by 2)  major=(height=2  number=5)   minor=none;
axis2  label=(angle=90 "jitter(Hours of Sleep)")    offset=(, 2 pct)  major=(height=2  number=5)  minor=none;
symbol1 value=circle  interpol=rlclm95  color=black  ci=blue  co=blue;
proc gplot data=student;
     plot xhos*xts/regeqn /* haxis=axis1  vaxis=axis2*/;
run;quit;

/* SGPLOT offers fitted curves out of the box (REG, SPLINE, LOESS):
   jittered points from SCATTER, regression fit on the original variables. */
proc sgplot data=student;
    scatter x=xts     y=xhos;
    reg     x=ToSleep y=Hours_of_Sleep;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBya9hX6I/AAAAAAAAAWQ/qCizLA9LQ5c/s1600/section126b.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="598" n4="true" src="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBya9hX6I/AAAAAAAAAWQ/qCizLA9LQ5c/s640/section126b.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBzNHcDQI/AAAAAAAAAWU/91tm7KSfpMI/s1600/section126c.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="521" n4="true" src="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNBzNHcDQI/AAAAAAAAAWU/91tm7KSfpMI/s640/section126c.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/TSNBzroN42I/AAAAAAAAAWY/1DfpnW6j7lw/s1600/section126d.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="472" n4="true" src="http://2.bp.blogspot.com/_slrAR0IXTL0/TSNBzroN42I/AAAAAAAAAWY/1DfpnW6j7lw/s640/section126d.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TUbXYMTxWRI/AAAAAAAAAXA/eCVIYimSvSo/s1600/section126f.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="451" s5="true" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TUbXYMTxWRI/AAAAAAAAAXA/eCVIYimSvSo/s640/section126f.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNB0bmZXNI/AAAAAAAAAWc/OwCH4_D_AVE/s1600/section126e.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="548" n4="true" src="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNB0bmZXNI/AAAAAAAAAWc/OwCH4_D_AVE/s640/section126e.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;Now, to study the robustness of T-Statistics which relies on several assumptions, such as normality, homoskedasticity, etc. First, we need to define a &lt;b&gt;&lt;i&gt;&lt;u&gt;"function"&lt;/u&gt;&lt;/i&gt;&lt;/b&gt; to calculate T-statistics like in the book.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;

/* section 1.3 Robustness of T-Statistics */
/* section 1.3.2 Write a function to compute T-Statistics */
%macro tstatistics(vars, dsn=last, outdsn=, class=);
/* Pooled-variance two-sample t statistic.

   Parameters:
     vars     two numeric variable names separated by a blank, one per sample
     dsn=     input data set (default: last)
     outdsn=  output data set with one row per CLASS level (a single row
              when class= is blank) holding m, n, mean1, mean2, std1, std2,
              sp and t
     class=   optional grouping variable(s)

   m and n are the non-missing counts of the first and second variable;
     sp = sqrt(((m-1)*std1**2 + (n-1)*std2**2) / (m+n-2))
     t  = (mean1 - mean2) / (sp * sqrt(1/m + 1/n))

   Side effect: overwrites the scratch data set WORK.TDATA.
*/
%local classstmt var1 var2;

/* Generate a CLASS statement only when a grouping variable was supplied.
   The length test is required: a comparison with two quote marks never
   matches a blank value, because the macro language treats quote marks
   as ordinary text. */
%if %length(&amp;amp;class)=0 %then %let classstmt=;
%else %let classstmt=class &amp;amp;class;

%let var1=%scan(&amp;amp;vars, 1, %str( ));
%let var2=%scan(&amp;amp;vars, 2, %str( ));

/* Without statistic keywords the OUTPUT statement writes one row per
   default statistic, identified by _STAT_; only N, MEAN and STD are kept. */
proc means data=&amp;amp;dsn noprint  nway;
     &amp;amp;classstmt;
     var &amp;amp;vars;
     output  out=tdata(where=(_STAT_ in ('STD', 'MEAN', 'N')));
run;

/* Collapse the statistic rows of each group into one row and compute t */
data &amp;amp;outdsn;
     set tdata; by &amp;amp;class _TYPE_;
     /* every collected value is named explicitly so that it is carried
        across the rows of a group regardless of the row order */
     retain  m  n  mean1 mean2  std1 std2;
     select(compress(_STAT_));
        when('N') do;
           m=&amp;amp;var1;
           n=&amp;amp;var2;
        end;
        when('MEAN') do;
           mean1=&amp;amp;var1;
           mean2=&amp;amp;var2;
        end;
        when('STD') do;
           std1=&amp;amp;var1;
           std2=&amp;amp;var2;
        end;
        otherwise;
     end;
     if last._TYPE_ then do;
        sp=sqrt(((m-1)*std1**2 + (n-1)*std2**2)/(m+n-2));
        t= (mean1-mean2)/(sp * sqrt(1/m + 1/n));
        keep &amp;amp;class _TYPE_ m  n std:  mean:  sp t;
        output;
     end;
run;
%mend;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;Just like the "source(file)" command in R, SAS has a counterpart called &lt;b&gt;&lt;i&gt;%include&lt;/i&gt;&lt;/b&gt; (abbreviated &lt;b&gt;&lt;i&gt;%inc&lt;/i&gt;&lt;/b&gt;). Suppose you have a SAS file called "tstatistics.sas" in some folder &lt;i&gt;&lt;b&gt;&amp;amp;folder&lt;/b&gt;&lt;/i&gt;; the following statements will read in the source code from this file. Since this is a SAS file, the suffix ".sas" is not necessary.&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Point the fileref BCR at the project folder, then read in the member tstatistics from it */
filename BCR "&amp;amp;folder";
%include BCR(tstatistics);
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;The following code runs the four simulation scenarios illustrated in the book and draws the last figure.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Two samples of five values each, held side by side in columns x and y */
data test;
    input x y;
    datalines;
1  5
4  4
3  7
6  6
5  10
;
run;

/* t statistic for x against y over the whole data set (no grouping) */
%tstatistics(x y, dsn=test, outdsn=tdata2, class=);

/* section 1.3.3 Monte Carlo study on the Robustness of T-Statistics */

/* Significance level, the two sample sizes and the number of simulated data sets */
%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;

/* Each iteration holds max(n, m) rows; x is filled for the first m rows
   and y for the first n rows, the remainder is set to missing. */
data test;
    retain seed1 99787 seed2 76568;
    keep iter x y;
    do iter=1 to &amp;amp;Niter;
        do j=1 to max(&amp;amp;n, &amp;amp;m);
            if j&amp;lt;=&amp;amp;m then call rannor(seed1, x);
            else x=.;
            if j&amp;lt;=&amp;amp;n then call rannor(seed2, y);
            else y=.;
            output;
        end;
    end;
run;

/* One t statistic per iteration */
%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

/* Count the iterations whose statistic exceeds the two-sided critical
   value and store the rejection share in macro variable n_reject. */
data _null_;
    set tdata2 end=eof;
    n_reject + (abs(t)&amp;gt;quantile('T', 1-&amp;amp;alpha/2, n+m-2));
    if eof then call symput('n_reject', n_reject/_n_);
run;
%put &amp;amp;n_reject;


/* section 1.3.4 Study the robustness of T-Statistics */

%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;

/* Simulate four scenarios side by side; the pair (x_k, y_k) belongs to
   scenario k:
     1: both from CALL RANNOR
     2: both from CALL RANNOR, y multiplied by 10
     3: both from RAND('T', 4)
     4: x = 10 + 2 * (CALL RANNOR draw), y = (CALL RANEXP draw) / 0.1
   The calls are issued scenario by scenario in every row, so the two
   seed streams are consumed in the order 1, 2, 4. */
data test;
    retain seed1 987687 seed2 76568;
    array x[4];
    array y[4];
    keep iter x1-x4 y1-y4;
    do iter=1 to &amp;amp;Niter;
        do j=1 to max(&amp;amp;n, &amp;amp;m);
            /* scenario 1 */
            if j&amp;lt;=&amp;amp;m then call rannor(seed1, x1); else x1=.;
            if j&amp;lt;=&amp;amp;n then call rannor(seed2, y1); else y1=.;

            /* scenario 2 */
            if j&amp;lt;=&amp;amp;m then call rannor(seed1, x2); else x2=.;
            if j&amp;lt;=&amp;amp;n then call rannor(seed2, y2); else y2=.;
            y2=y2*10;

            /* scenario 3 */
            if j&amp;lt;=&amp;amp;m then x3=rand('T', 4); else x3=.;
            if j&amp;lt;=&amp;amp;n then y3=rand('T', 4); else y3=.;

            /* scenario 4 */
            if j&amp;lt;=&amp;amp;m then call rannor(seed1, x4); else x4=.;
            if j&amp;lt;=&amp;amp;n then call ranexp(seed2, y4); else y4=.;
            x4=10+x4*2;
            y4=y4/0.1;

            output;
        end;
    end;
run;

/* One t statistic per iteration for each of the four scenarios */
%tstatistics(x1 y1, dsn=test, outdsn=tdata1, class=iter);

%tstatistics(x2 y2, dsn=test, outdsn=tdata2, class=iter);

%tstatistics(x3 y3, dsn=test, outdsn=tdata3, class=iter);

%tstatistics(x4 y4, dsn=test, outdsn=tdata4, class=iter);

/* Write the rejection rate of each scenario to the log: the share of
   iterations whose statistic exceeds the two-sided critical value at
   level alpha. All four data sets are read inside one pass of the data
   step, so the counter has to be set back to 0 before every loop;
   otherwise rejections of earlier scenarios are counted again. */
data _null_;
      /* scenario 1 */
      n_reject=0;
      do until (eof1);
         set tdata1  end=eof1  nobs=n1;
         if abs(t)&amp;gt;quantile('T', 1-&amp;amp;alpha/2, n+m-2) then n_reject+1;
      end;
      p_reject=n_reject/n1;
      put "NOTE&amp;gt;&amp;gt; Prob(Rejection)=" p_reject  8.4;

      /* scenario 2 */
      n_reject=0;
      do until (eof2);
         set tdata2  end=eof2  nobs=n2;
         if abs(t)&amp;gt;quantile('T', 1-&amp;amp;alpha/2, n+m-2) then n_reject+1;
      end;
      p_reject=n_reject/n2;
      put "NOTE&amp;gt;&amp;gt; Prob(Rejection)=" p_reject  8.4;

      /* scenario 3 */
      n_reject=0;
      do until (eof3);
         set tdata3  end=eof3 nobs=n3;
         if abs(t)&amp;gt;quantile('T', 1-&amp;amp;alpha/2, n+m-2) then n_reject+1;
      end;
      p_reject=n_reject/n3;
      put "NOTE&amp;gt;&amp;gt; Prob(Rejection)=" p_reject  8.4;

      /* scenario 4 */
      n_reject=0;
      do until (eof4);
         set tdata4  end=eof4  nobs=n4;
         if abs(t)&amp;gt;quantile('T', 1-&amp;amp;alpha/2, n+m-2) then n_reject+1;
      end;
      p_reject=n_reject/n4;
      put "NOTE&amp;gt;&amp;gt; Prob(Rejection)=" p_reject  8.4;
run;

/* Overlay the kernel density estimate of the simulated t statistics with
   the density of the t distribution on 18 degrees of freedom. */

/* Order the statistics and attach the t(18) density at each of them */
proc sort data=tdata2;
    by t;
run;
data tdata2;
    set tdata2;
    dt=pdf('T', t, 18);
run;

/* Kernel density estimate: printed output is switched off, the density
   grid goes to DENSITY and two ODS tables are captured as data sets. */
ods select none;
ods output Controls=Controls UnivariateStatistics=UniStat;
proc kde data=tdata2;
    univar t / out=density method=snr unistats;
run;
ods select all;

/* t(18) density on the same grid as the estimate */
data density;
    set density;
    dt=pdf('T', value, 18);
run;

/* Bandwidth rounded to four decimals, stored in macro variable bw */
data _null_;
    set UniStat;
    where Descr='Bandwidth';
    call symput('bw', compress(round(t, 0.0001)));
run;

goptions reset=all border;

title "Comparing Empirical and Theoretical Densities";

/* thick black line for the first plot, thin grey line for the second */
symbol1 interpol=join color=black value=none width=3;
symbol2 interpol=join color=grey  value=none width=1;

axis1 label=("N=&amp;amp;Niter Bandwidth=&amp;amp;bw")
      order=(-4 to 8 by 2)
      offset=(1,1)
      major=(height=2)
      minor=none;
axis2 label=(angle=90 "Density")
      order=(0 to 0.4 by 0.1)
      offset=(0, 0.1)
      major=(height=2)
      minor=none;
legend label=none position=(top right inside) mode=share;

proc gplot data=density;
    label dt='t(18)' density='Exact';
    plot density*value dt*value / overlay
                                  legend=legend
                                  haxis=axis1
                                  vaxis=axis2;
run;
quit;

/* Close the PDF destination */
ods pdf close;
&lt;/code&gt;&lt;/pre&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNB1P7VCtI/AAAAAAAAAWg/POzPQyLqUmA/s1600/section133.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="506" n4="true" src="http://1.bp.blogspot.com/_slrAR0IXTL0/TSNB1P7VCtI/AAAAAAAAAWg/POzPQyLqUmA/s640/section133.PNG" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;b&gt;Key Points Summary:&amp;nbsp;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
0. We can&amp;nbsp;use&amp;nbsp;FORMCHAR option to eliminate &amp;nbsp;weird characters&lt;br /&gt;
&lt;br /&gt;
1. We can use PROC IMPORT to import delimited data, and it provides flexible options to control the output;&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;2. We can use PROC FREQ to summarize categorical data&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;3. We can use PROC MEANS and PROC UNIVARIATE to generate descriptive statistics for numerical data&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;4. We can study the dispersion of numerical variable by categorical variable using BOXPLOT&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;5. We provided a JITTER function for better graphical presentation of heavily overlapped data&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;6. We showed that there are multiple ways to visualize scatter plots together with fitted regression lines:&lt;br /&gt;
&lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;ul&gt;&lt;li&gt;6.1 LINEPRINTER in PROC REG&amp;nbsp;&lt;/li&gt;
&lt;li&gt;6.2 Using PROC REG to obtain regression line and overlay it on the scatter plot&amp;nbsp;&lt;/li&gt;
&lt;li&gt;6.3 Using ODS Graphics in PROC REG&amp;nbsp;&lt;/li&gt;
&lt;li&gt;6.4 Using INTERPOL=R&amp;lt;&amp;gt;&amp;lt;&amp;gt;&amp;lt;&amp;gt; option in SYMBOL statement&amp;nbsp;&lt;/li&gt;
&lt;li&gt;6.5 Using PROC SGPLOT&amp;nbsp;&lt;/li&gt;
&lt;/ul&gt;7. We demonstrated how to study the robustness of the T-test by simulation in SAS, following the code in the book. &lt;br /&gt;
&lt;div&gt;&lt;/div&gt;&lt;a href="http://www.amazon.com/Bayesian-Computation-R-Use/dp/0387922970?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="Bayesian Computation with R (Use R)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0387922970&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0387922970" style="border: medium none; margin: 0px; padding: 0px ! important;" width="1" /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-6025913615236309735?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/yL6bIedyqD4" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/6025913615236309735/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=6025913615236309735" title="3 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/6025913615236309735?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/6025913615236309735?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/yL6bIedyqD4/bayesian-computation-with-sas-1.html" title="Bayesian Computation with SAS (1)." 
/><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/_slrAR0IXTL0/TSNBo0vqOsI/AAAAAAAAAVo/i7CBF565L1E/s72-c/section122.PNG" height="72" width="72" /><thr:total>3</thr:total><feedburner:origLink>http://www.sas-programming.com/2011/01/bayesian-computation-with-sas-1.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DE8BRH46fip7ImA9WhdQFU0.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-8224701217907225470</id><published>2010-10-29T03:51:00.006-04:00</published><updated>2011-08-16T11:00:55.016-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-08-16T11:00:55.016-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC FORMAT" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC MEANS" /><title>Summarize Numerical Data in a Rolling Window</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/l4GtB5WriEQVqW-yAc9wUkT3TRI/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/l4GtB5WriEQVqW-yAc9wUkT3TRI/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/l4GtB5WriEQVqW-yAc9wUkT3TRI/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/l4GtB5WriEQVqW-yAc9wUkT3TRI/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Obtaining summary statistics over a rolling window for a given data set, usually along a time dimension, is not quite easy in SAS, especially since the rolling window may contain a varying number of records and the maximum number is unknown without first passing through the data once. For example, given transaction data over several days, a business analyst wants to summarize the data for each 24-hour period. This is actually a recent question on SAS-L.&lt;br /&gt;
&lt;br /&gt;
There are several approaches. Typically people use an array that is large enough to handle a reasonable guess of the maximum number of records within the given interval, or use a hash table to manage the data cells dynamically. The coder basically needs to build a queue, which is first in, first out. Neither of these methods is easy to code, and both are error prone for a beginner.&lt;br /&gt;
&lt;br /&gt;
This rolling window problem can be efficiently solved by using a MultiLabel Format, a feature that tends to be ignored. Here are two examples. We want to summarize over a rolling window of size 5. Pay attention to the second example.&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: rgb(153,153,153) 1px dashed; border-left: rgb(153,153,153) 1px dashed; border-right: rgb(153,153,153) 1px dashed; border-top: rgb(153,153,153) 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* Build the CNTLIN control data set for the numeric multilabel format
   ROLLWINDOW (HLO='M'). One range [start, start+5] is written for each
   start=1..10 and labelled time1..time10; a final row with HLO='O' (other)
   labels every value outside those ranges Out-Of-Bound. */
data fmt;
      retain fmtname 'rollwindow'  type 'n'  hlo 'M';
      do start=1 to 10;
         end=start+5; 
        label=cats('time', start);
        output;
      end; 
      hlo='O'; label='Out-Of-Bound';
      output;
run;

/* Example 1 input: one record per integer time point 1..20 with a random
   normal x and a random uniform y. Seed 0 requests a clock-based seed, so
   the values differ from run to run. */
data dsn;
      do time=1 to 20;
         x=rannor(0);
         y=ranuni(0);
         output;
      end;
run;

/* Compile the format described by the FMT control data set. */
proc format cntlin=fmt; run;

/* Summarize x and y per ROLLWINDOW label. MLF makes PROC MEANS honor the
   multilabel format, so a time value that falls in several overlapping
   ranges contributes to each of them. AUTONAME generates the names of the
   mean and std output variables.
   NOTE(review): PRELOADFMT is documented to require COMPLETETYPES, EXCLUSIVE
   or ORDER=DATA to take effect; the second example adds EXCLUSIVE - confirm
   whether leaving it out here is intentional. */
proc means data=dsn noprint;
      class time /preloadfmt  mlf; 
      format time rollwindow.;
      var x y;
      output  out=summary_roll mean(x y)=  std(x y)= /autoname;
run;
    

/* Example 2 input: irregularly spaced records. For every integer time point
   _t=1..20 a Poisson count k (seed 10, mean 10) is drawn and k records are
   written at the fractional times _t + j/(k+1), j=1..k, so each unit interval
   holds a varying number of rows.
   A separate loop counter (_t) is used on purpose: assigning to the index
   variable of an iterative DO loop changes the iteration itself, which made
   the original loop jump ahead and stop after only a few time points. */
data dsn2;
      do _t=1 to 20;
         k=ranpoi(10, 10);
         do j=1 to k;
            time=_t+j/(k+1);
            x=rannor(0); y=ranuni(0);
            output;
         end;
      end;
      drop k j _t;
run;
/* Rolling summary of the irregular data. PRELOADFMT with EXCLUSIVE limits the
   class levels to the labels defined in the ROLLWINDOW format, and MLF lets a
   record count in every overlapping range. MEAN()= and STD()= with AUTONAME
   apply to all VAR variables. */
proc means data=dsn2  noprint;
      class time/preloadfmt mlf  exclusive;
      format time rollwindow.;
      var x y;
      output  out=summary_roll2  mean()=  std()=/autoname;
run;


/**************************************************** 
          non-rolling but shrinking time window, 
          similar for growing time window 
*****************************************************/
/* Control data set for the multilabel numeric format WINX. Every range ends
   at 18 while the start moves from 1 to 10, so successive ranges shrink.
   The last row (HLO='O') labels all remaining values Out-Of-Bound. */
data fmt2;
      retain fmtname 'winx'  type 'n'  hlo 'M';
      do start=1 to 10;
         end=18; 
        label=cats('time', start);
        output;
      end; 
      hlo='O'; label='Out-Of-Bound';
      output;
run;      

/* Compile WINX from FMT2; CNTLOUT= also writes the stored format definitions
   to FMT_ALL so they can be inspected. */
proc format cntlin=fmt2  cntlout=fmt_all;
run;

/* Same summary as before, but grouped by the shrinking WINX ranges. */
proc means data=dsn2  noprint;
      class time/preloadfmt mlf  exclusive;
      format time winx.;
      var x y;
      output  out=summary_roll3  mean()=  std()=/autoname;
run;

**************** An example **************;
/* Calendar of trading days for January 2007, one date per row (the listed
   dates skip 6-7, 13-14, 20-21 and 27-28 January). */
data TradeDate;
input TradeDate yymmdd10.;
format TradeDate yymmdd10.;
cards;
2007-01-04
2007-01-05
2007-01-08
2007-01-09
2007-01-10
2007-01-11
2007-01-12
2007-01-15
2007-01-16
2007-01-17
2007-01-18
2007-01-19
2007-01-22
2007-01-23
2007-01-24
2007-01-25
2007-01-26
2007-01-29
2007-01-30
2007-01-31
 ;
run;
/* Holding periods: id, start date, end date and the Buy value. The +1 skips
   the blank between the two formatted date fields. */
data raw;
input id $ Date_S yymmdd10. +1 Date_e yymmdd10. Buy;
format Date_S Date_E yymmdd10.;
cards;
A001 2007-01-09 2007-01-24 24.5
A001 2007-01-12 2007-01-16 56.6
 ;
run;

/*------------- Desired Output ------------*
id      Date_S       Date_E        Buy    Hold_Days
A001 2007-01-09 2007-01-24 24.5    12
A001 2007-01-12 2007-01-16 56.6     3
-------------------------------------------*/
/* Turn every holding period in RAW into one range of the multilabel numeric
   format TDATE: start and end are the period dates, and the label is the id
   concatenated with the row number so that periods sharing an id stay
   distinct. */
data fmt;
     set raw;
     retain fmtname 'tdate' type 'n'  hlo 'M';
     start=Date_S; end=Date_e;
     label=cats(ID, _n_);
run;
/* Compile the TDATE format from FMT. CNTLOUT= (not OUT=, which PROC FORMAT
   does not accept) writes the stored format definition to FMT_REF for
   reference, matching the CNTLOUT= usage earlier in this listing. */
proc format cntlin=fmt  cntlout=fmt_ref;
run;

/* Count the trading days that fall inside each holding period: TradeDate is
   classified by the multilabel TDATE ranges (EXCLUSIVE keeps only dates that
   fall inside a defined range) and N is requested for every label. NWAY keeps
   only the rows for the full class combination.
   NOTE(review): the statistic is named _freq_, which is also the name of the
   automatic frequency variable in OUT= data sets - confirm this does not
   clash in your SAS release. */
proc means data=tradeDate noprint  nway;
     class TradeDate/mlf exclusive preloadfmt ;
     format TradeDate tdate.;
     var TradeDate;
     output  out=_test  n()=_freq_;
run;

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
When the data gets large, there will be some computing difficulty and insufficient resources. To solve this problem, simply sort and divide the original data into smaller pieces with an overlap equal to the size of the rolling window. When combining the summarized pieces, you should discard the overlapping part from each of the remaining pieces. &lt;br /&gt;
&lt;br /&gt;
The multilabel approach has an overhead that is smaller overall to the other methods and can be easily changed to accommandate other sizes of rolling windows.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-8224701217907225470?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/NqmgAIBcq6E" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/8224701217907225470/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=8224701217907225470" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/8224701217907225470?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/8224701217907225470?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/NqmgAIBcq6E/summary-numerical-data-in-rolling.html" title="Summarize Numerical Data in a Rolling Window" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/10/summary-numerical-data-in-rolling.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEIESHg9cCp7ImA9WhdQEU8.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-869207075818705085</id><published>2010-09-30T19:13:00.004-04:00</published><updated>2011-08-12T00:15:09.668-04:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2011-08-12T00:15:09.668-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC PRINCOMP" /><category scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC SCORE" /><category scheme="http://www.blogger.com/atom/ns#" term="SVD" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GLIMMIX" /><title>Low Rank Radial Smoothing using GLIMMIX and its Scoring</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/cgNaS_BQpX5rydntrbdX6S_LSEs/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/cgNaS_BQpX5rydntrbdX6S_LSEs/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/cgNaS_BQpX5rydntrbdX6S_LSEs/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/cgNaS_BQpX5rydntrbdX6S_LSEs/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Low Rank Radial Smoothing using GLIMMIX [1] is a semiparametric approach to smoothing curves [2]. By specifying the TYPE=RSMOOTH option in the RANDOM statement, we can implement this spline smoothing approach. The best thing is that for future scoring, data preparation is extremely easy: use the OUTDESIGN= &amp;amp; NOFIT options in v9.2 PROC GLIMMIX, then use PROC SCORE twice on this design matrix to score the fixed effects design matrix X and the random effects design matrix Z, respectively; adding the two together gives the score from this radial smoothing method.&lt;br /&gt;
&lt;br /&gt;
[Coming soon]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/* Step 1: fit the low-rank radial smoother on the training data; the smooth
   term is the RANDOM effect with TYPE=RSMOOTH and 20 equally spaced knots. */
proc glimmix data=train_data  absconv=0.005;
     model y = &amp;amp;covars /s;
     random &amp;amp;z /s type=rsmooth  knotmethod=equal(20);
run;

/* Step 2: same model specification on the test data, but NOFIT skips the
   estimation and OUTDESIGN= only writes the design-matrix columns to TEST2. */
proc glimmix data=test  nofit  outdesign=test2;
     model y=&amp;amp;covars /s;
     random &amp;amp;z /s type=rsmooth knotmethod=equal(20);
run;


/* Step 3: score the fixed-effects columns.
   NOTE(review): beta_fix and beta_random are not created anywhere in this
   listing - presumably they hold the fixed- and random-effect solutions of
   step 1 arranged as TYPE=PARMS data sets; confirm before running. */
proc score data=test2  score=beta_fix  type=parms  out=score_fix;
     var  &amp;amp;covars;
run;

/* Step 4: score the random-effects columns, selected by the _z name prefix.
   Per the accompanying text, the two scores are then added together. */
proc score data=test2 score=beta_random type=parms  out=score_random;
     var _z:;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
Reference:&lt;br /&gt;
&lt;br /&gt;
1. SAS Institute, Statistical Analysis with the GLIMMIX procedure Course Notes, SAS Press, SAS Institute&lt;br /&gt;
2. D. Ruppert, M.P. Wand, R.J. Carroll, Semiparametric Regression, Cambridge University Press, Cambridge, 2003&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&lt;a href="http://www.amazon.com/Semiparametric-Regression-Statistical-Probabilistic-Mathematics/dp/0521785162?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="Semiparametric Regression (Cambridge Series in Statistical and Probabilistic Mathematics)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0521785162&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0521785162" style="border: medium none ! important; margin: 0px ! important; padding: 0px ! important;" width="1" /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-869207075818705085?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/NoRFHK2z6l4" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/869207075818705085/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=869207075818705085" title="5 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/869207075818705085?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/869207075818705085?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/NoRFHK2z6l4/low-rank-radial-smoothing-using-glimmix.html" title="Low Rank Radial Smoothing using GLIMMIX and its Scoring" /><author><name>Liang  
Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>5</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/09/low-rank-radial-smoothing-using-glimmix.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkAGQ3g9eCp7ImA9WhZXFU4.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-4101905517772952711</id><published>2010-09-30T08:41:00.006-04:00</published><updated>2011-05-04T15:52:02.660-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-05-04T15:52:02.660-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC PRINCOMP" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC CORR" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC SCORE" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC DISTANCE" /><title>Generalized Discriminant Analysis</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/jsoqfTNUvYaqpgcjtC7OcWq9CtQ/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/jsoqfTNUvYaqpgcjtC7OcWq9CtQ/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/jsoqfTNUvYaqpgcjtC7OcWq9CtQ/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/jsoqfTNUvYaqpgcjtC7OcWq9CtQ/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Using SAS to implement Flexible Discriminant Analysis [Chapter 12, ESL].&lt;br /&gt;
&lt;br /&gt;
Since a discriminant analysis is equivalent to a 2-step process, i.e. regress first then conduct discriminant analysis, it is easy to implement the so-called generalized discriminant analysis shown in Ch.12.4--12.6 of Elements of Statistical Learning. The basic idea here is to use some regression analysis procedure, such as PROC REG with the RIDGE= option for ridge regression, and then use the predictions from the L2 regularized regression in the next step's discriminant analysis. Using PROC GLMSELECT, we can replace the L2 regularization with an L1 regularization.&lt;br /&gt;
&lt;br /&gt;
A piece of prototype code looks like this:&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* Prototype only: every macro variable referenced below has to be defined by
   the user before this code can run.
   Step 1: expand the class variable into indicator columns (NOINT, so one
   column per class level) and write them to the OUTDESIGN= data set. */
PROC GLMMOD data=&amp;yourdata  OUTDESIGN=&amp;design;
     CLASS &amp;dep_var;
     model X = &amp;dep_var /noint;
RUN;
/* Step 2: attach the indicator columns to the data and rename them Y1..Yk.
   The MERGE has no BY statement, so rows are paired by position. */
data &amp;yourdata;
     merge &amp;yourdata  &amp;design;
     rename Col1-Col&amp;k = Y1 -Y&amp;k;
run;
%let deps= Y1-Y&amp;k; /* for the case of 5-class problem */
/* Step 3: ridge regression of the indicator responses on the covariates.
   NOTE(review): the class variable is called dep_var above but dep below,
   and PRED= supplies a single name although there are several responses -
   confirm the intended names before use. */
PROC REG  DATA=&amp;yourdata RIDGE=&amp;minridge to &amp;maxridge by 0.1  OUTEST=beta;
MODEL &amp;deps = &amp;covars ;
OUTPUT  OUT=predicted  PRED=&amp;dep._HAT;
RUN;

/* Step 4: discriminant analysis on the regression predictions. */
PROC DISCRIM DATA=predicted &amp;options;
CLASS  &amp;dep;
VAR    &amp;dep._HAT;
RUN;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.amazon.com/Elements-Statistical-Learning-Prediction-Statistics/dp/0387848576?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition (Springer Series in Statistics)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0387848576&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0387848576" style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none; margin: 0px; padding-bottom: 0px! important; padding-left: 0px! important; padding-right: 0px! important; padding-top: 0px! important;" width="1" /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-4101905517772952711?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/2VYbZW1cOpc" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/4101905517772952711/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=4101905517772952711" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/4101905517772952711?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/4101905517772952711?v=2" /><link rel="alternate" type="text/html" 
href="http://feedproxy.google.com/~r/SasProgramming/~3/2VYbZW1cOpc/regularized-discriminant-analysis.html" title="Generalized Discriminant Analysis" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/09/regularized-discriminant-analysis.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CUMMSHczeyp7ImA9Wx5QEU0.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-6855499064591918912</id><published>2010-08-09T16:09:00.006-04:00</published><updated>2010-08-29T13:38:09.983-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2010-08-29T13:38:09.983-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC FACTOR" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC SCORE" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC PLS" /><title>VARIMAX rotation of PLS loadings</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/a3PKM8XkTJW_yOaAsnWmtLk6x8E/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/a3PKM8XkTJW_yOaAsnWmtLk6x8E/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/a3PKM8XkTJW_yOaAsnWmtLk6x8E/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/a3PKM8XkTJW_yOaAsnWmtLk6x8E/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Partial Least Squares (PLS) is one of several supervised dimension reduction techniques and has attracted attention in recent years. On the one hand, PLS is able to generate a series of scores that maximize the linear correlation between dependent variables and independent variables; on the other hand, the loadings of PLS can be regarded as counterparts of those from factor analysis, hence we can rotate the loadings from PLS and thereby eliminate some of the variables that are non-significant in terms of prediction.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
%macro PLSRotate(Loading, TransMat, PatternOut, PatternShort, 
                 method=VARIMAX, threshold=0.25);
/* Rotation of PLS loadings (VARIMAX by default). Only variables having 
   large loadings after rotation will enter the final model. 

   Parameters:
   Loading      WORK data set with the XLoadings output from PROC PLS;
                it should have a variable called NumberOfFactors. It is
                overwritten in place as a TYPE=FACTOR data set.
   TransMat     output data set: the generated transformation matrix
                (ODS table OrthTrans).
   PatternOut   output data set: the pattern after rotation
                (ODS table OrthRotFactPat).
   PatternShort output data set: the rotated pattern restricted to the
                selected variables.
   method=      value passed to ROTATE= in PROC FACTOR.
   threshold=   rotated loadings whose absolute value does not exceed this
                value are set to 0; a variable is kept when at least one
                of its loadings survives.
*/

%local covars;
/* Collect every numeric column of the loading data set except
   NumberOfFactors; these are the variables to rotate. */
proc sql noprint;
     select name into :covars separated by ' '
  from   sashelp.vcolumn
  where  libname="WORK" &amp;amp; memname=upcase("&amp;amp;Loading") 
        &amp;amp;   upcase(name) NE "NUMBEROFFACTORS"
  &amp;amp;   type="num"
  ;
quit;
%put &amp;amp;covars;

/* Mark the loadings as a factor pattern so PROC FACTOR can read them
   with METHOD=PATTERN; rows are named factor1, factor2, ... */
data &amp;amp;Loading.(type=factor);
         set &amp;amp;Loading;
         _TYPE_='PATTERN';
         _NAME_=compress('factor'||_n_);
run;
/* Suppress printed output while capturing the two ODS tables. */
ods select none;
ods output OrthRotFactPat=&amp;amp;PatternOut;
ods output OrthTrans=&amp;amp;TransMat; 
proc factor  data=&amp;amp;Loading   method=pattern  rotate=&amp;amp;method  simple; 
         var &amp;amp;covars;
run;
ods select all;

/* Zero out small loadings and keep a variable when any loading survives.
   The survival test uses the absolute value: testing the signed loading
   against 0 would drop variables whose only large loadings are negative. */
data &amp;amp;PatternShort;
     set &amp;amp;PatternOut;
  array _f{*} factor:;
  _cntfac=0;
  do _j=1 to dim(_f);  
        _f[_j]=_f[_j]*(abs(_f[_j])&amp;amp;gt;&amp;amp;threshold); _cntfac+(abs(_f[_j])&amp;amp;gt;0); 
     end;
  if _cntfac&amp;amp;gt;0 then output;
  drop _cntfac _j;
run;
%mend;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
Here I try to replicate the case study in [1], which elaborates on how to apply VARIMAX rotation to PLS loadings and on the properties of the rotation. The PROC PLS output, after various tweaks on convergence criteria and singularity conditions, is still a little different from the result reported in [1] for factors other than the leading one; therefore, I will directly use the U=PS matrix on p. 215.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;

/* Loadings entered by hand: 7 rows (one per variable), 3 factor columns. */
data loading;
input factor1-factor3;
cards;
-0.9280  -0.0481  0.2750
0.0563  -0.8833  0.5306
-0.9296  -0.0450  0.2720
-0.7534  0.1705  -0.5945
0.5917  -0.0251  -0.6450
0.9082  0.3345    0.1118
-0.8086  0.4551  -0.3800
;
run;


/* Transpose so that each factor becomes a row and the 7 variables become
   the columns col1-col7, the layout used for the pattern data set below. */
proc transpose data=loading  out=loading2;
run;

/* Flag the transposed table as a TYPE=FACTOR data set holding PATTERN rows. */
data loading2(type=factor);
     retain _TYPE_ "PATTERN";
  set loading2;
run;


/* VARIMAX rotation; printed output is suppressed and the rotated pattern and
   the transformation matrix are captured through ODS OUTPUT. */
ods select none;
ods output OrthRotFactPat=OrthRotationOut;
ods output OrthTrans=OrthTrans; 
proc factor  data=Loading2   method=pattern  rotate=varimax  simple; 
         var col1-col7;
run;
ods select all;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;br /&gt;
Reference:&lt;br /&gt;
[1]&lt;b&gt; Huiwen Wang; Qiang Liu , Yongping Tu&lt;/b&gt;, "&lt;i&gt;Interpretation of PLS Regression Models with VARIMAX Rotation",&lt;/i&gt; Computational Statistics and Data Analysis, Vol.48 (2005) pp207 – 219&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-6855499064591918912?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/Czh9u2dw6Cw" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/6855499064591918912/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=6855499064591918912" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/6855499064591918912?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/6855499064591918912?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/Czh9u2dw6Cw/varimax-rotation-of-pls-loadings.html" title="VARIMAX rotation of PLS loadings" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/08/varimax-rotation-of-pls-loadings.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DE8CQHc-cSp7ImA9Wx5SFk4.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-5475932701046935991</id><published>2010-08-04T12:06:00.008-04:00</published><updated>2010-08-12T14:14:21.959-04:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2010-08-12T14:14:21.959-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Hash Object" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Manipulation" /><category scheme="http://www.blogger.com/atom/ns#" term="Index" /><title>Table Look Up in SAS, practical problems</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/xj4fdEEQBbK002ZZAjjCnbKED4g/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/xj4fdEEQBbK002ZZAjjCnbKED4g/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/xj4fdEEQBbK002ZZAjjCnbKED4g/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/xj4fdEEQBbK002ZZAjjCnbKED4g/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;One guy asked in a SAS forum about a typical table look up problem:&lt;br /&gt;
He has a data with two IDs:&lt;br /&gt;
id1 id2&lt;br /&gt;
a   b&lt;br /&gt;
a   e&lt;br /&gt;
b   c&lt;br /&gt;
b   e&lt;br /&gt;
c   e&lt;br /&gt;
d   e&lt;br /&gt;
&lt;br /&gt;
and he wants to generate a new data set with the following structure according to above information :&lt;br /&gt;
id a b c d e&lt;br /&gt;
a  0 1 0 0 1&lt;br /&gt;
b  1 0 1 0 1&lt;br /&gt;
c  0 1 0 0 1&lt;br /&gt;
d  0 0 0 0 1&lt;br /&gt;
e  1 1 1 1 0&lt;br /&gt;
&lt;br /&gt;
The real data is potentially big.&lt;br /&gt;
***************************;&lt;br /&gt;
At first look, this is a typical table look up problem that SAS programmers face almost every day, namely a lookup table with duplicate keys. It is a simple one because there is no inherent relationship among records.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* The pair list from the question: each row links id1 to id2. */
data original;
   input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;

/* Create one simple index on id1 and one on id2 so that the DATA step below
   can read ORIGINAL by key with SET ... KEY=. */
proc datasets library=work nolist;
     modify original;
  index create id1 id2;
quit;

/* List every distinct id that occurs in either column and number the rows.
   NOTE(review): seq comes from the undocumented MONOTONIC() function and is
   later used as the position of the id inside the idnames list - confirm
   that the numbering follows the sorted id order on your system. */
proc sql;
     create table all_cases as
  select a.*, monotonic() as seq
  from (
  select distinct id1 as id
  from original
  union
  select distinct id2 as id
  from original
  ) as a
  order by a.id
  ;
quit;

/* Store the ids, blank separated, in the macro variable idnames; they become
   the names of the indicator columns in the next step. */
proc sql noprint;
     select id into :idnames separated by ' '
  from   all_cases
  ;
quit;

/* Build the id-by-id indicator table: one output row per id in ALL_CASES,
   with one column per id that is 1 when the two ids appear together in a row
   of ORIGINAL (in either order) and 0 otherwise. */
data new;
  /* Load the id to seq lookup once, on the first iteration. */
  if _n_=1 then do;
     declare hash _h(dataset:'all_cases');
     _h.defineKey('id');
     _h.defineData('seq');
     _h.defineDone();
     end; 
     set all_cases;

  /* One indicator column per id, named after the ids themselves. */
  array _a{*} &amp;idnames; 

  /* Pass 1: read the rows of ORIGINAL whose id1 equals the current id through
     the id1 index, and flag the column that belongs to each partner id2. */
  id1=id;  
  set original key=id1;      
  /* _mx_ is assigned here but not referenced again and is not kept. */
  _mx_=%sysrc(_sok);
  
  do while (_iorc_=%sysrc(_sok));   
     /* find() overwrites seq with the position of the partner id. */
     rc=_h.find(key:id2); if rc=0 then _a[seq]=1;
  id1=id;
     set original key=id1;  
  
  end;
  /* The keyed read that ends the loop fails by design; clear the error flag. */
  _ERROR_=0;
  
  /* Pass 2: same logic through the id2 index, flagging the partner id1. */
  id2=id;  
  set original key=id2;      
  do while (_iorc_=%sysrc(_sok)); 
     rc=_h.find(key:id1); if rc=0 then _a[seq]=1;
  id2=id;
     set original key=id2;  
  end;
  _ERROR_=0;
  /* max(0, missing) is 0, so this turns unflagged cells into 0. */
  do j=1 to dim(_a); _a[j]=max(0, _a[j]); end;
  keep id &amp;idnames;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
On the other hand, this problem can be solved in a more SASsy way like this:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* Same pair list as in the first approach. */
data original;
   input id1 $ id2 $;
datalines;
a b
a e
b c
b e
c e
d e
;
run;

/* Form every (id1, id2) combination of the ids in ALL_CASES (the table built
   in the first listing), join the pair list to it, and set count to 1 when at
   least one joined row of ORIGINAL matches the combination on both id1 and
   id2, otherwise 0. */
proc sql;
     create table newx as
     select a.id1, a.id2, (sum(a.id1=c.id1 &amp; a.id2=c.id2)&gt;0) as count
     from   
       (select a.id as id1, b.id as id2 
        from all_cases as a, all_cases as b) as a
left join   original as c
       on   a.id1=c.id1 or a.id2=c.id1
    group by a.id1, a.id2
    ;
quit;

/* Reshape the long (id1, id2, count) table to wide form: one row per id1 and
   one column per id2 value holding count. */
proc transpose data=newx  out=_freq_t name=id2;
     by id1;
     var count;
     id id2;
run;

/* Force every numeric cell to 0/1. */
data _freq_t;
     set _freq_t;
     array _n{*} _numeric_;
     do i=1 to dim(_n);
        _n[i]=(_n[i]&gt;0);
     end;
     drop i;
run;

/* Transpose the wide table again, using the id1 values as column names, so
   that rows and columns swap roles. */
proc transpose data=_freq_t(drop=id2) out=_freq_t2  name=id1;
     id id1;
run;

/* Put the id values (used as column names) into the macro variable covars
   and the number of distinct ids into the macro variable count. */
proc sql noprint;
     select id1, count(distinct id1) into :covars separated by ' ', :count
     from   _freq_t;  
quit;

/* Make the indicator table symmetric. For each row, the values read from
   _FREQ_T are saved in a temporary array, the row at the same position of
   the transposed table _FREQ_T2 is then read over the same column names, and
   each cell becomes 1 when either of the two values is 1. */
data new2;
     set _freq_t;
     array _x{*} &amp;covars;
     array _x2{&amp;count} _temporary_;

     do j=1 to &amp;count; _x2[j]=_x[j]; end;
     set _freq_t2;
     do j=1 to &amp;count; _x[j]=(_x[j]+_x2[j]&gt;0); end;
     drop j id2;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-5475932701046935991?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/ZtARrBGYt4s" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/5475932701046935991/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=5475932701046935991" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5475932701046935991?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5475932701046935991?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/ZtARrBGYt4s/table-look-up-in-sas-practical-problems.html" title="Table Look Up in SAS, practical problems" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/08/table-look-up-in-sas-practical-problems.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUADR304fCp7ImA9Wx5VEEU.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-1890102299009695989</id><published>2010-07-30T00:19:00.009-04:00</published><updated>2010-10-03T02:36:16.334-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2010-10-03T02:36:16.334-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC DISCRIM" /><category 
scheme="http://www.blogger.com/atom/ns#" term="PROC CANDISC" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC REG" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC GLMMOD" /><title>An Economic Approach for a Class of Dimensionality Reduction Techniques</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/aA_miOlbV0dESAgnH2eH1fWgcvo/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/aA_miOlbV0dESAgnH2eH1fWgcvo/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/aA_miOlbV0dESAgnH2eH1fWgcvo/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/aA_miOlbV0dESAgnH2eH1fWgcvo/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;Just back from KDD2010. In the conference, there are several papers that interested me.&lt;br /&gt;
&lt;br /&gt;
On the computation side, Liang Sun et al.'s paper [1], "A Scalable Two-Stage Approach for a Class of Dimensionality Reduction Techniques" caught my eye. Liang proves that a class of dimension reduction techniques, such as CCA, OPLS, LDA, etc., that rely on generalized eigenvalue decomposition, can be computed in a much cheaper way by decomposing the original computation into a least squares problem and a much smaller scale eigenvalue decomposition problem. The equivalence of their two-stage approach and direct eigenvalue decomposition is rigorously proved. &lt;br /&gt;
&lt;br /&gt;
This technique is of particular interest to people like me who have only limited computing resources, and I believe it would be good to implement their algorithm in SAS. For example, a Canonical Discriminant Analysis using the above idea is demonstrated below. Note also that by specifying the RIDGE= option in PROC REG, the regularized version can be implemented as well; besides, PROC REG is multi-threaded in SAS. Of course, the computing advantage is only appreciable when the number of features is very large.&lt;br /&gt;
&lt;br /&gt;
The canonical analysis result from reduced version PROC CANDISC is the same as the full version. &lt;br /&gt;
&lt;br /&gt;
In fact, this exercise is the answer for Exercise 4.3 of The Elements of Statistical Learning [2] &lt;br /&gt;
&lt;br /&gt;
[1]. Liang Sun, Betul Ceran, Jieping Ye, "&lt;a href="http://www.public.asu.edu/%7Elsun27/Publications/KDD_2010.pdf"&gt;A Scalable Two-Stage Approach for a Class of Dimensionality Reduction Techniques&lt;/a&gt;", KDD2010, Washington DC. &lt;br /&gt;
&lt;br /&gt;
[2]. Trevor Hastie, Robert Tibshirani, Jerome Friedman, "The Elements of Statistical Learning", 2nd Edition.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.amazon.com/Elements-Statistical-Learning-Prediction-Statistics/dp/0387848576?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition (Springer Series in Statistics)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0387848576&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0387848576" style="border: medium none ! important; margin: 0px ! important; padding: 0px ! important;" width="1" /&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;


   proc format; 
      value specname 
         1='Setosa    ' 
         2='Versicolor' 
         3='Virginica '; 
   run; 
 
   data iris; 
      title 'Fisher (1936) Iris Data'; 
      input SepalLength SepalWidth PetalLength PetalWidth 
            Species @@; 
      format Species specname.; 
      label SepalLength='Sepal Length in mm.' 
            SepalWidth ='Sepal Width in mm.' 
            PetalLength='Petal Length in mm.' 
            PetalWidth ='Petal Width in mm.'; 
      symbol = put(Species, specname10.); 
      datalines; 
   50 33 14 02 1 64 28 56 22 3 65 28 46 15 2 67 31 56 24 3 
   63 28 51 15 3 46 34 14 03 1 69 31 51 23 3 62 22 45 15 2 
   59 32 48 18 2 46 36 10 02 1 61 30 46 14 2 60 27 51 16 2 
   65 30 52 20 3 56 25 39 11 2 65 30 55 18 3 58 27 51 19 3 
   68 32 59 23 3 51 33 17 05 1 57 28 45 13 2 62 34 54 23 3 
   77 38 67 22 3 63 33 47 16 2 67 33 57 25 3 76 30 66 21 3 
   49 25 45 17 3 55 35 13 02 1 67 30 52 23 3 70 32 47 14 2 
   64 32 45 15 2 61 28 40 13 2 48 31 16 02 1 59 30 51 18 3 
   55 24 38 11 2 63 25 50 19 3 64 32 53 23 3 52 34 14 02 1 
   49 36 14 01 1 54 30 45 15 2 79 38 64 20 3 44 32 13 02 1 
   67 33 57 21 3 50 35 16 06 1 58 26 40 12 2 44 30 13 02 1 
   77 28 67 20 3 63 27 49 18 3 47 32 16 02 1 55 26 44 12 2 
   50 23 33 10 2 72 32 60 18 3 48 30 14 03 1 51 38 16 02 1 
   61 30 49 18 3 48 34 19 02 1 50 30 16 02 1 50 32 12 02 1 
   61 26 56 14 3 64 28 56 21 3 43 30 11 01 1 58 40 12 02 1 
   51 38 19 04 1 67 31 44 14 2 62 28 48 18 3 49 30 14 02 1 
   51 35 14 02 1 56 30 45 15 2 58 27 41 10 2 50 34 16 04 1 
   46 32 14 02 1 60 29 45 15 2 57 26 35 10 2 57 44 15 04 1 
   50 36 14 02 1 77 30 61 23 3 63 34 56 24 3 58 27 51 19 3 
   57 29 42 13 2 72 30 58 16 3 54 34 15 04 1 52 41 15 01 1 
   71 30 59 21 3 64 31 55 18 3 60 30 48 18 3 63 29 56 18 3 
   49 24 33 10 2 56 27 42 13 2 57 30 42 12 2 55 42 14 02 1 
   49 31 15 02 1 77 26 69 23 3 60 22 50 15 3 54 39 17 04 1 
   66 29 46 13 2 52 27 39 14 2 60 34 45 16 2 50 34 15 02 1 
   44 29 14 02 1 50 20 35 10 2 55 24 37 10 2 58 27 39 12 2 
   47 32 13 02 1 46 31 15 02 1 69 32 57 23 3 62 29 43 13 2 
   74 28 61 19 3 59 30 42 15 2 51 34 15 02 1 50 35 13 03 1 
   56 28 49 20 3 60 22 40 10 2 73 29 63 18 3 67 25 58 18 3 
   49 31 15 01 1 67 31 47 15 2 63 23 44 13 2 54 37 15 02 1 
   56 30 41 13 2 63 25 49 15 2 61 28 47 12 2 64 29 43 13 2 
   51 25 30 11 2 57 28 41 13 2 65 30 58 22 3 69 31 54 21 3 
   54 39 13 04 1 51 35 14 03 1 72 36 61 25 3 65 32 51 20 3 
   61 29 47 14 2 56 29 36 13 2 69 31 49 15 2 64 27 53 19 3 
   68 30 55 21 3 55 25 40 13 2 48 34 16 02 1 48 30 14 01 1 
   45 23 13 03 1 57 25 50 20 3 57 38 17 03 1 51 38 15 03 1 
   55 23 40 13 2 66 30 44 14 2 68 28 48 14 2 54 34 17 02 1 
   51 37 15 04 1 52 35 15 02 1 58 28 51 24 3 67 30 50 17 2 
   63 33 60 25 3 53 37 15 02 1 
   ; 
   proc candisc data=iris out=outcan distance anova; 
      class Species; 
      var SepalLength SepalWidth PetalLength PetalWidth; 
   run;
 
  /* Silence printed output while the intermediate tables are built. */
  ods select none;

  /* Stage 1a: let PROC GLMMOD write the design matrix for Species.
     Only the COL: columns are kept; NOINT leaves out the intercept column. */
  proc glmmod data=iris outdesign=H(keep=COL:);
     class Species;
     model SepalLength = Species / noint;
  run;

  /* Stage 1b: place the COL: columns next to the original measurements.
     There is no BY statement, so rows are paired purely by position. */
  data H;
     merge H iris;
  run;

/**************************
for efficiency consideration, a view can also be used:
data H/view=H;
     set iris;
     array _S{*} Col1-Col3 (3*0);     
     do j=1 to dim(_S); _S[j]=0; end;
     _S[Species]=1;
     drop j;
run;
****************************/
  /* Stage 1c: regress the three COL: indicator columns on the four
     measurements; the fitted values yhat1-yhat3 are written to P. */
  proc reg data=H outest=beta;
     model Col1-Col3 = SepalLength SepalWidth PetalLength PetalWidth;
     output out=P p=yhat1-yhat3;
  run;
  quit;

  /* Re-enable printed output for the final analysis. */
  ods select all;

  /* Stage 2: canonical discriminant analysis on the fitted values
     instead of on the original measurements. */
  proc candisc data=P;
     class Species;
     var yhat1-yhat3;
  run;

&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-1890102299009695989?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/z9cIg-wjxLc" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/1890102299009695989/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=1890102299009695989" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/1890102299009695989?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/1890102299009695989?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/z9cIg-wjxLc/economic-approach-for-class-of.html" title="An Economic Approach for a Class of Dimensionality Reduction Techniques" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/07/economic-approach-for-class-of.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUEBR34-fip7ImA9WxFaEU8.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-7542050131525605054</id><published>2010-07-12T23:06:00.002-04:00</published><updated>2010-07-14T13:14:16.056-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2010-07-14T13:14:16.056-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC PRINCOMP" /><category 
scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC SCORE" /><category scheme="http://www.blogger.com/atom/ns#" term="PROC FASTCLUS" /><category scheme="http://www.blogger.com/atom/ns#" term="SVD" /><title>Implement Randomized SVD in SAS</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/dH5Ih9YvIfBBiguGyHxkBLX_RNA/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/dH5Ih9YvIfBBiguGyHxkBLX_RNA/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/dH5Ih9YvIfBBiguGyHxkBLX_RNA/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/dH5Ih9YvIfBBiguGyHxkBLX_RNA/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;In the 2010 SASware Ballot®, a dedicated PROC for Randomized SVD was among the options. While an official SAS PROC will not be available in the immediate future as well as in older SAS releases, it is fairly simple to implement this algorithm using existing SAS/STAT procedures.&lt;br /&gt;
&lt;br /&gt;
Randomized SVD will be useful for large scale, high dimension data mining problems, for instance Text Mining. In SAS/Base and SAS/STAT, lack of sparse matrix operation capability puts any serious Text Mining task at the edge of infeasibility, such as using LSI or NMF algorithms. Randomized SVD provides an economic alternate solution by sacrificing a little accuracy which is bounded under the three sampling schema proposed by the authors [1], while the code below demos sampling schema 1.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

/* Randomized SVD with sampling schema 1. */
%let dim=2048;
%let nobs=1e4;
%let s=256;
/* Simulate the test matrix: one output row per id, one column per x variable.
   Each cell is sin(mod(id, column index)) plus a RANNOR draw. */
data matrix;
   array _x{*} x1-x&amp;amp;dim;
   do id = 1 to &amp;amp;nobs;
      do _j = 1 to dim(_x);
         _x[_j] = sin(mod(id, _j)) + rannor(id);
      end;
      output;
   end;
   drop _j;
run;

%let datetime_start = %sysfunc(TIME()) ;
%let time=%sysfunc(datetime(), datetime.); %put &amp;amp;time;
data seed;
     array _x{*} x1-x&amp;amp;dim;
     do _j=1 to dim(_x); _x[_j]=0; end;
  output;
  stop;
run;

proc fastclus data=matrix  seed=seed      out=norm(keep=ID DISTANCE)
              maxiter=0    maxclusters=1  noprint  replace=none;
     var x1-x&amp;amp;dim;
run;
data normv/ view=normv;
     set norm(keep=DISTANCE);
     DISTANCE2=DISTANCE**2;
     drop DISTANCE;
run;
proc means data=normv noprint;
     var DISTANCE2;
     output  out=matrixnorm  sum(DISTANCE2)=Frobenius_sqr;
run;
/* Turn each row's DISTANCE into a sampling rate:
   _rate_ = DISTANCE**2 / Frobenius_sqr.
   The first SET reads the summary row written by PROC MEANS (which has no
   CLASS or BY statement); the DO UNTIL loop then reads every row of NORM
   within that same step iteration, so Frobenius_sqr is available for each. */
data prob;
     set matrixnorm ;
  retain Frobenius_sqr;
  do until (eof);
     /* eof becomes 1 on the last row of NORM and ends the loop */
     set norm  end=eof;
  _rate_=DISTANCE**2/Frobenius_sqr;
  /* only the row identifier and its rate are written to PROB */
  keep ID _rate_;
  output;
  end;
run;

data matrixv/view=matrixv;
     merge matrix  prob(keep=_rate_);
run;

ods select none;
proc surveyselect data=matrixv  out=matrixsamp(drop=SamplingWeight  ExpectedHits  NumberHits _rate_)  
                   sampsize=&amp;amp;s  method=pps_wr   outhits  ;
     size _rate_;
run;
ods select all;

proc transpose data=matrixsamp  out=matrixsamp;
     var x1-x&amp;amp;dim;
run;

proc princomp data=matrixsamp  outstat=testv(where=(_type_ in ("USCORE")))  
              noint  cov  noprint;
  var col1-col&amp;amp;s;
run;
data testV_t/view=testV_t;
     retain _TYPE_ 'PARMS';
  set testv(drop=_TYPE_);
run;     

proc score data=matrixsamp   score=testV_t  type=parms  
           out=SW(keep=ID Prin:);
    var   col1-col&amp;amp;s;
run;

data seed;
     array _s{*}  prin1-prin&amp;amp;s;
  do _j=1 to dim(_s); _s[_j]=0; end;
  drop _j; output; stop;
run;

proc fastclus data=SW   seed=seed  maxiter=0  maxc=1  replace=NONE   out=SW2(drop=CLUSTER)  noprint;
     var prin1-prin&amp;amp;s;
run;

data HHT;
     set SW2;
  array _x{*}  prin1-prin&amp;amp;s;
  do _j=1 to dim(_x); _x[_j]=(_x[_j]/distance)**2; end;
  drop _j  distance;
run;

proc transpose data=HHT  out=HHT2(drop=_LABEL_);      
run;
/* Tag the transposed table as a TYPE=PARMS scoring table for PROC SCORE and
   rename its COLn columns back to the x variable names.
   The column range follows the dim macro variable set at the top of the
   program instead of a hard-coded 2048, so the step keeps working when the
   matrix width is changed there. */
data HHT2; 
     _TYPE_='PARMS';
     set HHT2; 
  rename COL1-COL&amp;amp;dim=x1-x&amp;amp;dim;
run;

proc score data=matrix   score=HHT2  type=parms  out=P(drop=x:);
     var x:;
run;     

%let time=%sysfunc(datetime(), datetime.); %put &amp;amp;time;
%put PROCESSING TIME:  %sysfunc(putn(%sysevalf(%sysfunc(TIME())-&amp;amp;datetime_start.),mmss.)) (mm:ss) ;
options notes source;

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
Reference:&lt;br /&gt;
[1], &lt;strong&gt;&lt;em&gt;P. Drineas and M. W. Mahoney&lt;/em&gt;&lt;/strong&gt;, "Randomized Algorithms for Matrices and Massive Data Sets", Proc. of the 32nd Annual Conference on Very Large Data Bases (VLDB), p. 1269, 2006.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-7542050131525605054?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/lNKI03L8_28" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/7542050131525605054/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=7542050131525605054" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/7542050131525605054?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/7542050131525605054?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/lNKI03L8_28/implement-randomized-svd-in-sas.html" title="Implement Randomized SVD in SAS" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/07/implement-randomized-svd-in-sas.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUIDSH86eSp7ImA9WxFaEU8.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-26676881494384467</id><published>2010-06-25T21:48:00.012-04:00</published><updated>2010-07-14T13:12:59.111-04:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2010-07-14T13:12:59.111-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC FASTCLUS" /><category scheme="http://www.blogger.com/atom/ns#" term="SVD" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><title>"Entrywise" Norm calculation using PROC FASTCLUS</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/rSE7F2ZwZtq8H8hchiyCTPIjaQo/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/rSE7F2ZwZtq8H8hchiyCTPIjaQo/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/rSE7F2ZwZtq8H8hchiyCTPIjaQo/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/rSE7F2ZwZtq8H8hchiyCTPIjaQo/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;In some data mining applications, matrix norm has to be calculated, for instance [1]. You can find a detailed explanation of Matrix Norm on Wiki @ &lt;a href="http://en.wikipedia.org/wiki/Matrix_norm"&gt;Here&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
Instead of user written routine in DATA STEP, we can obtain "Entrywise" norm via PROC FASTCLUS efficiently and accurately.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; height: 658px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 98.32%;"&gt;&lt;code&gt;
data matrix;
     input X1-X5;
datalines;
1 2 4 5 6
7 8 9 0 1
2 3 4 5 6
3 4 5 6 7
7 8 9 0 2
2 4 6 8 0
;
run;

data seed;
     input X1-X5;
datalines;
0 0 0 0 0
;
run;

options nosource;
proc export data=matrix  outfile='c:\matrix.csv'  dbms=csv replace; run;
options source;

proc fastclus data=matrix  seed=seed      out=norm(keep=DISTANCE)
              maxiter=0    maxclusters=1  noprint  ;
     var x1-x5;
run;

/* 
In output data set NORM, variable DISTANCE is the Euclidean norm of each row, i.e. the distance from that row to the all-zero seed. If LEAST=P option is specified, then p-norm is calculated. In PROC FASTCLUS, you can specify p in the range of  [1, \inf].

Now what you have is the vector norm of each row; taking the sum of squares of DISTANCE gives the squared Frobenius norm of the data matrix (take its square root for the norm itself), which can be easily obtained through PROC MEANS on a data view: 
*/
data normv/ view=normv;
     set norm(keep=DISTANCE);
     DISTANCE2=DISTANCE**2;
     drop DISTANCE;
run;
proc means data=normv noprint;
     var DISTANCE2;
     output  out=matrixnorm  sum(DISTANCE2)=Frobenius_sqr;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
You can use the following R code to verify the results;&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
mat &amp;lt;- read.csv('c:/matrix.csv', header=T)
#verify vector norm
vnorm &amp;lt;- apply(mat, 1, function(x){sqrt(sum(x^2))});
#verify norm of the matrix
x&amp;lt;-as.matrix(mat)
sqrt(sum(diag(t(x)%*%x)))
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
PS: &lt;br /&gt;
1. Of course, the above process is designed for implementing the randomized SVD in [1]. If only the matrix Frobenius norm is of interest, you can also use the following code snippet:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; height: 380px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 98.32%;"&gt;&lt;code&gt;
/* View that exposes the element-wise squares of x1-x5 as y1-y5. */
data matrixv / view=matrixv;
   set matrix;
   array _in{*}  x1-x5;
   array _out{*} y1-y5;
   do _k = 1 to dim(_in);
      _out[_k] = _in[_k]**2;
   end;
   /* only the squared columns are visible through the view */
   keep y1-y5;
run;

proc means data=matrixv  noprint;
     var y1-y5;
     output  out=_var(drop=_TYPE_  _FREQ_)   sum()=/autoname;
run;

data _null_;
     set _var;  
     norm=sqrt(sum(of _numeric_));
     put norm=;
run;
/* --LOG WRITES:
norm=28.635642127
NOTE: There were 1 observations read from the data set WORK._VAR.
*/

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
2. Using its built-in computing engine for Euclidean distance, PROC FASTCLUS is also a powerful tool to search for the data point in the main table that is CLOSEST to a record in the lookup table. This technique is shown&amp;nbsp;&lt;a href="http://www.sas-programming.com/2009/09/tweak-proc-fastclus-for-1-nearest.html"&gt;Here&lt;/a&gt; and [2].&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;strong&gt;&lt;em&gt;Reference:&lt;/em&gt;&lt;/strong&gt;&lt;br /&gt;
[1], &lt;strong&gt;P. Drineas and M. W. Mahoney&lt;/strong&gt;, "&lt;em&gt;Randomized Algorithms for Matrices and Massive Data Sets&lt;/em&gt;", Proc. of the 32nd Annual Conference on Very Large Data Bases (VLDB), p. 1269, 2006.&lt;br /&gt;
&lt;br /&gt;
[2], &lt;strong&gt;Dorfman, Paul M.; Vyverman, Koen; Dorfman, Victor P.,&lt;/strong&gt; "&lt;em&gt;Black Belt Hashigana&lt;/em&gt;", Proc. of the 2010 SAS Global Forum, Seattle, WA, 2010&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-26676881494384467?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/RjNo7zm2JHY" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/26676881494384467/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=26676881494384467" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/26676881494384467?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/26676881494384467?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/RjNo7zm2JHY/entrywise-norm-calculation-using-proc.html" title="&quot;Entrywise&quot; Norm calculation using PROC FASTCLUS" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/06/entrywise-norm-calculation-using-proc.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkcNRHoyeSp7ImA9WxFUFU8.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-2897517425666845174</id><published>2010-06-01T17:20:00.004-04:00</published><updated>2010-06-25T22:41:35.491-04:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2010-06-25T22:41:35.491-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="Boost Algorithms" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><title>Boost to tackle nonlinearity</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/vxjN9NmrYX5QuayULzfxI_qWna8/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/vxjN9NmrYX5QuayULzfxI_qWna8/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/vxjN9NmrYX5QuayULzfxI_qWna8/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/vxjN9NmrYX5QuayULzfxI_qWna8/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* Simulate a binary response whose success probability follows a sine
   wave in x: 627 values of x, each with 100 evenly spaced x1 offsets. */
data nonlinear;
   do x = 1 to 627;
      p = (sin(x/100) + 1) * 0.45;
      do j = 1 to 100;
         x1 = x + (j - 1)/100;
         /* the comparison itself evaluates to 1 or 0 */
         y = (ranuni(8655645) &amp;lt;= p);
         output;
      end;
   end;
   drop p j;
run;

proc rank data=nonlinear  out=nonlinearrank groups=20;
     var x1;
  ranks rank1;
run;

proc means data=nonlinearrank noprint;
     class rank1;
  var y x1;
  output out=_mean(where=(_type_=1))  mean(y)=y  mean(x1)=x1;
run;

%inc "C:\Documents and Settings\lxie\Desktop\SAS Prog and Docs\Boost macro2 ver3.0.sas";
%inc "C:\Documents and Settings\lxie\Desktop\SAS Prog and Docs\predict macro.sas";

%boost2(nonlinear, 1, outputds,  outwts, 100, 3);

/* Run the predict macro once per boosting iteration 1..niter, then merge
   the per-iteration outputs sumpred1..sumpredN side by side into
   SUMPRED_ALL, renaming sum_pred to sum_pred1..sum_predN.
   niter: number of iterations to score. */
%macro pred(niter);
/* keep the loop index private so a macro variable named i in an outer
   scope is not overwritten by this macro */
%local i;
%do i=1 %to &amp;amp;niter;
  %predict(nonlinear, 1, outputds, outlogds, out_pred, sumpred&amp;amp;i, 3, &amp;amp;i);
%end;
data sumpred_all;
     merge %do i=1 %to &amp;amp;niter;
            sumpred&amp;amp;i.(rename=(sum_pred=sum_pred&amp;amp;i))
     %end;;
run;
%mend;

%pred(100);

proc datasets library=work nolist;
     delete sumpred1-sumpred100;
quit;

options nosource;
proc export data=sumpred_all  outfile="c:\sumpred.csv"  dbms=csv replace; run;
options source;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
In R, use this code piece to recover the animation:&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
library(caTools);
test&amp;lt;-read.csv('c:/sumpred.csv', header=T)
minmax&amp;lt;-quantile(as.matrix(test), c(0,1))
sz=314; p=100;
y&amp;lt;-sort(sample(1:62700, size=sz))
image=array(0, c(sz, p, 100))
# Rasterise each of the 100 iterations into one frame of the animation:
# frame i gets a 1 at (row j, column trace[j]) for every sampled record y[j].
for (i in (1:100)){
  # Rescale the scores to 0..p, then clamp to 1..p. R silently ignores an
  # assignment to index 0, so the lowest scores would otherwise never be drawn.
  trace=pmax(round((test[y,i]-minmax[1])/(minmax[2]-minmax[1])*p), 1);
  for(j in (1:sz)){
     image[j, trace[j], i]=1
  }
}
write.gif(image, "c:/boost.gif", col=gray(1:2/2))
im = read.gif("c:/boost.gif")
for(i in 1:100){
   image(im$image[,,i], col=(im$col), 
         main=paste('Iter', i), y=1:100, x=1:314, 
         xlab="Index", ylab="Percentage of 1")
}

test2&amp;lt;-test[,-1]-test[-100]
test2&amp;lt;-cbind(test[,1], test2)

par(mfrow=c(2,1))
for (i in (1:100)){
    plot(test2[,i], type='l', ylim=c(-0.5, 1), 
         main=paste('Iteration', i, 'Delta'),
         ylab='Delta')
    plot(test[,i], type='l', ylim=minmax, 
         main=paste('Iteration', i, 'Score'),
         ylab='Score')
    Sys.sleep(0.1)
}

&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none; clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/TAWGGMbs65I/AAAAAAAAATQ/LC_o7LTZT90/s1600/boost.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" gu="true" height="638" src="http://4.bp.blogspot.com/_slrAR0IXTL0/TAWGGMbs65I/AAAAAAAAATQ/LC_o7LTZT90/s640/boost.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-2897517425666845174?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/YJDxnC5ICmQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/2897517425666845174/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=2897517425666845174" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2897517425666845174?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2897517425666845174?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/YJDxnC5ICmQ/boost-to-tackle-nonlinearity.html" title="Boost to tackle nonlinearity" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" 
url="http://4.bp.blogspot.com/_slrAR0IXTL0/TAWGGMbs65I/AAAAAAAAATQ/LC_o7LTZT90/s72-c/boost.png" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/06/boost-to-tackle-nonlinearity.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A08NQXs9eip7ImA9WxFQEUo.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-775463015665104754</id><published>2010-05-05T18:18:00.007-04:00</published><updated>2010-05-06T17:04:50.562-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2010-05-06T17:04:50.562-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="PROC DISCRIM" /><category scheme="http://www.blogger.com/atom/ns#" term="Nearest Neighbor" /><category scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="KNN" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><title>K-Nearest Neighbor in SAS</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/AAI9N0wNnrKWm2ODbaRacE26u8g/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/AAI9N0wNnrKWm2ODbaRacE26u8g/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/AAI9N0wNnrKWm2ODbaRacE26u8g/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/AAI9N0wNnrKWm2ODbaRacE26u8g/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;K-Nearest-Neighbor, aka KNN, is a widely used data mining tool and is often called a memory-based/case-based/instance-based method as no model is fit. A good introduction to KNN can be found at [1], or @ &lt;a href="http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm"&gt;Wiki&lt;/a&gt;. &lt;br /&gt;
&lt;br /&gt;
Typically, the KNN algorithm relies on a sophisticated data structure called kd-Tree [2] to quickly find the closest points for a given point in high dimensional space. While you can find good pseudo-code for kd-Tree implementation and KNN online everywhere, for example [3], it is not trivial to implement your own in SAS [I mean an efficient one]. For most analysts, the first idea is to turn to your &lt;b&gt;$200K-Initial-Fee-AND-$60K-Per-Year&lt;/b&gt; Enterprise Miner(R) for this method; however, it turns out that PROC DISCRIM in SAS/STAT is able to do the same thing! Note that the annual license fee for SAS/STAT is a tiny fraction of that of EM and most analysts have ready access to SAS/STAT.&lt;br /&gt;
&lt;br /&gt;
Simply ask PROC DISCRIM to use the nonparametric method by specifying the option "METHOD=NPAR K=". Note: do not use the "R=" option at the same time, since it corresponds to the radius-based variant of the nearest-neighbor method. Also pay attention to how PROC DISCRIM treats categorical data automatically. Sometimes, you may want to change categorical data into metric coordinates in advance.&lt;br /&gt;
&lt;br /&gt;
Because KNN is a memory-based method, when you score the test data or new data in production, you have to use the raw table in the scoring process. Test different values of K using Cross-Validation to select the best one [you need a macro loop then.]&lt;br /&gt;
&lt;br /&gt;
PROC DISCRIM uses memory approximately proportional to the second order of number of variables and time usage, excluding I/O time, is roughly proportional to log(N)*(N*P) where N is the number of observations and P is the number of variables used as it uses the tree search algorithm in [4]. &lt;br /&gt;
&lt;br /&gt;
Therefore, time consumption is still a concern on large data set. As an experiment, I conducted KNN on data set with records ranging from 5000 to 50000 while scoring the same amount of records at the same time, using 20 features and let k=11, we observe:&lt;br /&gt;
&lt;br /&gt;
obs&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; time (sec)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; memory (KB)&lt;br /&gt;
5000&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.03&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 1068&lt;br /&gt;
10000&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3.90&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;1949&lt;br /&gt;
15000&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 8.65&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2830&lt;br /&gt;
20000&amp;nbsp;&amp;nbsp;&amp;nbsp; 15.48&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 3709&lt;br /&gt;
25000&amp;nbsp; &amp;nbsp; 34.58&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 4587&lt;br /&gt;
30000&amp;nbsp;&amp;nbsp;&amp;nbsp; 70.26&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 5472&lt;br /&gt;
35000&amp;nbsp; 109.83&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 6350&lt;br /&gt;
40000&amp;nbsp; 161.71&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 7229&lt;br /&gt;
45000&amp;nbsp; 208.80&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 8108&lt;br /&gt;
50000&amp;nbsp; 263.58&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 8987&lt;br /&gt;
&lt;br /&gt;
Empirical time usage is roughly quadratic in the multiplier of per 5000 observations, which means to work on a data set with 300K observations and score the same number of records, SAS needs to take 3.75 hours! Large K value will only&amp;nbsp;increase time used by a very small fraction, though.&lt;br /&gt;
Sample code using Iris data from SAS Online Doc for v9.1.3:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* Randomly split IRIS in half. OUTALL keeps every input row in IRIS2 and flags
   the sampled rows with SELECTED=1 (the rest get SELECTED=0). No SEED= is given,
   so the split differs from run to run. ODS output is switched off for this step. */
ods select none;
proc surveyselect data=iris  out=iris2  
                  samprate=0.5  method=srs  outall;
run;
ods select all;

/* K is the number of nearest neighbors used by PROC DISCRIM below. */
%let k=5;
/* Nonparametric nearest-neighbor classification (METHOD=NPAR with K=):
   rows with SELECTED=1 are the training data, rows with SELECTED=0 are scored
   and written to IRIS2TESTOUT. */
proc discrim data=iris2(where=(selected=1))   
             test=iris2(where=(selected=0))
             testout=iris2testout
             method=NPAR k=&amp;amp;k 
             listerr crosslisterr; 
      class Species; 
      var SepalLength SepalWidth PetalLength PetalWidth; 
      title2 'Using KNN on Iris Data'; 
run; 

/* Confusion matrix on the scored rows: actual Species by the class assigned
   by PROC DISCRIM (_INTO_). */
proc freq data=iris2testout;
     table Species*_INTO_;
run;
&lt;/code&gt;&lt;/pre&gt;&lt;br /&gt;
Reference:&lt;br /&gt;
[1]. &lt;b&gt;Trevor Hastie, Robert Tibshirani, Jerome Friedman&lt;/b&gt;, "Elements of Statistical Learning", Ed.2, Springer, 2008;&lt;br /&gt;
[2]. &lt;b&gt;J. L. Bentley&lt;/b&gt;. "Multidimensional binary search trees used for associative searching", Communications of the ACM, 18(9):509-517, 1975;&lt;br /&gt;
[3]. &lt;a href="http://simsearch.yury.name/tutorial.html"&gt;http://simsearch.yury.name/tutorial.html&lt;/a&gt;&amp;nbsp;;&lt;br /&gt;
[4]. &lt;strong&gt;Friedman, J.H., Bentley, J.L., and Finkel, R.A&lt;/strong&gt;., "An Algorithm for Finding Best Matches in Logarithmic Expected Time," ACM Transactions on Mathematical Software, 3, 209 - 226. 1977;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.amazon.com/Elements-Statistical-Learning-Prediction-Statistics/dp/0387848576?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition (Springer Series in Statistics)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=0387848576&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=0387848576" style="border-bottom: medium none; border-left: medium none; border-right: medium none; border-top: medium none; margin: 0px; padding-bottom: 0px! important; padding-left: 0px! important; padding-right: 0px! important; padding-top: 0px! important;" width="1" /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-775463015665104754?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/D7Ib6XRCVGA" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/775463015665104754/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=775463015665104754" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/775463015665104754?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/775463015665104754?v=2" /><link rel="alternate" type="text/html" 
href="http://feedproxy.google.com/~r/SasProgramming/~3/D7Ib6XRCVGA/k-nearest-neighbor-in-sas.html" title="K-Nearest Neighbor in SAS" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/05/k-nearest-neighbor-in-sas.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ck8GRnw9fCp7ImA9Wx5TGEo.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-5514905091651473965</id><published>2010-05-05T11:50:00.006-04:00</published><updated>2010-08-03T17:27:07.264-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2010-08-03T17:27:07.264-04:00</app:edited><title>Next Project: Regularized Logistic Regression</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/cHNFX25MB-HlqmavZ-ZzT8_4NI0/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/cHNFX25MB-HlqmavZ-ZzT8_4NI0/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/cHNFX25MB-HlqmavZ-ZzT8_4NI0/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/cHNFX25MB-HlqmavZ-ZzT8_4NI0/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;L1 Regularized Logistic Regression effectively handles large number of predictors and serves variable selection simultaneously. [1] indicates that L1 RLR can be implemented via IRLS-LARS algorithm. You can tweak PROC GLMSELECT in v9.2 for this.&lt;br /&gt;
&lt;br /&gt;
L2 Regularized Logistic Regression can be used to approximate SVM solutions [2], and can be implemented via TR-IRLS as suggested by [3], which is a ridge LR. &lt;br /&gt;
&lt;br /&gt;
Reference:&lt;br /&gt;
[1]&lt;b&gt;Su-In Lee, Honglak Lee, Pieter Abbeel and Andrew Y. Ng&lt;/b&gt;, "Efficient L1 Regularized Logistic Regression", Proceedings of Annual Conference of American Association for Artificial Intelligence, 2006&lt;br /&gt;
&lt;br /&gt;
[2] &lt;b&gt;Jian Zhang, Rong Jin, Yiming Yang &amp;amp; Alex G. Hauptmann&lt;/b&gt;, "Modified Logistic Regression: An Approximation to SVM and Its Applications in Large-Scale Text Categorization", Proceedings of the Twentieth International Conference on Machine Learning (ICML-2003), Washington DC, 2003.&lt;br /&gt;
&lt;br /&gt;
[3]&lt;b&gt;Paul Komarek&lt;/b&gt;, "Logistic Regression for Data Mining and High-Dimensional Classification", Ph.D Dissertation, Robotics Institute, Carnegie Mellon University, 2004&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;

&lt;/code&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-5514905091651473965?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/lfDWftnvgKY" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/5514905091651473965/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=5514905091651473965" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5514905091651473965?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5514905091651473965?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/lfDWftnvgKY/next-project-regularized-logistic.html" title="Next Project: Regularized Logistic Regression" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/05/next-project-regularized-logistic.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CUYERHc_eyp7ImA9WhRUFko.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-2634090998737172251</id><published>2010-04-30T16:27:00.002-04:00</published><updated>2012-01-27T09:05:05.943-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-01-27T09:05:05.943-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><category 
scheme="http://www.blogger.com/atom/ns#" term="R" /><title>Conduct R analysis within SAS</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/bj0YF-vqJYJjRD2FkPSauwacKls/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/bj0YF-vqJYJjRD2FkPSauwacKls/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/bj0YF-vqJYJjRD2FkPSauwacKls/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/bj0YF-vqJYJjRD2FkPSauwacKls/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;&lt;br /&gt;
R is attractive to statistical analysts for its ease of use and ready access to packages implementing modern methodologies. If you have IML, you can submit R commands within the SAS/IML environment; see Rick's post @ &lt;a href="http://blogs.sas.com/content/iml/2011/05/13/calling-r-from-sasiml-software/" target="_blank"&gt;here&lt;/a&gt;. Unfortunately, not all analysts have licensed IML. To work around this limitation, I proposed the following technique to submit R statements in a SAS/Base environment. &lt;br /&gt;
&lt;br /&gt;
To exchange data between R and SAS, I pushed SAS dataset into a CSV file that can be read by R. Instead, with some more coding, we can leverage the 'sas7bdat' R package to read SAS data directly into R, see Charlie Huang's blog @ &lt;a href="http://www.sasanalysis.com/2011/07/sas-dataset-declassified-by-matt.html" target="_blank"&gt;here&lt;/a&gt; for a demonstration.&lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border-bottom: rgb(153,153,153) 1px dashed; border-left: rgb(153,153,153) 1px dashed; border-right: rgb(153,153,153) 1px dashed; border-top: rgb(153,153,153) 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"&gt;&lt;code&gt;
/* RScript: opens a DATA _NULL_ step that copies in-stream lines verbatim to the
   file named by the Rscript argument. The step is intentionally left open (no
   RUN statement): the caller must follow the macro call with a CARDS4 block
   holding the R program, and then RUN. */
%macro RScript(Rscript);
data _null_;
     file "&amp;amp;Rscript";
     infile cards;
     input;
     put _infile_;
%mend;

 
/* CallR: runs R in batch mode on the script file Rscript and writes the R
   session output to Rlog. WAIT makes SAS block until the task has finished;
   the task status is stored in the macro variable rjobstatus1.
   The path to R.exe is hard-coded for one specific installation. */
%macro CallR(Rscript, Rlog);
systask command "C:\Progra~1\R\R-2.8.0\bin\R.exe CMD BATCH --vanilla --quiet
                    &amp;amp;Rscript  &amp;amp;Rlog "
        taskname=rjob1  wait  status=rjobstatus1;
%mend;

/****************************/
/* Test data: 300 rows; each row holds the counter i followed by 100 draws
   x1-x100 from RANNOR (standard normal) with seed 98765. */
data a;
     length i 4;
     array _x{100} x1-x100;
     do i=1 to 300;
        do j=1 to dim(_x); _x[j]=rannor(98765); end;
        output;    drop j;
     end;
run;

/* Hand the data to R as a CSV file. */
proc export data=a  outfile="c:\a.csv"  dbms=csv; run;

/* Write the R program to c:\rscript.r. The in-stream lines read the CSV, run
   svd on columns 2 to 101 (x1-x100; column 1 is the counter i) and print the
   5 x 5 corner of u and the first 8 values of d. The four semicolons end the
   CARDS4 block, and RUN closes the DATA step opened by the macro. */
%RScript(c:\rscript.r)
cards4;
dcsv &amp;lt;- read.csv('c:/a.csv', header=T);
dsvd&amp;lt;-svd(dcsv[,2:101]);
dsvd$u[1:5, 1:5];
dsvd$d[1:8];
;;;;
run;

/* Execute the script; R output goes to c:/rlog1.txt. */
%CallR(c:/rscript.r, c:/rlog1.txt);

/* Echo the R output file line by line into the SAS log. */
data _null_;
     infile "c:\rlog1.txt";
     input;
     put _infile_;
run;
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/S9tK4pE8z5I/AAAAAAAAASs/OaHm21mxkzE/s1600/SAS2R.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://4.bp.blogspot.com/_slrAR0IXTL0/S9tK4pE8z5I/AAAAAAAAASs/OaHm21mxkzE/s640/SAS2R.PNG" tt="true" width="576" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-2634090998737172251?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/H1aGxVdOGAQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/2634090998737172251/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=2634090998737172251" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2634090998737172251?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/2634090998737172251?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/H1aGxVdOGAQ/conduct-r-analysis-within-sas.html" title="Conduct R analysis within SAS" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/_slrAR0IXTL0/S9tK4pE8z5I/AAAAAAAAASs/OaHm21mxkzE/s72-c/SAS2R.PNG" height="72" width="72" 
/><thr:total>1</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/04/conduct-r-analysis-within-sas.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0ADQXw_eSp7ImA9Wx5TFEk.&quot;"><id>tag:blogger.com,1999:blog-29815492.post-5278479807387489988</id><published>2010-04-21T16:52:00.032-04:00</published><updated>2010-07-29T19:22:50.241-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2010-07-29T19:22:50.241-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="predictive modeling" /><category scheme="http://www.blogger.com/atom/ns#" term="Boost Algorithms" /><title>Improve the Boost macro from Prof. Rayens, W and Dr. Johnson, K</title><content type="html">
&lt;p&gt;&lt;a href="http://feedads.g.doubleclick.net/~a/GExQfALXS0VVCzBVeCRw_b94DJM/0/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/GExQfALXS0VVCzBVeCRw_b94DJM/0/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href="http://feedads.g.doubleclick.net/~a/GExQfALXS0VVCzBVeCRw_b94DJM/1/da"&gt;&lt;img src="http://feedads.g.doubleclick.net/~a/GExQfALXS0VVCzBVeCRw_b94DJM/1/di" border="0" ismap="true"&gt;&lt;/img&gt;&lt;/a&gt;&lt;/p&gt;In Chapter 2 of the book "&lt;b&gt;&lt;i&gt;Pharmaceutical Statistics Using SAS: A Practical Guide&lt;/i&gt;&lt;/b&gt;" (SAS Press), Prof. Rayens, W and Dr. Johnson K. presented their SAS implementation of boost algorithms, including AdaBoost, RealBoost, GentleBoost and LogitBoost. The original SAS macro can be found at &lt;a href="http://ftp.sas.com/samples/A60622"&gt;Here&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
Their macro uses PROC IML and is preferred by my colleagues since it is not I/O bound as opposed to&amp;nbsp;my DATA STEP implementation. &lt;b&gt;One&amp;nbsp;shortcoming of their macro is inefficiency because I&amp;nbsp;think it is for&amp;nbsp;DEMONSTRATION purpose rather than for serious applications&lt;/b&gt;. The way they wrote the program shows too many redundant calculations as well as inefficient matrix operations.&amp;nbsp;As a result,&amp;nbsp;their macro is not able to handle a data set with more than 5000 observations, and on my PC [Intel E6750 2.66GHz, 7.4Gflops/core, 4G memory], it took 1m41s to go over 10 iterations on a data set with 2000 observations and 10 numerical variables and consumed 158MB memory. Note that the resource consumptions increase quadratically in the number of observations. In another experiment with 4000 observations, the macro&amp;nbsp;consumed 752MB memory and took 6m53s to go over 10 iterations on 10 variables. My colleagues asked me if I could make some improvement so that this SAS implementation is usable in industrial applications where data sets with &amp;gt;10K observations and hundreds of features&amp;nbsp;are more than common.&lt;br /&gt;
&lt;br /&gt;
In their implementation, an upper triangular matrix and a lower triangular matrix are used to obtain cumulative weighted sums from either direction of the sorted data, which increases memory consumption and&amp;nbsp;calculation time at quadratic rates. But in PROC IML, there is a built-in function called CUSUM that can be used to obtain cumulative sums lightning fast. In order to use the CUSUM function to replace the cumbersome matrix operation, we also need to pay attention to the dimension change. Since no huge triangular matrices are involved, RAM consumption is also significantly reduced, which in turn means we can process bigger data [see experiment below]. I did a test on the speed and memory consumption using AdaBoost&amp;nbsp;on 2000 observations and 10 variables, with 10 iterations; before and after the improvement we see:&lt;br /&gt;
&lt;br /&gt;
2000 Obs:&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/S894iQVB56I/AAAAAAAAAR0/ALT7LpRDENk/s1600/speed.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://4.bp.blogspot.com/_slrAR0IXTL0/S894iQVB56I/AAAAAAAAAR0/ALT7LpRDENk/s640/speed.PNG" width="530" wt="true" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
4000 Obs:&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/S897Q4QRQWI/AAAAAAAAAR8/8gkbdARdqGA/s1600/speed4K.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://2.bp.blogspot.com/_slrAR0IXTL0/S897Q4QRQWI/AAAAAAAAAR8/8gkbdARdqGA/s640/speed4K.PNG" width="512" wt="true" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Consider the difference showing above, with 2000 observations, new macro used less than 0.3s and merely 1.1MB memory while the original one used 102.17s and 158.3MB memory. With 4000 observations, original macro used 752MB and 6:53s, as comparison, new macro used 0.5s and 1.9MB memory. We see&amp;nbsp;that the memory and time consumption is no longer O(n^2) but rather O(n), where n is the number of observations. Both macro produced almost identical results in two experiments.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/S89jRVCCeuI/AAAAAAAAARU/0GGmazwnVsw/s1600/Results+Accuracy.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="332" src="http://1.bp.blogspot.com/_slrAR0IXTL0/S89jRVCCeuI/AAAAAAAAARU/0GGmazwnVsw/s640/Results+Accuracy.PNG" width="640" wt="true" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
With 500K observations, the original macro was not able to proceed but the improved one finished successfully&amp;nbsp;in about&amp;nbsp;70s and took up only 194MB memory:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/_slrAR0IXTL0/S9BxhSDIQdI/AAAAAAAAASU/esKFoqfslMA/s1600/speed+large.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://4.bp.blogspot.com/_slrAR0IXTL0/S9BxhSDIQdI/AAAAAAAAASU/esKFoqfslMA/s640/speed+large.PNG" width="544" wt="true" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;With the much reduced processing time, on top of the boost algorithms, analysts are now able to apply more computationally intensive algorithms, such as Bagging.&lt;br /&gt;
&lt;br /&gt;
I also tried to improve the REGSPLIT_IML subroutine. Yesterday, time usage&amp;nbsp;was&amp;nbsp;reduced 69%&amp;nbsp;[1m21s vs 25s in 2K records] comparing to&amp;nbsp;original macro but still increases quadratically with number of observation, memory consumption is much reduced and increase linearly with size of data. With careful study of the&amp;nbsp;calculation and formula involved, I further decomposed the matrix operation&amp;nbsp;into more efficient mathematical calculations, and now the time usage is further reduced to only 0.9% [329.84s vs 2.59s in 4K records] of original macro and increase only linearly with the number of records. Therefore we are practically able to use GentleBoost&amp;nbsp;and LogitBoost [preferred over AdaBoost/RealBoost in many cases] for predictive modeling projects.&lt;br /&gt;
&lt;br /&gt;
First Improvement&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_slrAR0IXTL0/S8-S25FQqII/AAAAAAAAASM/U-E2V-pRzpw/s1600/reg+improve.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://1.bp.blogspot.com/_slrAR0IXTL0/S8-S25FQqII/AAAAAAAAASM/U-E2V-pRzpw/s640/reg+improve.PNG" width="556" wt="true" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div align="left" class="separator" style="clear: both; text-align: center;"&gt;&lt;br /&gt;
&lt;/div&gt;Second Improvement:&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/S9CW_CyAGsI/AAAAAAAAASc/he4UK865WIU/s1600/reg+improve2.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S9CW_CyAGsI/AAAAAAAAASc/he4UK865WIU/s640/reg+improve2.PNG" width="534" wt="true" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Using the most recent version of the macro, on a PC with E6320 1.86GHz (5.5Gflops/core) and 4GB memory, we observe the following performance benchmark (another 15% improvement over what's shown above):&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_slrAR0IXTL0/S974TgqIW6I/AAAAAAAAAS0/mqhkg8xCn48/s1600/benchmarktable.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="440" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S974TgqIW6I/AAAAAAAAAS0/mqhkg8xCn48/s640/benchmarktable.PNG" tt="true" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_slrAR0IXTL0/S974VmH3N1I/AAAAAAAAAS8/XgHuJpKFe6c/s1600/benchmarkfigure.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="242" src="http://2.bp.blogspot.com/_slrAR0IXTL0/S974VmH3N1I/AAAAAAAAAS8/XgHuJpKFe6c/s640/benchmarkfigure.PNG" tt="true" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Replace SPLIT_IML subroutine with the following code. &lt;br /&gt;
&lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/*************************************************************************
This is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 or 3 of the License
(at your option).

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
**************************************************************************/
    start split_iml(x,y,w,g_info,out_type, y_pred);
        /* Weighted classification stump: scans every column of x for the single
           cut point with the smallest weighted impurity and predicts from it.
           Inputs : x        n x p matrix of predictors
                    y        n x 1 response; the arithmetic treats y#w as the
                             weight carried by class 1, so y is presumably coded
                             0/1 - TODO confirm against the calling macro
                    w        n x 1 observation weights
                    out_type 1 = return class labels, 2 = return class-1 proportions;
                             any other value leaves y_pred as all zeros
           Outputs: g_info   row vector gini_var || gini_min || gini_cut ||
                             p0_LH || p1_LH || c_L || p0_RH || p1_RH || c_R
                    y_pred   n x 1 prediction
           All n-1 candidate cuts of a column are scored at once with CUSUM on the
           sorted data. Assumes n is at least 2 - TODO confirm. */
        n = nrow(x);
        p = ncol(x);
        /* Starting value for the running minimum; only cuts scoring below it are accepted. */
        gini_min = 2;
        gini_var = 0;
        gini_cut = 0;
        y_pred = repeat(0,n,1);
        wsum = sum(w);
        ywsum = sum(y#w);  
        /* ywsum1 is not referenced again in this module. */
        ywsum1 = wsum - ywsum; 
        do j=1 to p;
            /* Sort the rows (x[,j], y, w) by x[,j] in ascending order:
               row i is written to position rank(i). */
            x_curr = x[,j]||y||w;
            b=x_curr;  
            x_curr[rank(x[,j]),]=b;  free b;
            x_sort = x_curr[,1]; 
            y_sort = x_curr[,2]; 
            w_sort = x_curr[,3];
            yw_sort=(y_sort#w_sort);
            /* yw_sort1 is not used below except in the FREE statement. */
            yw_sort1=(w_sort - yw_sort);        
            /* Running total of y#w over the first i sorted rows, i = 1..n-1 (left side of cut i). */
            yw_cusum=cusum(yw_sort[1:(n-1)]);    

            /* Left-side total weight, floored at 2*CONSTANT('SMALL') so that 1/lpwt is always defined. */
            lpwt = cusum(w_sort[1:(n-1)]);
            lpwt = lpwt#(lpwt &amp;gt;= 2*CONSTANT('SMALL')) + 
                   (lpwt &amp;lt; 2*CONSTANT('SMALL'))*2*CONSTANT('SMALL');
    
            /* p1_L = weighted share of y on the left; the left impurity is
               (left y-weight) times (1 - p1_L). */
            p1_L = yw_cusum # (1/lpwt);
            gini = yw_cusum # (1-p1_L);
                       
            /* Right-side total weight, with the same floor. */
            rpwt = wsum - lpwt; 
            rpwt = rpwt#(rpwt &amp;gt;= 2*CONSTANT('SMALL')) + 
                   (rpwt &amp;lt; 2*CONSTANT('SMALL'))*2*CONSTANT('SMALL');
    
            /* yw_cusum is reused: from here on it holds the right-side y-weight. */
            yw_cusum = ywsum - yw_cusum;
            p1_R = yw_cusum # (1/rpwt);
            
            /* Total impurity of cut i = left part + right part. */
            gini = gini + yw_cusum # (1-p1_R);

            free lpwt  rpwt  yw_cusum  yw_sort1;

            /* Smallest impurity in this column and the position where it occurs. */
            g_min=gini[&amp;gt;&amp;lt;];  g_loc=gini[&amp;gt;:&amp;lt;];

            if g_min &amp;lt; gini_min then do;
                gini_min=g_min;
                gini_var = j;
                /* Cut half-way between the two neighbouring sorted values. */
                gini_cut = (x_sort[g_loc] + x_sort[g_loc+1]) / 2;
                p1_RH = p1_R[g_loc];
                p0_RH = 1-p1_R[g_loc];
                p1_LH = p1_L[g_loc];
                p0_LH = 1-p1_L[g_loc];

                /* Majority class on each side: 1 only when the class-1 share exceeds 0.5. */
                c_R = 0;
                if p1_RH &amp;gt; 0.5 then c_R = 1;
                c_L = 0;
                if p1_LH &amp;gt; 0.5 then c_L = 1;
            end;
        end;
        /* NOTE(review): p0_LH ... c_R are only assigned inside the IF above, so this
           line relies on at least one cut scoring below the initial gini_min of 2. */
        g_info = gini_var||gini_min||gini_cut||p0_LH||p1_LH||c_L||p0_RH||p1_RH||c_R;
        /* out_type 1: predicted class label of the side each row falls on. */
        if out_type = 1 then 
           y_pred = (x[, gini_var] &amp;lt;=gini_cut)*c_L + 
                    (x[, gini_var] &amp;gt; gini_cut) *c_R
        ;
       
        /* out_type 2: class-1 proportion of the side each row falls on
           (1-p0 equals p1, so both branches of each bracket give the same value). */
        if out_type=2 then
           y_pred[, 1] =( x[, gini_var]&amp;lt;=gini_cut) * ( (c_L=0)*(1-p0_LH) + (c_L=1)*p1_LH) +
                        ( x[, gini_var] &amp;gt;  gini_cut) * ( (c_R=0)*(1-p0_RH) + (c_R=1)*p1_RH)
        ;
 
 
    finish split_iml;
&lt;/code&gt;&lt;/pre&gt;Replace REGSPLIT_IML subroutine with the following code. &lt;br /&gt;
&lt;pre style="background-color: #ebebeb; border: 1px dashed rgb(153, 153, 153); color: #000001; font-family: Andale Mono,Lucida Console,Monaco,fixed,monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;"&gt;&lt;code&gt;
/*************************************************************************
This is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 or 3 of the License
(at your option).

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
**************************************************************************/
start regsplit_iml(x,y,w,j_info,y_pred);
        /* Weighted least-squares regression stump.
           Inputs : x  n x p matrix of predictors
                    y  n x 1 response
                    w  n x 1 observation weights
           Outputs: j_info  row vector reg_var || min_css || cut_val || ypred_L || ypred_R
                    y_pred  n x 1 fitted values: ypred_L for rows below cut_val,
                            ypred_R for all other rows
           For every column of x the rows are sorted once and all n-1 candidate
           cuts are scored together with CUSUM, so no n x n matrix is built.
           Assumes n is at least 2 - TODO confirm against the calling macro. */
        n = nrow(x);
        p = ncol(x);
        /* Starting value for the running minimum; only cuts scoring below it are accepted. */
        min_css = 10000000000000;
        y_pred = repeat(0,n,1);
        wy2sum = sum( w#y#y );
        wsum = sum(w);
        ywsum = sum(y#w);
        do j=1 to p;
            /* Sort the rows (x[,j], y, w) by x[,j] in ascending order:
               row i is written to position rank(i). */
            x_curr = x[,j]||y||w;
            b=x_curr;
            x_curr[rank(x[,j]),]=b;   free b;
            x_sort = x_curr[,1];
            y_sort = x_curr[,2];
            w_sort = x_curr[,3];
            yw_sort = (y_sort#w_sort);

            /* Running total of y#w over the first i sorted rows, i = 1..n-1 (left side of cut i). */
            yw_cusum = cusum(yw_sort[1:(n-1)]);

            /* Left-side weight, floored so that 1/lpwt is always defined; p1_L is the left weighted mean. */
            lpwt = cusum(w_sort[1:(n-1)]);
            lpwt = lpwt#(lpwt&gt;constant('SMALL')) +
                   constant('SMALL')#(lpwt&lt;=constant('SMALL'));
            p1_L = (yw_cusum # (1/lpwt));

            /* Right-side weight and weighted mean. The floor has to test rpwt itself:
               testing lpwt here would leave a zero right-side weight at zero and
               make 1/rpwt divide by zero. */
            rpwt = wsum - lpwt;
            rpwt = rpwt#(rpwt&gt;constant('MACEPS')) +
                   constant('MACEPS')#(rpwt&lt;=constant('MACEPS'));
            p1_R = ((ywsum - yw_cusum) # (1/rpwt));

            /* lpwt and rpwt are reused from here on: they now hold the un-floored
               running totals of w and of y#w over all n sorted rows. */
            lpwt = cusum(w_sort); rpwt = cusum(yw_sort);

            /* Weighted residual sum of squares of each cut, expanded as
               sum(w#y#y) + mean##2 # weight - 2 # mean # (y-weight), added over both sides. */
            css = wy2sum + p1_L##2#lpwt[1:(n-1)] + p1_R##2#(wsum - lpwt[1:(n-1)]) -
                  2*(p1_L#rpwt[1:(n-1)] + p1_R#(ywsum - rpwt[1:(n-1)]));

            free  lpwt  rpwt  yw_cusum;
            /* Smallest value in this column and the position where it occurs. */
            css_min=css[&gt;&lt;];  css_loc=css[&gt;:&lt;];

            if css_min &lt; min_css then do;
                min_css = css_min;
                /* Cut half-way between the two neighbouring sorted values. */
                cut_val = (x_sort[css_loc] + x_sort[css_loc+1]) / 2;
                reg_var = j;
                ypred_L = (sum(yw_sort[1:css_loc]))/sum(w_sort[1:css_loc]);
                ypred_R = (sum(yw_sort[css_loc+1:n]))/
                        sum(w_sort[css_loc+1:n]);
                y_pred = ypred_L*(x[,j] &lt; cut_val) + ypred_R*(x[,j] &gt;= cut_val);
                j_info = reg_var||min_css||cut_val||ypred_L||ypred_R;
            end;
        end;
    finish regsplit_iml;
&lt;/code&gt;&lt;/pre&gt;&lt;i&gt;&lt;b&gt;Reference:&lt;/b&gt;&lt;/i&gt;&lt;br /&gt;
Dmitrienko, Alex, Christy Chuang-Stein, and Ralph D’Agostino.&amp;nbsp; &lt;b&gt;&lt;i&gt;Pharmaceutical Statistics Using SAS®: A Practical Guide&lt;/i&gt;&lt;/b&gt;. Cary, NC: SAS Institute Inc.&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=159047886X" style="border: medium none; margin: 0px; padding: 0px ! important;" width="1" /&gt;&amp;nbsp;2007&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.amazon.com/Pharmaceutical-Statistics-Using-SAS-Practical/dp/159047886X?ie=UTF8&amp;amp;tag=xie1978&amp;amp;link_code=bil&amp;amp;camp=213689&amp;amp;creative=392969" imageanchor="1" target="_blank"&gt;&lt;img alt="Pharmaceutical Statistics Using SAS: A Practical Guide (SAS Press)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&amp;amp;ServiceVersion=20070822&amp;amp;ID=AsinImage&amp;amp;WS=1&amp;amp;Format=_SL160_&amp;amp;ASIN=159047886X&amp;amp;tag=xie1978" /&gt;&lt;/a&gt;&lt;img alt="" border="0" height="1" src="http://www.assoc-amazon.com/e/ir?t=xie1978&amp;amp;l=bil&amp;amp;camp=213689&amp;amp;creative=392969&amp;amp;o=1&amp;amp;a=159047886X" style="border: medium none; margin: 0px; padding: 0px ! important;" width="1" /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-5278479807387489988?l=www.sas-programming.com' alt='' /&gt;&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/SasProgramming/~4/vyvOO66DRKo" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://www.sas-programming.com/feeds/5278479807387489988/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=29815492&amp;postID=5278479807387489988" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5278479807387489988?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/29815492/posts/default/5278479807387489988?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/SasProgramming/~3/vyvOO66DRKo/improve-boost-macro-from-rayens-w-and.html" title="Improve the Boost macro from Prof. Rayens, W and Dr. 
Johnson, K" /><author><name>Liang  Xie</name><uri>http://www.blogger.com/profile/02274752582289554390</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="24" src="http://3.bp.blogspot.com/_slrAR0IXTL0/S4Giu9mo1RI/AAAAAAAAANc/94tvwOXlvtI/S220/img110.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/_slrAR0IXTL0/S894iQVB56I/AAAAAAAAAR0/ALT7LpRDENk/s72-c/speed.PNG" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://www.sas-programming.com/2010/04/improve-boost-macro-from-rayens-w-and.html</feedburner:origLink></entry></feed>

