<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/rss2full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:creativeCommons="http://backend.userland.com/creativeCommonsRssModule" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" version="2.0">

<channel>
	<title>Daniel Lemire's blog</title>
	
	<link>http://www.daniel-lemire.com/blog</link>
	<description>Computer Scientist and Open Scholar: Databases, Information Retrieval, Business Intelligence.</description>
	<lastBuildDate>Tue, 07 Sep 2010 12:46:17 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.0</generator>
		<atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.feedburner.com/daniel-lemire/atom" /><feedburner:info uri="daniel-lemire/atom" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><geo:lat>45</geo:lat><geo:long>-73</geo:long><creativeCommons:license>http://creativecommons.org/licenses/by-nc-sa/2.0/</creativeCommons:license><feedburner:emailServiceId>daniel-lemire/atom</feedburner:emailServiceId><feedburner:feedburnerHostname>http://feedburner.google.com</feedburner:feedburnerHostname><feedburner:feedFlare href="http://www.bloglines.com/sub/http://feeds.feedburner.com/daniel-lemire/atom" src="http://www.bloglines.com/images/sub_modern11.gif">Subscribe with Bloglines</feedburner:feedFlare><feedburner:feedFlare href="http://fusion.google.com/add?feedurl=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://buttons.googlesyndication.com/fusion/add.gif">Subscribe with Google</feedburner:feedFlare><feedburner:feedFlare href="http://www.plusmo.com/add?url=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://plusmo.com/res/graphics/fbplusmo.gif">Subscribe with Plusmo</feedburner:feedFlare><feedburner:feedFlare href="http://www.thefreedictionary.com/_/hp/AddRSS.aspx?http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://img.tfd.com/hp/addToTheFreeDictionary.gif">Subscribe with The Free Dictionary</feedburner:feedFlare><feedburner:feedFlare href="http://www.bitty.com/manual/?contenttype=rssfeed&amp;contentvalue=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.bitty.com/img/bittychicklet_91x17.gif">Subscribe with Bitty Browser</feedburner:feedFlare><feedburner:feedFlare href="http://www.newsalloy.com/?rss=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.newsalloy.com/subrss3.gif">Subscribe with NewsAlloy</feedburner:feedFlare><feedburner:feedFlare 
href="http://www.live.com/?add=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://tkfiles.storage.msn.com/x1piYkpqHC_35nIp1gLE68-wvzLZO8iXl_JMledmJQXP-XTBOLfmQv4zhj4MhcWEJh_GtoBIiAl1Mjh-ndp9k47If7hTaFno0mxW9_i3p_5qQw">Subscribe with Live.com</feedburner:feedFlare><feedburner:feedFlare href="http://mix.excite.eu/add?feedurl=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://image.excite.co.uk/mix/addtomix.gif">Subscribe with Excite MIX</feedburner:feedFlare><feedburner:feedFlare href="http://download.attensa.com/app/get_attensa.html?feedurl=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.attensa.com/blogs/attensa/WindowsLiveWriter/BadgeredintoBadges_10C02/attensa_feed_button5.gif">Subscribe with Attensa for Outlook</feedburner:feedFlare><feedburner:feedFlare href="http://www.webwag.com/wwgthis.php?url=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.webwag.com/images/wwgthis.gif">Subscribe with Webwag</feedburner:feedFlare><feedburner:feedFlare href="http://www.podcastready.com/oneclick_bookmark.php?url=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.podcastready.com/images/podcastready_button.gif">Subscribe with Podcast Ready</feedburner:feedFlare><feedburner:feedFlare href="http://www.flurry.com/pushRssFeed.do?r=fb&amp;url=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.flurry.com/images/flurry_rss_logo2.gif">Subscribe with Flurry</feedburner:feedFlare><feedburner:feedFlare href="http://www.wikio.com/subscribe?url=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.wikio.com/shared/img/add2wikio.gif">Subscribe with Wikio</feedburner:feedFlare><feedburner:feedFlare href="http://www.dailyrotation.com/index.php?feed=http%3A%2F%2Ffeeds.feedburner.com%2Fdaniel-lemire%2Fatom" src="http://www.dailyrotation.com/rss-dr2.gif">Subscribe with Daily Rotation</feedburner:feedFlare><item>
		<title>How reliable is science?</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/TZniGgTt_A0/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/09/06/how-reliable-is-science/#comments</comments>
		<pubDate>Mon, 06 Sep 2010 18:18:26 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2708</guid>
		<description>It is not difficult to find instances of fraud in science: Ranjit Chandra faked medical research results. He pocketed the money meant for running the experiments. Woo-suk Hwang faked human cloning, among other terrible things. Jan Hendrik Schön faked a transistor at the molecular level. How did these people fare after being caught? Ranjit Chandra still [...]</description>
			<content:encoded><![CDATA[<p>It is not difficult to find instances of fraud in science:</p>
<ul>
<li> <a href="http://en.wikipedia.org/wiki/Ranjit_Chandra">Ranjit Chandra</a> faked medical research results. He pocketed the money meant for running the experiments.</li>
<li> <a href="http://en.wikipedia.org/wiki/Hwang_Woo-Suk#Controversies">Woo-suk Hwang</a> faked human cloning, among other terrible things.</li>
<li> <a href="http://en.wikipedia.org/wiki/Sch%C3%B6n_scandal">Jan Hendrik Schön</a> faked a transistor at the molecular level.</li>
</ul>
<p>How did these people fare after being caught?</p>
<ul>
<li> Ranjit Chandra still holds the <a href="http://en.wikipedia.org/wiki/Category:Officers_of_the_Order_of_Canada">Order of Canada</a>, as far as I can tell.  According to Scopus, his 272 research papers were cited over 3000 times. As for his University? Let me quote wikipedia:  <em>University officials claimed that the university was unable to make a case for research fraud because the raw data on which a proper evaluation could be made had gone missing. Because the accusation was that the data did not exist, this was a puzzling rationale.</em></li>
<li>According to Scopus, Woo-suk Hwang has been cited over 2000 times. Despite having faked research results and having committed major ethics violations, he has kept his job and&#8230; he is still <a href="http://www.ncbi.nlm.nih.gov/pubmed/19996555">publishing</a>.</li>
<li>Despite all the retracted papers, Jan Hendrik Schön still has 1,200 citations according to Scopus. He lost his research job, but found an engineering position in Germany.</li>
</ul>
<p><strong>Conclusion</strong>: Scientific fraud is a low-risk, high-reward activity.</p>
<p>What is more critical is that we still equate peer review with correctness. The argument usually goes as follows: if it is important work, work that people rely upon, and it has been peer reviewed, then it must be correct. In sum, we think that conventional peer review + citations means validation. I think we are wrong:</p>
<ul>
<li><strong>Conventional peer review is shallow.</strong> Chandra, Hwang and Schön published faked results for many years in the most prestigious venues. The truth is that reviewers do not reproduce results. They usually do not have access to the raw data and software. And even if they did, they are unlikely to be motivated to redo all of the work to verify it.</li>
<li><strong>Citations are not validations.</strong> Chandra, Hwang and Schön were generously cited. It is hardly surprising: impressive results are more likely to be cited. And doctored results are usually more impressive. Yet, scientists do not reproduce earlier work. Even if you do try to reproduce someone&#8217;s result, and fail, you probably won&#8217;t publish it. Indeed, publishing negative results is hard: journals are not interested. Moreover, there is a risk that it may backfire: the authors could go on the offensive. They could question your own competence.</li>
<li><strong>There are many small frauds.</strong> Even without making up data, you can cheat by misleading the reader, by omission. You can present the data in creative ways, e.g. turn meaningless averages into hard facts by omitting the variance (see the <a href="http://www.daniel-lemire.com/blog/archives/2010/06/18/the-fallacy-of-absolute-numbers/">fallacy of absolute numbers</a>). These small frauds increase the likelihood that your paper will be accepted and then generously cited.</li>
</ul>
<p>How do we solve the problem? (1) By <a href="http://www.daniel-lemire.com/blog/archives/2008/10/28/when-in-doubts-prefer-unimpressive-negative-results/">trusting unimpressive results</a> more than impressive ones. (2) By being suspicious of popular trends. (3) By running our own experiments.</p>
<p><strong>Further reading</strong>: <a href="http://www.daniel-lemire.com/blog/archives/2009/10/26/become-independent-of-peer-review/">Become independent of peer review</a>, <a href="http://www.daniel-lemire.com/blog/archives/2009/01/09/the-purpose-of-peer-review/">The purpose of peer review</a> and <a href="http://www.daniel-lemire.com/blog/archives/2008/08/21/peer-review-is-an-honor-based-system/">Peer review is an honor-based system</a>.</p>
<p><strong>Source</strong>: <a href="http://www.blog.sethroberts.net/2010/09/05/plastic-fantastic-by-e-s-reich/">Seth Roberts</a>.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=TZniGgTt_A0:XKEwLSwynZA:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=TZniGgTt_A0:XKEwLSwynZA:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/TZniGgTt_A0" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/09/06/how-reliable-is-science/feed/</wfw:commentRss>
		<slash:comments>12</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/09/06/how-reliable-is-science/</feedburner:origLink></item>
		<item>
		<title>Write a Twitter application in 5 minutes</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/JVpVudcG6js/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/08/30/write-a-twitter-application-in-5-minutes/#comments</comments>
		<pubDate>Mon, 30 Aug 2010 14:28:37 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Science and Technology]]></category>
		<category><![CDATA[Software design]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2702</guid>
		<description>I spend much time alone, writing and thinking. Twitter helps me stay connected. I love the platform. On Friday, I wanted to find the intersection between the users followed by any two individuals. Indeed, suppose that you like both Joe and Jill, and they have similar interests. Maybe whoever they both read is also interesting? I could [...]</description>
			<content:encoded><![CDATA[<p>I spend much time alone, writing and thinking. <a href="http://twitter.com/lemire">Twitter</a> helps me stay connected. I love the platform.</p>
<p>On Friday, I wanted to find the intersection between the users followed by any two individuals. Indeed, suppose that you like both Joe and Jill, and they have similar interests. Maybe whoever they both read is also interesting? I could not find a tool to do it, so I built it with <a href="http://code.google.com/p/python-twitter/">python-twitter</a>.</p>
<p>Anybody with a working knowledge of Python can do it in less than 5 minutes. I used only <a href="http://pastebin.com/6wTMhU3Q">twenty lines of code</a> (in total!!!). The code proved immediately useful.</p>
<p>If you do not know <a href="http://www.python.org/">Python</a> or <a href="http://www.ruby-lang.org/en/">Ruby</a>, learn one or the other. Tonight. It is powerful stuff.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=JVpVudcG6js:VxqFDvYggSw:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=JVpVudcG6js:VxqFDvYggSw:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/JVpVudcG6js" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/08/30/write-a-twitter-application-in-5-minutes/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/08/30/write-a-twitter-application-in-5-minutes/</feedburner:origLink></item>
		<item>
		<title>Manifesto for Half-Arsed Academic Research</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/GZSQNgUq7xc/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/08/30/manifesto-for-half-arsed-academic-research/#comments</comments>
		<pubDate>Mon, 30 Aug 2010 14:10:22 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2698</guid>
		<description>Research results are more important than the number of publications or citations. This is fine. Yet, we don&amp;#8217;t have time to read your papers. So, just keep publishing a lot of papers each year. And get your influential friends to cite you. That&amp;#8217;s how we&amp;#8217;ll know whether you are good. Science and truth are more important [...]</description>
			<content:encoded><![CDATA[<ul>
<li><strong>Research results are more important than the number of publications or citations</strong>.<br />
This is fine. Yet, we don&#8217;t have time to read your papers. So, just keep publishing a lot of papers each year. And get your influential friends to cite you. That&#8217;s how we&#8217;ll know whether you are good.</li>
<li> <strong>Science and truth are more important than spin and marketing.</strong><br />
Yes, but keep pretending you will solve world hunger. And align your research results with the current fashionable trends.</li>
<li> <strong>You cannot tell where the next science breakthrough is going to come from.</strong><br />
Maybe. Still, we want a plan of your research activities for the next five years.</li>
</ul>
<p><strong>Further reading</strong>: <a href="http://www.daniel-lemire.com/blog/archives/2009/09/15/the-hard-truth-about-research-grants/">The hard truth about research grants</a> and <a href="http://www.daniel-lemire.com/blog/archives/2009/10/28/the-secret-behind-radical-innovation/">The secret behind radical innovation</a>.</p>
<p><strong>Source</strong> : <a href="http://www.halfarsedagilemanifesto.org/">Manifesto for Half-Arsed Agile Software Development</a> via <a href="http://twitter.com/JohnDCook/statuses/22522316230">John D. Cook</a>.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=GZSQNgUq7xc:B7Y0TEsBBHc:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=GZSQNgUq7xc:B7Y0TEsBBHc:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/GZSQNgUq7xc" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/08/30/manifesto-for-half-arsed-academic-research/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/08/30/manifesto-for-half-arsed-academic-research/</feedburner:origLink></item>
		<item>
		<title>Counterintuitive factors determining research productivity</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/lWJDpzbE_f4/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/08/23/counterintuitive-factors-determining-research-productivity/#comments</comments>
		<pubDate>Mon, 23 Aug 2010 14:08:51 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2693</guid>
		<description>Permanent researchers publish more when they are in smaller labs. Having many Ph.D. students fails to improve productivity. Funding has little effect on research productivity. Reference: Carayol, N. and Matt, M., Individual and collective determinants of academic scientists&amp;#8217; productivity, Information Economics and Policy 18 (1), 2006. Further reading (on this blog): To be smarter, ignore external [...]</description>
			<content:encoded><![CDATA[<ul>
<li>Permanent researchers publish more when they are in smaller labs.</li>
<li>Having many Ph.D. students fails to improve productivity.</li>
<li>Funding has little effect on research productivity.</li>
</ul>
<p><strong>Reference</strong>: Carayol, N. and Matt, M., <a href="http://ideas.repec.org/a/eee/iepoli/v18y2006i1p55-72.html">Individual and collective determinants of academic scientists&#8217; productivity</a>, Information Economics and Policy 18 (1), 2006.</p>
<p><strong>Further reading (on this blog)</strong>: <a href="http://www.daniel-lemire.com/blog/archives/2009/08/26/to-be-smarter-ignore-external-rewards/">To be smarter, ignore external rewards</a>, <a href="http://www.daniel-lemire.com/blog/archives/2009/07/02/is-collaboration-correlated-with-productivity/">Is collaboration correlated with productivity?</a>, <a href="http://www.daniel-lemire.com/blog/archives/2006/08/09/big-schools-are-not-longer-giving-researchers-an-edge/">Big schools are no longer giving researchers an edge?</a></p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=lWJDpzbE_f4:bbEE5gvCfKU:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=lWJDpzbE_f4:bbEE5gvCfKU:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/lWJDpzbE_f4" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/08/23/counterintuitive-factors-determining-research-productivity/feed/</wfw:commentRss>
		<slash:comments>14</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/08/23/counterintuitive-factors-determining-research-productivity/</feedburner:origLink></item>
		<item>
		<title>Working long hours is stupid</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/nv4WjDHlCOQ/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/08/16/working-long-hours-is-stupid/#comments</comments>
		<pubDate>Mon, 16 Aug 2010 14:52:20 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Family and Health]]></category>
		<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2673</guid>
		<description>We do too much. We carry too many projects. This overproduction creates problems which we try to fix by working even more. We value most what we create (see Made by hand and The upside of irrationality). To be happy, you want to focus on making interesting stuff. This takes time and dedication. Yet, as [...]</description>
			<content:encoded><![CDATA[<p>We do too much. We carry too many projects. This overproduction creates problems which we try to fix by working even more.</p>
<p>We value most what we create (see <a href="http://www.amazon.com/Made-Hand-Searching-Meaning-Throwaway/dp/1591843324/ref=sr_1_1?ie=UTF8&#038;s=books&#038;qid=1281966845&#038;sr=8-1">Made by hand</a> and <a href="http://www.amazon.com/Upside-Irrationality-Unexpected-Benefits-Defying/dp/0061995037/ref=sr_1_1?s=books&#038;ie=UTF8&#038;qid=1281966894&#038;sr=1-1">The upside of irrationality</a>). To be happy, you want to focus on <strong>making interesting stuff</strong>. This takes time and dedication. Yet, as Graham&#8217;s essay <a href="http://www.paulgraham.com/top.html">The top idea in your mind</a> stresses, we often fall into the trap of thinking mostly about money and personal disputes. These thoughts pull us away from our interests and prevent us from doing great work. As an example, I hear that <a href="http://en.wikipedia.org/wiki/Tiger_Woods">Tiger Woods</a> isn&#8217;t playing great golf. I bet he is either stuck in money problems or personal disputes, or both.</p>
<p>It is hard to be overworked by writing a book, by writing research articles or by playing golf. People are overworked dealing with email, context switching, money, and touchy relationships. This abundance of work makes people sad and boring. And this type of work tends to reproduce. The more you have, the more you will have.</p>
<p>Unemployment and pollution are visible results of our overproduction. Yet, there are many more negative side effects. In academia, we train more and more Ph.D.s every year. Yet, we have had too many Ph.D.s in the job market since the seventies. We write more and more research papers every year, and spend more and more time applying for research grants&#8230; but professors spend less and less time on curiosity-driven research.</p>
<p>It is cool to produce great work, but it is not cool to work 60 hours a week unless it is out of passion.  And nobody is passionate about grant applications, marking papers or handling difficult people. Moreover, working long hours does not scale: you can&#8217;t increase your output continuously.</p>
<p>Our productivity will keep improving. I can write software faster and better than ever. I can research prior work with ease. I can ask fancy mathematical questions on the Web and get answers in minutes. Instead of investing back this productivity into more silly work, we need to get smarter:</p>
<ul>
<li>Focus on the essential: programming great software, writing a fun book, a set of inspiring lecture notes or an insightful article.</li>
<li><a href="http://en.wikipedia.org/wiki/The_4-Hour_Workweek">Automate, reduce or delegate</a>. Reduce is best: doing fewer things is cool!</li>
<li>A focus on money or on personal disputes makes you stupid. Yet, that&#8217;s where <em>success</em> often takes you. Watch out!</li>
<li>Airplanes pollute. Travel takes you away from your family. Cars pollute and make you fat. Do you need all that junk?</li>
</ul>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=nv4WjDHlCOQ:cT7OvbXCvnY:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=nv4WjDHlCOQ:cT7OvbXCvnY:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/nv4WjDHlCOQ" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/08/16/working-long-hours-is-stupid/feed/</wfw:commentRss>
		<slash:comments>17</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/08/16/working-long-hours-is-stupid/</feedburner:origLink></item>
		<item>
		<title>How to get everyone talking about your research!</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/Y4as8ozvuvQ/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/08/09/how-to-get-everyone-talking-about-your-research/#comments</comments>
		<pubDate>Mon, 09 Aug 2010 13:09:46 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2676</guid>
		<description>Deolalikar claims to have solved the famous P versus NP problem. Is the proof correct? Some influential researchers doubt it: Scott Aaronson is betting 200k$ of his own money against Deolalikar. What I find most interesting is that Deolalikar did not submit the paper to a journal, as far as I know. He didn&amp;#8217;t even post [...]</description>
			<content:encoded><![CDATA[<p><a href="http://en.wikipedia.org/wiki/Vinay_Deolalikar">Deolalikar</a> <a href="http://www.scribd.com/doc/35539144/pnp12pt">claims to have solved</a> the famous <a href="http://en.wikipedia.org/wiki/P_versus_NP_problem">P versus NP problem</a>. Is the proof correct? Some influential researchers doubt it: Scott  Aaronson is <a href="http://scottaaronson.com/blog/?p=456">betting</a> 200k$ of his own money against Deolalikar.</p>
<p>What I find most interesting is that Deolalikar did not submit the paper to a journal, as far as I know. He didn&#8217;t even post it on <a href="http://arxiv.org/">arxiv</a> like <a href="http://en.wikipedia.org/wiki/Grigori_Perelman">Perelman</a>. Yet, he is receiving much attention. <a href="http://twitter.com/#search?q=Deolalikar">His name is being tweeted</a> several times a minute. Many of the most influential theoretical computer scientists are reacting to the paper. He is getting the best peer review possible. Most similar papers don&#8217;t get so much attention.</p>
<p>Why is this paper different?</p>
<ul>
<li>Everyone seems to agree that the paper is well written, it has nice (color!) figures and the reference section appears up-to-date and complete.  <strong>If your result is important, communicate it well.</strong></li>
<li>Deolalikar has published just a handful of papers in theoretical computer science, and none at the major conferences. <strong>But </strong><strong>he has enough peer-reviewed research papers to be treated as a peer.</strong></li>
<li>While I doubt he was hired to work on complexity theory, Deolalikar  is an industry <a href="http://www.hpl.hp.com/personal/Vinay_Deolalikar/">researcher at HP</a>.  <strong>Being paid to do research might make you more credible</strong>.</li>
</ul>
<p><strong>Further reading:</strong> <a href="http://www.informatik.uni-trier.de/~ley/db/indices/a-tree/d/Deolalikar:Vinay.html">Deolalikar&#8217;s publication list</a> on DBLP,  <a href="http://rjlipton.wordpress.com/2010/08/08/a-proof-that-p-is-not-equal-to-np/">A Proof That P Is Not Equal To NP?</a> by Lipton and <a href="http://gregbaker.ca/blog/2010/08/07/p-n-np/">P ≠ NP</a> by Baker.</p>
<p><strong>Update:</strong> Porreca has <a href="http://aeporreca.org/2010/08/09/proof-that-p-isnt-np/">the best write-up</a> on reactions to this paper.</p>
<p><strong>Update 2:</strong> The consensus after two weeks is that the proof is wrong and unfixable.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=Y4as8ozvuvQ:VZ1d2-dLbU8:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=Y4as8ozvuvQ:VZ1d2-dLbU8:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/Y4as8ozvuvQ" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/08/09/how-to-get-everyone-talking-about-your-research/feed/</wfw:commentRss>
		<slash:comments>14</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/08/09/how-to-get-everyone-talking-about-your-research/</feedburner:origLink></item>
		<item>
		<title>Is multiplication slower than addition?</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/fJJSYKS_4sA/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/07/19/is-multiplication-slower-than-addition/#comments</comments>
		<pubDate>Mon, 19 Jul 2010 17:09:34 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2659</guid>
		<description>Earlier, I asked whether integer addition was faster than bitwise exclusive or. My tests showed no difference, and nobody contradicted me. However, everyone knows that multiplication is slower than addition? Right? In cryptography, there are many papers on how to trade multiplications for additions, to speed up software. So? Can you predict which piece of [...]</description>
			<content:encoded><![CDATA[<p>Earlier, I <a href="http://www.daniel-lemire.com/blog/archives/2010/03/12/which-is-fastest-integer-addition-or-xor/">asked</a> whether integer addition was faster than bitwise exclusive or. My tests showed no difference, and nobody contradicted me.</p>
<p>However, everyone knows that multiplication is slower than addition? Right? In cryptography, there are many papers on how to trade multiplications for additions, to speed up software.</p>
<p>So? Can you predict which piece of code runs faster?</p>
<p><strong>scalar product (N multiplications):</strong><br />
<code><br />
for(int k =0; k &lt; N ; ++k)<br />
answer += vector1[k] * vector2[k];<br />
</code></p>
<p><strong>scalar product two-by-two (N multiplications):</strong><br />
<code> for(int k =0; k &lt; N ; k+=2)<br />
answer += vector1[k] * vector2[k]<br />
+vector1[k+1] * vector2[k+1];</code></p>
<p><strong>non-standard scalar product (N/2 multiplications):</strong><code><br />
for(int k =0; k &lt; N ; k+=2)<br />
answer += ( vector1[k] + vector2[k] )<br />
* ( vector1[k+1] + vector2[k+1] );<br />
</code></p>
<p><strong>just additions (no multiplication):</strong><code><br />
for(int k =0; k &lt; N ; ++k)<br />
answer += vector1[k] + vector2[k];<br />
</code></p>
<p><strong>Answer:</strong> Merely reducing the number of multiplications has no benefit, in these tests. Hence, simple computational cost models (such as counting the number of multiplications) may not hold on modern <a href="http://en.wikipedia.org/wiki/Superscalar">superscalar</a> processors.</p>
<p>My results using GNU GCC 4.2.1 on both a desktop and a laptop:</p>
<table border="1">
<tbody>
<tr>
<th>algorithm</th>
<th>Intel Core i7</th>
<th>Intel Core 2 Duo</th>
</tr>
<tr>
<td>scalar product</td>
<td>0.30</td>
<td>0.39</td>
</tr>
<tr>
<td>scalar product (2&#215;2)</td>
<td>0.25</td>
<td>0.39</td>
</tr>
<tr>
<td>fewer multiplications</td>
<td>0.25</td>
<td>0.39</td>
</tr>
<tr>
<td>just additions</td>
<td>0.16</td>
<td>0.23</td>
</tr>
</tbody>
</table>
<p>Times are in seconds. The source code is available <a href="http://pastebin.com/cdMMLMZm">without pointer arithmetic</a>.  The same test with pointer arithmetic gives faster results, but the same conclusion. I tried a <a href="http://pastebin.com/YxfVcvue">similar experiment</a> in Java. It confirms my result.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=fJJSYKS_4sA:Hl2Kr1258Yw:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=fJJSYKS_4sA:Hl2Kr1258Yw:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/fJJSYKS_4sA" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/07/19/is-multiplication-slower-than-addition/feed/</wfw:commentRss>
		<slash:comments>14</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/07/19/is-multiplication-slower-than-addition/</feedburner:origLink></item>
		<item>
		<title>General versus domain intelligence</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/nRO8ewN2Clw/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/07/13/general-versus-domain-intelligence/#comments</comments>
		<pubDate>Tue, 13 Jul 2010 13:44:19 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2637</guid>
		<description>Our brains come with hard-wired algorithms. Cats can catch birds or mice without thinking about it. I can grab and eat a strawberry without thinking. The Savanna-IQ Interaction Hypothesis says that general intelligence may originally have evolved as a domain-specific adaptation to deal with evolutionarily novel, nonrecurrent problems. We can derive from this hypothesis that [...]</description>
			<content:encoded><![CDATA[<p>Our brains come with hard-wired algorithms. Cats can catch birds or mice without thinking about it. I can grab and eat a strawberry without thinking.  The <a href="http://www.psych-it.com.au/Psychlopedia/article.asp?id=331">Savanna-IQ Interaction Hypothesis</a> says that general intelligence may originally have evolved as a domain-specific adaptation to deal with evolutionarily novel, nonrecurrent problems.  We can derive from this hypothesis that people with better general intelligence won&#8217;t be better at routine tasks. In fact, they may fare worse at it! They may only have an edge for novel tasks. Thus, general and domain intelligence may be somewhat separate entities.</p>
<p>How do you recognize people with better general intelligence? They are better at adapting to new settings. They are the first to adopt new strategies. But they may not be very good at baseball or boxing, and they may be socially inept.</p>
<p>Modern Artificial Intelligence (and Machine Learning) is typically domain-specific. My spam filter can detect spam, but it won&#8217;t ever do anything else. Our software has <em>evolved</em> to cope with specific problems. Yet, we still lack software with general intelligence. Trying to build better spam filters may be orthogonal to achieving general intelligence in software. In fact, software with good general intelligence may not do so well at spam filtering.</p>
<p><strong>Reference</strong>: Satoshi Kanazawa, Kaja Perina, <a href="http://personal.lse.ac.uk/Kanazawa/pdfs/PAID2009.pdf">Why night owls are more intelligent</a>, Personality and Individual Differences 47 (2009) 685–690</p>
<p><strong>Further reading</strong>: <a href="http://apperceptual.wordpress.com/2008/10/25/language-cognition-and-evolution-modularity-versus-unity/">Language, Cognition, and Evolution: Modularity versus Unity</a> by Peter Turney</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=nRO8ewN2Clw:onvj3h0ACrA:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=nRO8ewN2Clw:onvj3h0ACrA:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/nRO8ewN2Clw" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/07/13/general-versus-domain-intelligence/feed/</wfw:commentRss>
		<slash:comments>10</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/07/13/general-versus-domain-intelligence/</feedburner:origLink></item>
		<item>
		<title>Summer reading: my recommendations</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/0DkobO00u6Q/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/07/09/summer-reading-my-recommendations/#comments</comments>
		<pubDate>Fri, 09 Jul 2010 21:22:31 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2629</guid>
		<description>Containment by Christian Cantrell is an excellent sci-fi novel. And you can grab it nearly for free from the author&amp;#8217;s page. The premise of the book is that humanity built a colony on Venus. Children  are told that Earth cannot be reached. Massive research into economical oxygen production is required for long term survival. Indeed, [...]</description>
			<content:encoded><![CDATA[<p><img style="float: left; margin: 5px; width: 100px;" src="http://www.livingdigitally.net/books/containment/containment_150x225.jpg" alt="containment" /><a href="http://www.livingdigitally.net/containment.html">Containment</a> by Christian Cantrell is an excellent sci-fi novel. And you can <a href="http://www.livingdigitally.net/containment.html">grab it nearly for free</a> from the author&#8217;s page. The premise of the book is that humanity built a colony on Venus. Children  are told that Earth cannot be reached. Massive research into economical oxygen production is required for long term survival. Indeed,  plants cannot survive on the surface of Venus. Or can they? Couldn&#8217;t we design special plants that could survive? One of the young researchers sets out to answer the question. Unfortunately, he won&#8217;t like the answer. The plot may not be extraordinary, but there are many things to like for computer nerds. For example, the book is set in a future where we appear to have cheap quantum computing. Or, at least, some very fast computers. One of the consequences is that any sufficiently smart kid can break any encryption. Moreover, it is cheaper to simulate most physical experiments than to actually execute them.</p>
<p><img style="float: left; margin: 5px; width: 100px;" src="http://photo.goodreads.com/books/1171481840m/101869.jpg" alt="atrocity archive" />The <a href="http://en.wikipedia.org/wiki/The_Atrocity_Archives">Atrocity Archives</a> by <a href="http://en.wikipedia.org/wiki/Charles_Stross">Charles Stross</a> is the first in an ongoing series of books. Stross was a software engineer, and it shows. His book reveals many secrets all Computer Scientists should know. For example, do you know why Knuth will never finish the <a href="http://en.wikipedia.org/wiki/The_Art_of_Computer_Programming">Art of Computer programming</a>, no matter what he tells us? Here&#8217;s a quote:</p>
<blockquote><p>The [Turing] theorem is a hack on discrete number theory that simultaneously disproves the Church-Turing hypothesis (wave if you understood that) and worse, permits NP-complete problems to be converted into P-complete ones. This has several consequences, starting with screwing over most cryptography algorithms—translation: all your bank account are belong to us—and ending with the ability to computationally generate a Dho-Nha geometry curve in real time.</p>
<p>This latter item is just slightly less dangerous than allowing nerds with laptops to wave a magic wand and turn them into hydrogen bombs at will. Because, you see, everything you know about the way this universe works is correct—except for the little problem that this isn&#8217;t the only universe we have to worry about. Information can leak between one universe and another. And in a vanishingly small number of other universes there are things that listen, and talk back—see Al-Hazred, Nietzsche, Lovecraft, Poe, et cetera. The many-angled ones, as they say, live at the bottom of the Mandelbrot set, except when a suitable incantation in the platonic realm of mathematics—computerised or otherwise—draws them forth. (And you thought running that fractal screensaver was good for your computer?)</p></blockquote>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=0DkobO00u6Q:9bcIYrVOmkI:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=0DkobO00u6Q:9bcIYrVOmkI:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/0DkobO00u6Q" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/07/09/summer-reading-my-recommendations/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/07/09/summer-reading-my-recommendations/</feedburner:origLink></item>
		<item>
		<title>The five most important algorithms?</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/lIgnUXwWaTE/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/07/05/the-five-most-important-algorithms/#comments</comments>
		<pubDate>Tue, 06 Jul 2010 01:42:14 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2621</guid>
		<description>Bernhard Koutschan posted a compilation of the most important algorithms. The goal is to determine the 5 most important algorithms. Out of his list, I would select the following five algorithms: Binary search is the first non-trivial algorithm I remember learning. The Fast Fourier transform (FFT) is an amazing algorithm. Combined with the Convolution theorem, [...]</description>
			<content:encoded><![CDATA[<p>Bernhard Koutschan posted a compilation of the <a href="http://www.risc.jku.at/people/ckoutsch/stuff/e_algorithms.html">most important algorithms</a>. The goal is to determine the 5 most important algorithms. Out of his list, I would select the following five algorithms:</p>
<ul>
<li><a href="http://en.wikipedia.org/wiki/Binary_search_algorithm">Binary search</a> is the first non-trivial algorithm I remember learning.</li>
<li>The <a href="http://en.wikipedia.org/wiki/Fast_Fourier_transform">Fast Fourier transform (FFT)</a> is an amazing algorithm. Combined with the <a href="http://en.wikipedia.org/wiki/Convolution_theorem">Convolution theorem</a>, it lets you do magic.</li>
<li>While <a href="http://en.wikipedia.org/wiki/Hash_function">hashing</a> is not an algorithm, it is one of the most powerful and useful ideas in Computer Science. It takes minutes to explain it, but years to master.</li>
<li><a href="http://en.wikipedia.org/wiki/Merge_sort">Merge sort</a> is the most elegant sorting algorithm. You can explain it in three sentences to anyone.</li>
<li>While not an algorithm per se, the <a href="http://en.wikipedia.org/wiki/Singular_Value_Decomposition">Singular Value Decomposition</a> (SVD) is the most important Linear Algebra concept <em>I don&#8217;t remember learning as an undergraduate</em>. (And yes, I went to a <a href="http://www.math.toronto.edu/">good school</a>. And yes, I was an A student.) It can help you <a href="http://en.wikipedia.org/wiki/Pseudoinverse">invert singular matrices</a> and do other similar magic.</li>
</ul>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=lIgnUXwWaTE:FL7qQp4WwQ0:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=lIgnUXwWaTE:FL7qQp4WwQ0:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/lIgnUXwWaTE" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/07/05/the-five-most-important-algorithms/feed/</wfw:commentRss>
		<slash:comments>18</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/07/05/the-five-most-important-algorithms/</feedburner:origLink></item>
		<item>
		<title>NoSQL or NoJoin?</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/AP69y9-lxik/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/06/28/nosql-or-nojoin/#comments</comments>
		<pubDate>Mon, 28 Jun 2010 13:28:59 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Data Warehousing and OLAP]]></category>
		<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2614</guid>
		<description>Several major players built alternatives to conventional database systems: Google created BigTable, Amazon built Dynamo and Facebook initiated Cassandra. There are many other comparable open source initiatives such as CouchDB and MongoDB. These systems are part of a trend called NoSQL because it is not centered around the SQL language. While there have always been [...]</description>
			<content:encoded><![CDATA[<p>Several major players built alternatives to conventional database systems:  Google created <a href="http://en.wikipedia.org/wiki/BigTable">BigTable</a>, Amazon built <a href="http://en.wikipedia.org/wiki/Dynamo_(storage_system)">Dynamo</a> and Facebook initiated <a href="http://en.wikipedia.org/wiki/Apache_Cassandra">Cassandra</a>. There are many other comparable open source initiatives such as <a href="http://en.wikipedia.org/wiki/CouchDB">CouchDB</a> and  <a href="http://en.wikipedia.org/wiki/MongoDB">MongoDB</a>. These systems are part of a trend called <a href="http://en.wikipedia.org/wiki/Nosql">NoSQL</a> because it is not centered around the <a href="http://en.wikipedia.org/wiki/Sql">SQL</a> language. While there have always been non-SQL-based database systems, the rising popularity of these alternatives in industry is drawing attention.</p>
<p>In <a href="http://cacm.acm.org/blogs/blog-cacm/50678-the-nosql-discussion-has-nothing-to-do-with-sql/fulltext">The &#8220;NoSQL&#8221; Discussion has Nothing to Do With SQL</a>, Stonebraker opposes the <a href="http://en.wikipedia.org/wiki/Nosql">NoSQL trend</a> in those terms:</p>
<blockquote><p>(&#8230;) blinding performance depends on removing overhead. Such overhead has nothing to do with SQL, but instead revolves around traditional implementations of ACID transactions, multi-threading, and disk management.</p></blockquote>
<p>In effect, Stonebraker says that all of the benefits of the NoSQL systems have nothing to do with ditching the SQL language.  Of course, because the current breed of SQL is Turing complete, it is difficult to argue against SQL at the formal level. In theory, all Turing complete languages are interchangeable. You can do everything (bad and good) in SQL.</p>
<p>However, in practice, SQL is based on joins and related low-level issues like foreign keys. SQL entices people to <a href="http://en.wikipedia.org/wiki/Database_normalization">normalize their data</a>. Normalization fragments databases into smaller tables which is great for data integrity and beneficial for some <a href="http://en.wikipedia.org/wiki/Database_transaction#Transactional_databases">transactional systems</a>. However, joins are expensive. Moreover, joins require strong consistency and fixed schemas.</p>
<p>In turn, avoiding join operations makes it possible to maintain flexible or informal schemas, and to <a href="http://en.wikipedia.org/wiki/Scalability#Scale_horizontally_.28scale_out.29">scale horizontally</a>. Thus, the NoSQL solutions should really be called NoJoin because they are mostly defined by avoidance of the <a href="http://en.wikipedia.org/wiki/Join_(SQL)">join operation</a>.</p>
<p>How do we compute joins? There are two main techniques :</p>
<ul>
<li>When dealing with large tables, you may prefer the <a href="http://en.wikipedia.org/wiki/Sort-merge_join">sort merge</a> algorithm. Because it requires sorting tables, it runs in <em>O</em>(<em>n</em> log <em>n</em>). (If your tables are already sorted in the correct order, sort merge is automatically the best choice.)</li>
<li>For in-memory tables, <a href="http://en.wikipedia.org/wiki/Hash_join">hash joins</a> are preferable because they run in linear time <em>O</em>(<em>n</em>). However, the characteristics of modern hardware are increasingly detrimental to the hash join alternative (see C. Kim, et al. <a href="http://www.vldb.org/pvldb/2/vldb09-257.pdf">Sort vs. Hash revisited</a>. 2009).</li>
</ul>
<p>(It is also possible to use <a href="http://en.wikipedia.org/wiki/Bitmap_index">bitmap indexes</a> to precompute joins.) In any case, short of precomputing the joins, joining large tables is expensive and requires source tables to be consistent.</p>
<p><strong>Conclusion:</strong> SQL is a fine language, but it has some biases that may trap developers. What works well in a business transaction system, may fail you in other instances.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=AP69y9-lxik:VCbufZ0RLW8:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=AP69y9-lxik:VCbufZ0RLW8:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/AP69y9-lxik" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/06/28/nosql-or-nojoin/feed/</wfw:commentRss>
		<slash:comments>14</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/06/28/nosql-or-nojoin/</feedburner:origLink></item>
		<item>
		<title>The fallacy of absolute numbers</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/nDVbo2teE7o/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/06/18/the-fallacy-of-absolute-numbers/#comments</comments>
		<pubDate>Fri, 18 Jun 2010 18:11:29 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2608</guid>
		<description>I often come across the following type of arguments in research papers: You could save 3 bits of storage for every value in your database. Surely that&amp;#8217;s irrelevant. Nobody cares about saving 3 bits! You can sort arrays in 10 ms. Surely, that cannot be improved upon? You are already down to 10 ms and [...]</description>
			<content:encoded><![CDATA[<p>I often come across the following type of arguments in research papers:</p>
<ul>
<li>You could save 3 bits of storage for every value in your database. Surely that&#8217;s irrelevant. Nobody cares about saving 3 bits!</li>
<li>You can sort arrays in 10 ms. Surely, that cannot be improved upon? You are already down to 10 ms and nobody cares about such small delays.</li>
</ul>
<p>I hope you can see what is wrong with these statements?</p>
<p>I call it the <strong>fallacy of absolute numbers:</strong> you express a measure or a gain in absolute value, and then conclude that the result is optimal or nearly optimal because the number appears small (or large).</p>
<p><strong>Remember:</strong> Saving 3 bits of storage out of 6 bits is a 2:1 compression ratio. Sorting in 5 ms instead of 10 ms doubles the speed.</p>
<p><strong>Disclaimer:</strong> I am sure that someone else has documented this fallacy, but I could not find any reference to it.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=nDVbo2teE7o:VESx0Z5x9HM:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=nDVbo2teE7o:VESx0Z5x9HM:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/nDVbo2teE7o" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/06/18/the-fallacy-of-absolute-numbers/feed/</wfw:commentRss>
		<slash:comments>5</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/06/18/the-fallacy-of-absolute-numbers/</feedburner:origLink></item>
		<item>
		<title>Indexing XML</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/ltONA9z5S2g/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/06/16/indexing-xml/#comments</comments>
		<pubDate>Wed, 16 Jun 2010 14:02:21 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Data Warehousing and OLAP]]></category>
		<category><![CDATA[Science and Technology]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2604</guid>
		<description>I&amp;#8217;d like to know a lot more about XML indexing—if only because I really ought to be teaching this topic. So I decided to write a blog post to expose what I know, hoping that some knowledgeable readers will fill me in on what I am missing. Mostly, I expect we are interested in indexing [...]</description>
			<content:encoded><![CDATA[<p>I&#8217;d like to know a lot more about XML indexing—if only because I really ought to be teaching this topic. So I decided to write a blog post to expose what I know, hoping that some knowledgeable readers will fill me in on what I am missing.</p>
<p>Mostly, I expect we are interested in indexing <a href="http://en.wikipedia.org/wiki/XPath">XPath</a> queries. Not only is XPath useful on its own, but it is also the basis for the <a href="http://en.wikipedia.org/wiki/FLWOR">FLWOR</a> expressions in <a href="http://en.wikipedia.org/wiki/XQuery">XQuery</a>.</p>
<p>A typical XPath expression will select only a small fraction of any XML document (such as the value of a particular attribute). Thus, a sensible strategy is to represent the XML documents as tables. There are several possible maps from XML documents to tables. One of the most common  is ORDPATH.</p>
<p>In the ORDPATH model, the root node receives the identifier 1, the first node contained in the root node receives the identifier 1.1, the second one receives the identifier 1.2, and so on. Given the ORDPATH identifiers, we can easily determine whether two nodes are neighbors, or whether they have a child-parent relationship.</p>
<p>As an example, here&#8217;s an XML document and its (simplified) ORDPATH representation:</p>
<p><code><br />
&lt;liste temps="janvier" &gt;<br />
&lt;bateau /&gt;<br />
&lt;bateau &gt;<br />
&lt;canard /&gt;<br />
&lt;/bateau&gt;<br />
&lt;/liste&gt;<br />
</code></p>
<table border="1">
<tbody>
<tr>
<th>ORDPATH</th>
<th>name</th>
<th>type</th>
<th>value</th>
</tr>
<tr>
<td>1</td>
<td>liste</td>
<td>element</td>
<td>-</td>
</tr>
<tr>
<td>1.1</td>
<td>temps</td>
<td>attribute</td>
<td>janvier</td>
</tr>
<tr>
<td>1.2</td>
<td>bateau</td>
<td>element</td>
<td>-</td>
</tr>
<tr>
<td>1.3</td>
<td>bateau</td>
<td>element</td>
<td>-</td>
</tr>
<tr>
<td>1.3.1</td>
<td>canard</td>
<td>element</td>
<td>-</td>
</tr>
</tbody>
</table>
<p>Given a table, we can easily index it using standard indexes such as B trees or hash tables. For example, if we index the value column, we can quickly process the XPath expression  @temps="janvier".</p>
<p>Effectively, we can map XPath and XQuery queries into SQL. This leaves relatively little room for XML-specific indexes. I am certain that XML database designers have even smarter strategies, but do they work significantly better?</p>
<p><strong>Reference</strong>: P. O’Neil, et al. <a href="http://www.cs.umb.edu/~poneil/ordpath.pdf">ORDPATHs: insert-friendly XML node labels</a>. 2004.</p>
<p><strong>Further reading</strong>: <a href="http://www.daniel-lemire.com/blog/archives/2008/12/04/native-xml-databases-have-they-taken-the-world-over-yet/">Native XML databases: have they taken the world over yet?</a></p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=ltONA9z5S2g:shdE_vkrYcE:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=ltONA9z5S2g:shdE_vkrYcE:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/ltONA9z5S2g" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/06/16/indexing-xml/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/06/16/indexing-xml/</feedburner:origLink></item>
		<item>
		<title>Lack of steady trajectories and failure</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/_l6qi9ppnt4/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/06/14/lack-of-steady-trajectories-and-failure/#comments</comments>
		<pubDate>Mon, 14 Jun 2010 16:48:03 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2598</guid>
		<description>A common piece of advice given out to young researchers is to find a niche. (See Michael&amp;#8217;s Branding Your Research). That is certainly good advice. Instead of being another young researcher, you can be the new guy working on topic X. But it always seems to happen no matter what: most Ph.D. theses address a narrow topic. I [...]</description>
			<content:encoded><![CDATA[<p>A common piece of advice given out to young researchers is to find a niche. (See Michael&#8217;s <a href="http://mybiasedcoin.blogspot.com/2010/06/branding-your-research-and-yourself.html">Branding Your Research</a>). That is certainly good advice. Instead of being another young researcher, you can be the new guy working on topic X. But it always seems to happen no matter what: most Ph.D. theses address a narrow topic. I believe that the real advice people would like to give is: find yourself a nice topic, and make sure this topic becomes <strong>fashionable</strong>. Of course, this implies that you can somehow predict the future, or have a thesis supervisor with enough clout that he can either initiate new trends, or have inside knowledge regarding the upcoming trends.</p>
<p>A more interesting question is what you should do with the rest of your career, assuming you landed a research job, somehow. Should you find yourself one or two niche topics and stay there for the rest of your life? That is a common strategy. You save precious time: instead of having to skim 100 research articles a year, you may get by with 20 or 30 research articles, or even less. Moreover, because you are the leading authority on one or two topics, you can never be caught unaware. You never have to worry about finding new topics: you just keep on iteratively improving whatever you are doing right now. With some luck, you can reuse your funding proposals year after year. Finally, you can quickly get to know everyone that matters regarding these narrow topics. And that is a perfectly good strategy.</p>
<p>The problems begin when we associate <strong>the lack of a steady trajectory with failure</strong>. <strong>Encouraging static research topics leads to conservatism.</strong> Meanwhile, some of the most innovative researchers have cultivated varied interests. Von Neumann was a set theorist, but <a href="http://stepanov.lk.net/mnemo/legende.html">he wrote 20 papers in Physics</a>, and even in Mathematics, he covered a wide range of topics (set theory, logic, topological groups, measure theory, ergodic theory, operator theory, and continuous geometry). Would we have been better off had von Neumann remained a pure set theorist?</p>
<p>And I tend to have more trust in researchers who have their eggs in different baskets. They can afford to be a bit more critical.</p>
<p><strong>Warning:</strong> I am not urging Ph.D. students to change topic repeatedly while writing up their thesis. Finish whatever you start. And be aware that approaching a new research topic can be costly.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=_l6qi9ppnt4:rX1ux8Xw0ac:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=_l6qi9ppnt4:rX1ux8Xw0ac:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/_l6qi9ppnt4" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/06/14/lack-of-steady-trajectories-and-failure/feed/</wfw:commentRss>
		<slash:comments>10</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/06/14/lack-of-steady-trajectories-and-failure/</feedburner:origLink></item>
		<item>
		<title>Academic publishing is archaic</title>
		<link>http://feedproxy.google.com/~r/daniel-lemire/atom/~3/5hLG2IFp0mI/</link>
		<comments>http://www.daniel-lemire.com/blog/archives/2010/06/10/academic-publishing-is-archaic/#comments</comments>
		<pubDate>Thu, 10 Jun 2010 13:57:35 +0000</pubDate>
		<dc:creator>Daniel Lemire</dc:creator>
				<category><![CDATA[Academia/Research]]></category>

		<guid isPermaLink="false">http://www.daniel-lemire.com/blog/?p=2593</guid>
		<description>Technological progress tends to increase the available information. Thus, our capacity to manage this information becomes overloaded (hence the term information overload). As Clay Shirky explained: it is not so much an information overload, as a filter failure. The abundance of information is never a problem. The real problem is the lack of efficient strategies [...]</description>
			<content:encoded><![CDATA[<p>Technological progress tends to increase the available information. Thus, our capacity to manage this information becomes overloaded (hence the term <a href="http://en.wikipedia.org/wiki/Information_overload">information overload</a>). As Clay Shirky <a href="http://web2expo.blip.tv/file/1277460/">explained</a>: it is not so much an information overload, as a filter failure. The abundance of information is never a problem. The real problem is the lack of efficient strategies to index, summarize, filter, cross-reference and archive information.</p>
<p>But information overload is nothing new. In <a href="http://muse.jhu.edu/journals/journal_of_the_history_of_ideas/v064/64.1blair.html">Reading Strategies for Coping With Information Overload ca. 1550-1700</a>, Blair surveys the techniques our ancestors invented to cope with the abundance of books :</p>
<ul>
<li>the alphabetical index;</li>
<li>the reference book;</li>
<li>copy and paste (with actual scissors) to save time in note-taking.</li>
</ul>
<p>What I find fascinating is the historical perspective: while still useful, the alphabetical index is hardly exciting anymore. It has been supplanted by full text search (in e-books). There are still reference books (such as dictionaries), but they are being replaced with online tools. Information overload continues to generate many inventions: the search engine (such as Google), the recommender system (as on Amazon.com), and the social networks (such as Twitter). Literally, these tools expand our minds. We become smarter.</p>
<p>Yet, every time I finish writing a research article, I am amazed at how old fashioned the format is.</p>
<ul>
<li>Research journals still ask for silly metadata such as keywords, even though most researchers rely on full text search.</li>
<li>The format is clearly meant for paper, even though most of my collaborators browse research articles on their computers.</li>
<li>We have silly things like page limitations.</li>
<li>It is excessively difficult to correct or improve a &#8220;published&#8221; article.</li>
</ul>
<p>There is hope. The <a href="http://www.plosone.org/article/info:doi/10.1371/journal.pone.0010663">PLoS One journal</a> presents research articles in an innovative format. The article is interactive: anyone can rate and comment on it. Many journals allow the authors to upload supplementary material. Yet, I predict that in 20 years, we will look back and think that academic publishing in 2010 was archaic. (I admit that it is not a daring prediction.) There is much room for innovation.</p>
<p><strong>Source:</strong> <a href="http://erikduval.wordpress.com/about/">Erik Duval</a>.</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/daniel-lemire/atom?a=5hLG2IFp0mI:0EZeNXUr70k:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/daniel-lemire/atom?i=5hLG2IFp0mI:0EZeNXUr70k:D7DqB2pKExk" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/daniel-lemire/atom/~4/5hLG2IFp0mI" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://www.daniel-lemire.com/blog/archives/2010/06/10/academic-publishing-is-archaic/feed/</wfw:commentRss>
		<slash:comments>21</slash:comments>
		<feedburner:origLink>http://www.daniel-lemire.com/blog/archives/2010/06/10/academic-publishing-is-archaic/</feedburner:origLink></item>
	</channel>
</rss>
