<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/rss2full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" version="2.0">

<channel>
	<title>Grant's Grunts: Lucene Edition</title>
	
	<link>http://lucene.grantingersoll.com</link>
	<description>Thoughts on Apache Lucene, Mahout, Solr, Tika and Nutch</description>
	<lastBuildDate>Mon, 06 Feb 2012 12:07:52 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.3.1</generator>
		<atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.feedburner.com/GrantLucene" /><feedburner:info uri="grantlucene" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><item>
		<title>Looking for a Research Engineer</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/fRDjtp7mSi4/</link>
		<comments>http://lucene.grantingersoll.com/2012/02/06/looking-for-a-research-engineer/#comments</comments>
		<pubDate>Mon, 06 Feb 2012 12:07:52 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Lucene]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=470</guid>
		<description><![CDATA[I&#8217;m looking for a Research Engineer with Hadoop and Solr experience to work on next generation search and big data problems.  If you are interested or know someone who is, please take a look at Careers &#8211; Research Engineer &#124; Lucid Imagination.]]></description>
			<content:encoded><![CDATA[<p>I&#8217;m looking for a Research Engineer with Hadoop and Solr experience to work on next generation search and big data problems.  If you are interested or know someone who is, please take a look at <a href="http://www.lucidimagination.com/about/careers/research-engineer">Careers &#8211; Research Engineer | Lucid Imagination</a>.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/fxMHcpcLKprvwChIezx6JEMXZ_Q/0/da"><img src="http://feedads.g.doubleclick.net/~a/fxMHcpcLKprvwChIezx6JEMXZ_Q/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/fxMHcpcLKprvwChIezx6JEMXZ_Q/1/da"><img src="http://feedads.g.doubleclick.net/~a/fxMHcpcLKprvwChIezx6JEMXZ_Q/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/fRDjtp7mSi4" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2012/02/06/looking-for-a-research-engineer/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2012/02/06/looking-for-a-research-engineer/</feedburner:origLink></item>
		<item>
		<title>Berlin Buzzwords 2012</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/kBaQPPHjQW0/</link>
		<comments>http://lucene.grantingersoll.com/2012/01/18/berlin-buzzwords-2012/#comments</comments>
		<pubDate>Wed, 18 Jan 2012 13:33:40 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Lucene]]></category>
		<category><![CDATA[Mahout]]></category>
		<category><![CDATA[Solr]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=467</guid>
		<description><![CDATA[In case you haven&#8217;t heard, and are in Europe this June (or want to be), you should check out the Berlin Buzzwords conference.  It&#8217;s a great conference for all things related to Lucene, Solr, Hadoop, Mahout, NoSQL and generally scaling.  The CFP is open now through March 11.]]></description>
			<content:encoded><![CDATA[<p>In case you haven&#8217;t heard, and are in Europe this June (or want to be), you should check out the <a href="http://www.berlinbuzzwords.de">Berlin Buzzwords</a> conference.  It&#8217;s a great conference for all things related to Lucene, Solr, Hadoop, Mahout, NoSQL and generally scaling.  The CFP is open now through March 11.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/7BZlaLdHQjwcsE2XZwFty2KE00M/0/da"><img src="http://feedads.g.doubleclick.net/~a/7BZlaLdHQjwcsE2XZwFty2KE00M/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/7BZlaLdHQjwcsE2XZwFty2KE00M/1/da"><img src="http://feedads.g.doubleclick.net/~a/7BZlaLdHQjwcsE2XZwFty2KE00M/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/kBaQPPHjQW0" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2012/01/18/berlin-buzzwords-2012/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2012/01/18/berlin-buzzwords-2012/</feedburner:origLink></item>
		<item>
		<title>Taming Text Update</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/8TEjxZz08qM/</link>
		<comments>http://lucene.grantingersoll.com/2011/12/27/taming-text-update/#comments</comments>
		<pubDate>Tue, 27 Dec 2011 13:45:39 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Lucene]]></category>
		<category><![CDATA[OpenNLP]]></category>
		<category><![CDATA[Solr]]></category>
		<category><![CDATA[Taming Text]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=460</guid>
		<description><![CDATA[Drew, Tom and I are feverishly working away on finishing up Taming Text.  We are currently in the process of addressing the feedback we got from our final review and should have updates up soon.  I have also posted all of the book&#8217;s source code up on Github under the Taming Text user.  The source includes, [...]]]></description>
			<content:encoded><![CDATA[<p><img class="alignleft" title="Taming Text book cover" src="http://manning.com/ingersoll/ingersoll_cover150.jpg" alt="" width="150" height="188" /></p>
<p>Drew, Tom and I are feverishly working away on finishing up <a href="http://www.manning.com/affiliate/idevaffiliate.php?id=1069_148">Taming Text</a>.  We are currently in the process of addressing the feedback we got from our final review and should have updates up soon.  I have also posted all of the book&#8217;s source code up on Github under the <a href="http://www.github.com/tamingtext">Taming Text user</a>.  The source includes, amongst other things, a simple Question Answering system using Solr and OpenNLP, as well as analyzers for Lucene that use OpenNLP for sentence detection, part of speech tagging and Named Entity Recognition.  As with most books, these examples are meant to be just that, examples.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/WCum_h39VqAkRC65ovkRUL534DI/0/da"><img src="http://feedads.g.doubleclick.net/~a/WCum_h39VqAkRC65ovkRUL534DI/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/WCum_h39VqAkRC65ovkRUL534DI/1/da"><img src="http://feedads.g.doubleclick.net/~a/WCum_h39VqAkRC65ovkRUL534DI/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/8TEjxZz08qM" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/12/27/taming-text-update/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/12/27/taming-text-update/</feedburner:origLink></item>
		<item>
		<title>Mahout in Action Review</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/Hp9Ilq-J7wE/</link>
		<comments>http://lucene.grantingersoll.com/2011/10/15/mahout-in-action-review/#comments</comments>
		<pubDate>Sat, 15 Oct 2011 16:58:47 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Mahout]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=458</guid>
		<description><![CDATA[&#160; &#160; &#160; I&#8217;ve posted my review of &#8220;Mahout in Action&#8221; on Lucid&#8217;s website: Mahout in Action Review.]]></description>
			<content:encoded><![CDATA[<p>&nbsp;</p>
<p>&nbsp;</p>
<p>&nbsp;</p>
<p>I&#8217;ve posted my review of &#8220;<a href="http://www.manning.com/affiliate/idevaffiliate.php?id=1069_219">Mahout in Action</a>&#8221; on Lucid&#8217;s website: <a href="http://www.lucidimagination.com/blog/2011/10/15/mahout-in-action-review/">Mahout in Action Review</a>.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/3z9X9ZwI5tishKG6pSseBlm1PxA/0/da"><img src="http://feedads.g.doubleclick.net/~a/3z9X9ZwI5tishKG6pSseBlm1PxA/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/3z9X9ZwI5tishKG6pSseBlm1PxA/1/da"><img src="http://feedads.g.doubleclick.net/~a/3z9X9ZwI5tishKG6pSseBlm1PxA/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/Hp9Ilq-J7wE" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/10/15/mahout-in-action-review/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/10/15/mahout-in-action-review/</feedburner:origLink></item>
		<item>
		<title>TriHUG Next Meeting featuring Josh Patterson of Cloudera set for Oct. 11</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/7s_8yMRnhUk/</link>
		<comments>http://lucene.grantingersoll.com/2011/10/07/trihug-next-meeting-featuring-josh-patterson-of-cloudera-set-for-oct-11/#comments</comments>
		<pubDate>Fri, 07 Oct 2011 13:44:14 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Cary]]></category>
		<category><![CDATA[Chapel Hill]]></category>
		<category><![CDATA[Durham]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[Raleigh]]></category>
		<category><![CDATA[TriHUG]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=452</guid>
		<description><![CDATA[&#160; &#160; &#160; Just a few more days until the next Triangle Hadoop User&#8217;s Group meeting.  Get the details and sign up via Triangle Hadoop Users Group, TriHUG Next Meeting featuring Josh Patterson of Cloudera set for Oct. 11.]]></description>
			<content:encoded><![CDATA[<p>&nbsp;</p>
<p>&nbsp;</p>
<p>&nbsp;</p>
<p>Just a few more days until the next Triangle Hadoop User&#8217;s Group meeting.  Get the details and sign up via <a href="http://www.trihug.org/post/10200106608/trihug-next-meeting-featuring-josh-patterson-of">Triangle Hadoop Users Group, TriHUG Next Meeting featuring Josh Patterson of Cloudera set for Oct. 11</a>.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/QR-QFS_7iyB8xS90Mk57daY5AdA/0/da"><img src="http://feedads.g.doubleclick.net/~a/QR-QFS_7iyB8xS90Mk57daY5AdA/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/QR-QFS_7iyB8xS90Mk57daY5AdA/1/da"><img src="http://feedads.g.doubleclick.net/~a/QR-QFS_7iyB8xS90Mk57daY5AdA/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/7s_8yMRnhUk" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/10/07/trihug-next-meeting-featuring-josh-patterson-of-cloudera-set-for-oct-11/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/10/07/trihug-next-meeting-featuring-josh-patterson-of-cloudera-set-for-oct-11/</feedburner:origLink></item>
		<item>
		<title>Lucid Imagination » Flexible ranking in Lucene 4</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/NUzDu5bCMqM/</link>
		<comments>http://lucene.grantingersoll.com/2011/09/12/lucid-imagination-%c2%bb-flexible-ranking-in-lucene-4/#comments</comments>
		<pubDate>Mon, 12 Sep 2011 21:28:15 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Lucene]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=450</guid>
		<description><![CDATA[For those who have wanted other scoring models in Lucene/Solr (Okapi, others) more details can be found on Lucid&#8217;s blog: Lucid Imagination » Flexible ranking in Lucene 4.]]></description>
			<content:encoded><![CDATA[<p>For those who have wanted other scoring models in Lucene/Solr (Okapi, others) more details can be found on Lucid&#8217;s blog: <a href="http://www.lucidimagination.com/blog/2011/09/12/flexible-ranking-in-lucene-4/">Lucid Imagination » Flexible ranking in Lucene 4</a>.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/uZfjcIS6EvYKWRcdkxCMMXJ1loc/0/da"><img src="http://feedads.g.doubleclick.net/~a/uZfjcIS6EvYKWRcdkxCMMXJ1loc/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/uZfjcIS6EvYKWRcdkxCMMXJ1loc/1/da"><img src="http://feedads.g.doubleclick.net/~a/uZfjcIS6EvYKWRcdkxCMMXJ1loc/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/NUzDu5bCMqM" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/09/12/lucid-imagination-%c2%bb-flexible-ranking-in-lucene-4/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/09/12/lucid-imagination-%c2%bb-flexible-ranking-in-lucene-4/</feedburner:origLink></item>
		<item>
		<title>R in Action</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/yhaKU5QQK-M/</link>
		<comments>http://lucene.grantingersoll.com/2011/09/02/r-in-action/#comments</comments>
		<pubDate>Fri, 02 Sep 2011 12:12:24 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Lucene]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=448</guid>
		<description><![CDATA[Just ordered &#8220;R in Action&#8221; from Manning.  Looking forward to learning more about it, as it comes up often when discussing solving smaller problems than what is appropriate for Apache Mahout.  Hopefully, I will have time to post a review in the coming weeks.]]></description>
			<content:encoded><![CDATA[<p>Just ordered &#8220;<a href="http://affiliate.manning.com/idevaffiliate.php?id=1069&amp;url=16">R in Action</a>&#8221; from Manning.  Looking forward to learning more about it, as it comes up often when discussing solving smaller problems than what is appropriate for <a href="http://mahout.apache.org">Apache Mahout</a>.  Hopefully, I will have time to post a review in the coming weeks.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/Qjncf1ggoL6wE8X34kXXSVoEako/0/da"><img src="http://feedads.g.doubleclick.net/~a/Qjncf1ggoL6wE8X34kXXSVoEako/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/Qjncf1ggoL6wE8X34kXXSVoEako/1/da"><img src="http://feedads.g.doubleclick.net/~a/Qjncf1ggoL6wE8X34kXXSVoEako/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/yhaKU5QQK-M" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/09/02/r-in-action/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/09/02/r-in-action/</feedburner:origLink></item>
		<item>
		<title>TriHUG Next Meeting: Sept. 13 @ Bronto Software</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/lfpCY2iR3As/</link>
		<comments>http://lucene.grantingersoll.com/2011/08/28/trihug-next-meeting-sept-13-bronto-software/#comments</comments>
		<pubDate>Sun, 28 Aug 2011 20:30:53 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[TriHUG]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=446</guid>
		<description><![CDATA[Triangle Hadoop Users Group, Next Meeting: Sept. 13 @ Bronto Software. Ted Dunning of Mahout fame will be speaking at the next TriHUG meeting on MapR and its relationship with Hadoop, etc.]]></description>
			<content:encoded><![CDATA[<p><a href="http://www.trihug.org/post/9512860582/next-meeting-sept-13-bronto-software">Triangle Hadoop Users Group, Next Meeting: Sept. 13 @ Bronto Software</a>.</p>
<p>Ted Dunning of Mahout fame will be speaking at the next TriHUG meeting on MapR and its relationship with Hadoop, etc.</p>

<p><a href="http://feedads.g.doubleclick.net/~a/EzP4lwIS0-AaAQBDbuPjOd5cHYA/0/da"><img src="http://feedads.g.doubleclick.net/~a/EzP4lwIS0-AaAQBDbuPjOd5cHYA/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/EzP4lwIS0-AaAQBDbuPjOd5cHYA/1/da"><img src="http://feedads.g.doubleclick.net/~a/EzP4lwIS0-AaAQBDbuPjOd5cHYA/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/lfpCY2iR3As" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/08/28/trihug-next-meeting-sept-13-bronto-software/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/08/28/trihug-next-meeting-sept-13-bronto-software/</feedburner:origLink></item>
		<item>
		<title>SXSW 2012 – Apache Mahout: Bringing Intelligence to Your App</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/_Q7ZTpFRHbE/</link>
		<comments>http://lucene.grantingersoll.com/2011/08/15/sxsw-2012-apache-mahout-bringing-intelligence-to-your-app/#comments</comments>
		<pubDate>Mon, 15 Aug 2011 19:43:38 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[machine learning]]></category>
		<category><![CDATA[Mahout]]></category>
		<category><![CDATA[Map Reduce]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=441</guid>
		<description><![CDATA[It&#8217;s that time of year again: time to vote for SXSW talks.  Last year I did a talk with RC Johnson of BazaarVoice on Solr as NoSQL, this year I thought I would try to fly solo and submitted a talk on Apache Mahout. So, if you are so inclined to do the whole crowdsourcing [...]]]></description>
			<content:encoded><![CDATA[<p><img class="alignright" title="SXSW Panel Picker" src="http://panelpicker.sxsw.com/img/sxsw/my_SXSW_idea_2012.png" alt="" width="200" height="120" />It&#8217;s that time of year again: time to vote for <a href="http://www.sxsw.com">SXSW</a> talks.  Last year I did a talk with RC Johnson of <a href="http://www.bazaarvoice.com">BazaarVoice</a> on Solr as NoSQL, this year I thought I would try to fly solo and submitted a talk on <a href="http://mahout.apache.org">Apache Mahout</a>.</p>
<p>So, if you are so inclined to do the whole crowdsourcing thing, please go vote for my talk at <a href="http://panelpicker.sxsw.com/ideas/view/9001">SXSW 2012 &#8211; Apache Mahout: Bringing Intelligence to Your App</a> and then maybe I will see you at SXSW in 2012.</p>
<p>&nbsp;</p>

<p><a href="http://feedads.g.doubleclick.net/~a/whKITFkXH5GeiR-3UKcWDs9pfoE/0/da"><img src="http://feedads.g.doubleclick.net/~a/whKITFkXH5GeiR-3UKcWDs9pfoE/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/whKITFkXH5GeiR-3UKcWDs9pfoE/1/da"><img src="http://feedads.g.doubleclick.net/~a/whKITFkXH5GeiR-3UKcWDs9pfoE/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/_Q7ZTpFRHbE" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/08/15/sxsw-2012-apache-mahout-bringing-intelligence-to-your-app/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/08/15/sxsw-2012-apache-mahout-bringing-intelligence-to-your-app/</feedburner:origLink></item>
		<item>
		<title>Mahout and Other News</title>
		<link>http://feedproxy.google.com/~r/GrantLucene/~3/0Z2yY6Flmwg/</link>
		<comments>http://lucene.grantingersoll.com/2011/08/05/mahout-and-other-news/#comments</comments>
		<pubDate>Fri, 05 Aug 2011 20:41:35 +0000</pubDate>
		<dc:creator>grant_ingersoll</dc:creator>
				<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[Lucene]]></category>
		<category><![CDATA[Mahout]]></category>
		<category><![CDATA[Solr]]></category>

		<guid isPermaLink="false">http://lucene.grantingersoll.com/?p=438</guid>
		<description><![CDATA[After some time away, I&#8217;m happy to have had some time recently to work on Mahout again.  Lots of goodness all over the place happening there that I&#8217;ll leave to others to explain while I focus in on a few recent things I&#8217;ve been doing. First off, I was doing a fair amount of work [...]]]></description>
			<content:encoded><![CDATA[<p>After some time away, I&#8217;m happy to have had some time recently to work on Mahout again.  Lots of goodness all over the place happening there that I&#8217;ll leave to others to explain while I focus in on a few recent things I&#8217;ve been doing.</p>
<p>First off, I was doing a fair amount of work calculating document similarities across whole collections using, at first, the RowSimilarityJob and later a map-side simplification I wrote that uses the distributed cache called the VectorDistanceSimilarityJob.  Both of these come in handy when one wants to calculate pairwise-similarity between all (or most) items in a collection.  The original Mahout implementation was focused on providing recommendations, but as outlined in the <a href="http://www.umiacs.umd.edu/~jimmylin/publications/Elsayed_etal_ACL2008_short.pdf">Elsayed, Lin and Oard paper</a>, it is quite useful for text as well in cases where one wants to precompute &#8220;more like this&#8221; for all documents.  As for the need for two similar approaches, see the discussion at <a href="http://www.lucidimagination.com/search/document/40c4f124795c6b5/rowsimilarity_s#42ab816c27c6a9e7">http://www.lucidimagination.com/search/document/40c4f124795c6b5/rowsimilarity_s#42ab816c27c6a9e7</a>.  In essence, it boils down to I didn&#8217;t need a fully generic implementation that was a bit slower on larger matrices since I mainly wanted to compare all my vectors in HDFS against a subset of &#8220;core&#8221; vectors that fit into memory.  That being said, <a href="http://ssc.io/rowsimilarityjob-on-steroids/">Sebastian</a> is already hard at work on making the more generic version perform better when certain distance measures are used while still offering the full suite of capabilities of the existing RowSimilarityJob.  See <a href="https://issues.apache.org/jira/browse/MAHOUT-767">MAHOUT-767</a> for more info on that work.</p>
<p>Now, I&#8217;m looking into some more pruning techniques via <a href="https://issues.apache.org/jira/browse/MAHOUT-688">MAHOUT-688</a>.  After that quick patch, I think I&#8217;m going to dig in a bit more to recommendations as well as run some tests on the ASF mail archives I posted a while back (see below for an update).</p>
<p>Also, I&#8217;ve switched to using Git and Github for managing my Mahout changes (as well as other work), so if you want to see what I&#8217;m up to, <a href="https://github.com/gsingers/">check out my Github</a> account.</p>
<p>It&#8217;s not complete yet, but the ASF Public Mail archive I put up <a href="https://s3.amazonaws.com/asf-mail-archives/index.html">last September</a> on Amazon AWS is getting a fresh new version.  The interim solution is available at <a href="https://s3.amazonaws.com/asf-mail-archives-7-18-2011/index.html">https://s3.amazonaws.com/asf-mail-archives-7-18-2011/index.html</a>, but look for it to be a <a href="http://aws.amazon.com/datasets">Public Data Set</a> hosted by Amazon soon.  The September version of this data contained roughly 6.7M emails sent to the public mailing lists at the Apache Software Foundation, so I suspect this version has somewhere in the 7M+ item range, but I haven&#8217;t counted them.  At any rate, I hope it is useful to people.</p>
<p>Finally, on a personal note, I&#8217;m back at <a href="http://www.lucidimagination.com">Lucid Imagination</a> after a brief move elsewhere, this time in a new role as Chief Scientist.  Lucid is a company I co-founded and helped build up for the past 4 years.  I&#8217;m looking forward to being back working closely with Lucene and Solr again and a <a href="http://www.lucidimagination.com/why-lucid/leadership">top notch technical team</a>.  I&#8217;m also looking forward to working on Mahout more, as well as other technologies like Hadoop, Pig, HBase and the like, especially as they relate to search and recommendations.</p>
<p>&nbsp;</p>

<p><a href="http://feedads.g.doubleclick.net/~a/sOBl1R7GosNQOkQjZ93V7IHlIeo/0/da"><img src="http://feedads.g.doubleclick.net/~a/sOBl1R7GosNQOkQjZ93V7IHlIeo/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/sOBl1R7GosNQOkQjZ93V7IHlIeo/1/da"><img src="http://feedads.g.doubleclick.net/~a/sOBl1R7GosNQOkQjZ93V7IHlIeo/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/GrantLucene/~4/0Z2yY6Flmwg" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://lucene.grantingersoll.com/2011/08/05/mahout-and-other-news/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://lucene.grantingersoll.com/2011/08/05/mahout-and-other-news/</feedburner:origLink></item>
	</channel>
</rss>

