<?xml version='1.0' encoding='UTF-8'?><rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/" xmlns:blogger="http://schemas.google.com/blogger/2008" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" version="2.0"><channel><atom:id>tag:blogger.com,1999:blog-818850914311213273</atom:id><lastBuildDate>Sun, 07 Jun 2026 13:23:43 +0000</lastBuildDate><category>java</category><category>hadoop</category><category>web 2.0</category><category>REST</category><category>programming</category><category>internet</category><category>opensocial</category><category>database</category><category>distributed systems</category><category>javascript</category><category>open source</category><category>algorithms</category><category>cloud computing</category><category>semantic</category><category>web</category><category>webservice</category><category>apache</category><category>architecture</category><category>big data</category><category>codes</category><category>experiments</category><category>groovy</category><category>json</category><category>ria</category><category>API</category><category>adobe air</category><category>ajax</category><category>comet</category><category>data access</category><category>data integration</category><category>designpatterns</category><category>eclipse</category><category>etl</category><category>graph</category><category>hibernate</category><category>hive</category><category>indexing</category><category>information</category><category>linux</category><category>lucene</category><category>map reduce</category><category>nosql</category><category>scala</category><category>security</category><category>soa</category><category>spring</category><category>technical document</category><category>technology</category><category>ui</category><category>web 
3.0</category><category>widget</category><category>.net</category><category>ASP.Net</category><category>BPM</category><category>GWT</category><category>JCR</category><category>JVM</category><category>OpenID</category><category>RCP</category><category>RPC</category><category>WCF</category><category>XML</category><category>ambient</category><category>aop</category><category>avro</category><category>bigdata</category><category>bloomfilter</category><category>business</category><category>closures</category><category>cybernetics</category><category>e-commerce</category><category>eii</category><category>event processing</category><category>extjs</category><category>future computing</category><category>gadgets</category><category>geo-spatial</category><category>github</category><category>google</category><category>griffon</category><category>guice</category><category>hbase</category><category>ilm</category><category>information graphics</category><category>intelligence</category><category>invisible</category><category>iphone</category><category>jBPM</category><category>jibx</category><category>jquery</category><category>networks</category><category>oAuth</category><category>objectiveC</category><category>open solaris</category><category>orient-db</category><category>orientdb</category><category>orm</category><category>osgi</category><category>parallel</category><category>patterns</category><category>pramati</category><category>prometheus</category><category>protocol 
buffers</category><category>rdf</category><category>recommendation</category><category>ruby</category><category>silverlight</category><category>sling</category><category>social</category><category>spark</category><category>standards</category><category>storage</category><category>tech</category><category>ted</category><category>twitter</category><category>unicode</category><category>ux</category><category>zeppelin</category><title>#bytescrolls</title><description></description><link>http://bytescrolls.blogspot.com/</link><managingEditor>noreply@blogger.com (Unknown)</managingEditor><generator>Blogger</generator><openSearch:totalResults>90</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-1614526892468295106</guid><pubDate>Wed, 26 Apr 2017 21:29:00 +0000</pubDate><atom:updated>2017-04-27T02:59:07.450+05:30</atom:updated><title>Run Datastax Graph in a docker container for windows hosts </title><description>#docker #datastax&lt;br /&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;span style=&quot;text-align: justify;&quot;&gt;There are some docker images available &lt;/span&gt;&lt;a href=&quot;https://hub.docker.com/r/luketillman/datastax-enterprise/tags/&quot; style=&quot;text-align: justify;&quot; target=&quot;_blank&quot;&gt;here&lt;/a&gt;&lt;span style=&quot;text-align: justify;&quot;&gt;.&amp;nbsp;If you don&#39;t have docker, download and install docker toolbox from &lt;/span&gt;&lt;a href=&quot;https://www.docker.com/products/docker-toolbox&quot; style=&quot;text-align: justify;&quot; target=&quot;_blank&quot;&gt;here&lt;/a&gt;&lt;span style=&quot;text-align: justify;&quot;&gt; for your system. If you are using a windows enterprise version, &amp;nbsp;&lt;a href=&quot;https://docs.docker.com/docker-for-windows/install/#download-docker-for-windows&quot; target=&quot;_blank&quot;&gt;docker on windows&lt;/a&gt; may not be supported on older versions. So you may have to upgrade and use Windows native virtualisation&amp;nbsp;Hyper V. For docker toolbox, you have to rely on others like virtualbox. &amp;nbsp;Also, note that if you have enabled Windows HyperV, you may have to disable it for docker toolbox to install. On the elevated command line you may execute the following to disable the HyperV (only for docker toolbox).&lt;/span&gt;&lt;/div&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
dism.exe /Online /Disable-Feature:Microsoft-Hyper-V-All&lt;/blockquote&gt;
After installing docker in your windows box run this from the docker bash,&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
docker run --name my-dse -d -p 9042:9042 luketillman/datastax-enterprise:5.1.0 -g&lt;/blockquote&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This will download the image from the docker hub. Thanks to &lt;a href=&quot;https://github.com/LukeTillman&quot; target=&quot;_blank&quot;&gt;Luke Tillman&lt;/a&gt; from Datastax. For your java/scala code or client or Dev Studio to work make sure that port 9042 is exposed. Also, the -g option will run the graph. To bash into gremlin or the container, execute the following,&lt;/div&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
docker exec -it my-dse bash&lt;/blockquote&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
You can log in to the server and access gremlin from /usr/bin/dse gremlin-console and create graphs etc. Note that this image supports only a single node. To connect to the dse node from the windows host, you have to fetch the ip of the container, so run:&lt;/div&gt;
&lt;blockquote class=&quot;tr_bq&quot; style=&quot;text-align: justify;&quot;&gt;
&amp;nbsp;docker-machine env default&lt;/blockquote&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The value of DOCKER_HOST can be used by the client code to connect to cassandra graph as the endpoint.&amp;nbsp;&lt;/div&gt;
&lt;br /&gt;
More can be read from &lt;a href=&quot;https://github.com/LukeTillman/dse-docker/blob/master/README.md&quot; target=&quot;_blank&quot;&gt;here&lt;/a&gt;.</description><link>http://bytescrolls.blogspot.com/2017/04/run-datastax-graph-in-docker-container.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-4505445523623941352</guid><pubDate>Fri, 15 Apr 2016 10:31:00 +0000</pubDate><atom:updated>2016-04-15T16:56:16.909+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">e-commerce</category><category domain="http://www.blogger.com/atom/ns#">etl</category><category domain="http://www.blogger.com/atom/ns#">graph</category><category domain="http://www.blogger.com/atom/ns#">orientdb</category><category domain="http://www.blogger.com/atom/ns#">recommendation</category><title>A simple recommender system for your e-commerce store using a graph database</title><description>#graph #recommendation #orientdb #e-commerce #etl&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
In the last post, I have introduced you to a simple ETL use case for graph database like Orient DB. If you haven’t read it, I suggest you read this - &lt;a href=&quot;http://bytescrolls.blogspot.in/2016/04/orient-db-simple-etl-use-case-note.html&quot;&gt;OrientDB A simple use case note&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
After loading data, you might want to play around with the graph structure and its possible traversal logic. As it is easy to represent the semantic relationships between the entities, the queries we write will also be designed based on the logic we come up with. In the last post, I provided the query to find out the books bought by the buyers a given buyer knows or has befriended. In this post, I will provide some more simple examples to query such a graph in Orient DB. Here I am using the native query language supported by the database.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;How do we find out the books bought by a buyer named ‘Hary’?&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;br /&gt;
&lt;pre&gt;&lt;code&gt;select @rid, title from (select expand(out(&#39;Bought&#39;)) from Buyer where name=&#39;Hary&#39;)
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgu5-9qTujZ7O1SKvaPzNiMApViKYCF9X4lBdIlQG7Cf-K9MeL8qW3_vTLExTYvjanLCRsdLDEzXJcUtsCoqxA-5Z8yDJLmErU_KgJ3f7lRFfunAjzg1A5_1Po9rjp-SX5Pg5CTq2wbs5s/s1600/Screen+Shot+2016-04-15+at+3.53.30+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;224&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgu5-9qTujZ7O1SKvaPzNiMApViKYCF9X4lBdIlQG7Cf-K9MeL8qW3_vTLExTYvjanLCRsdLDEzXJcUtsCoqxA-5Z8yDJLmErU_KgJ3f7lRFfunAjzg1A5_1Po9rjp-SX5Pg5CTq2wbs5s/s640/Screen+Shot+2016-04-15+at+3.53.30+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Here this query will return the RecordId in the &amp;lt;&amp;lt;cluster: position&amp;gt;&amp;gt; format. In OrientDB each record has its own self-assigned unique ID within the database called Record ID or RID. cluster-id is the id of the cluster and cluster-position is the position of the record inside the cluster. You can consider a cluster as a Table where the records of each class (say, Buyer) are stored. Here the subquery uses the expand function to expand the collection in the field and use it as the result. It will fetch the records linked by the edge ‘Bought’.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;How do we find out the people ‘Hary’ knows?&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;br /&gt;
&lt;pre&gt;&lt;code&gt;select expand(out(&#39;Knows&#39;)) from  Buyer where name=&#39;Hary&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjSnSpry7aqkiuc6YXaaKyWfulWMQuzi4Ti3m8bZ_VqyJQ5ejBERBOAIjTYYPBNItLbjLhsVmdd0at-CaGiXSr_worU0wEzhEOxMbDG_zVnsKiB1nfNGInIju6OyFiduH5gmKdh9XHbgEM/s1600/Screen+Shot+2016-04-15+at+3.52.44+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;238&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjSnSpry7aqkiuc6YXaaKyWfulWMQuzi4Ti3m8bZ_VqyJQ5ejBERBOAIjTYYPBNItLbjLhsVmdd0at-CaGiXSr_worU0wEzhEOxMbDG_zVnsKiB1nfNGInIju6OyFiduH5gmKdh9XHbgEM/s640/Screen+Shot+2016-04-15+at+3.52.44+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Find out the books bought by friends of Hary?&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;br /&gt;
&lt;pre&gt;&lt;code&gt;select title from (
select expand(out(&#39;Bought&#39;)) from (select expand(out(&#39;Knows&#39;)
) from  Buyer where name=&#39;Hary&#39;))
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjwiJ6PgdPrj1OGCFwI6Ws3GHxnlKPsqXOdF-iOsATt7R1e5wp0klKWb9t6EyWMPfqqRW4CaXAt10n0YFPfX5984Wsm5K3rpIHaVe5t6cGG68LUoq1Dh1-UaLt_l7DRLbqkcz2XUvP6hbA/s1600/Screen+Shot+2016-04-15+at+3.51.50+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;240&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjwiJ6PgdPrj1OGCFwI6Ws3GHxnlKPsqXOdF-iOsATt7R1e5wp0klKWb9t6EyWMPfqqRW4CaXAt10n0YFPfX5984Wsm5K3rpIHaVe5t6cGG68LUoq1Dh1-UaLt_l7DRLbqkcz2XUvP6hbA/s640/Screen+Shot+2016-04-15+at+3.51.50+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Here we combined both of the queries above to make a logical decision, as the interlinking of vertices is clearly identified.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Find out books bought by Hary but not  by his friends, so that we can recommend some?&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;pre&gt;&lt;code&gt;select title from (select expand(out(&#39;Bought&#39;)) from Buyer where name=&#39;Hary&#39;) 
let $temp = (
  select title from (
    select expand(out(&#39;Bought&#39;)) from (
      select expand(out(&#39;Knows&#39;)) from  Buyer where name=&#39;Hary&#39;
    )
  )
)where title not in $temp
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEivct9T6He_J0HgYXzacq4h8BU98jaY-R4wXRBMOuoESiVZsvaO74__WYNqZn_s040oZmO4xOOf0b-l5S7UeAuc5RL8BTP-KU_VQGe6J1RSy6wl7R2hLAAbcVi5MBU4N4zLHjgeBkpS0RA/s1600/Screen+Shot+2016-04-15+at+3.50.38+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;236&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEivct9T6He_J0HgYXzacq4h8BU98jaY-R4wXRBMOuoESiVZsvaO74__WYNqZn_s040oZmO4xOOf0b-l5S7UeAuc5RL8BTP-KU_VQGe6J1RSy6wl7R2hLAAbcVi5MBU4N4zLHjgeBkpS0RA/s640/Screen+Shot+2016-04-15+at+3.50.38+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Here we used LET to assign the results of a subquery. In the subquery, we find the books bought by Hary’s friends. Then we find the books bought by Hary but not by friends.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Find out the books bought by people who also bought a book like The Shining? &lt;/b&gt;This is a common use case for recommendation links, where we may want to list the similar products bought by people who are about to buy the displayed product.&lt;br /&gt;
&lt;br /&gt;
&lt;pre&gt;&lt;code&gt;select expand(inE(&#39;Bought&#39;).outV().OutE(&#39;Bought&#39;).inV().title) 
from Book where title not in [&#39;The Shining&#39;]
&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiOGdwp7KaMWLFy56h8SJEYxGYXmoKbJNHwOYLBVEZa1aCkZh5_1M_8kggVf5dtMKlsZTuCBRhNTeYMTDro8qmsCoMEtCIjpNvm3uP0gnf6NtHQl0KHxCta3oAFiIfdcvwzU1t2pa2pZJk/s1600/Screen+Shot+2016-04-15+at+3.45.52+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;242&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiOGdwp7KaMWLFy56h8SJEYxGYXmoKbJNHwOYLBVEZa1aCkZh5_1M_8kggVf5dtMKlsZTuCBRhNTeYMTDro8qmsCoMEtCIjpNvm3uP0gnf6NtHQl0KHxCta3oAFiIfdcvwzU1t2pa2pZJk/s640/Screen+Shot+2016-04-15+at+3.45.52+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
</description><link>http://bytescrolls.blogspot.com/2016/04/a-simple-recommender-system-for-your-e.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgu5-9qTujZ7O1SKvaPzNiMApViKYCF9X4lBdIlQG7Cf-K9MeL8qW3_vTLExTYvjanLCRsdLDEzXJcUtsCoqxA-5Z8yDJLmErU_KgJ3f7lRFfunAjzg1A5_1Po9rjp-SX5Pg5CTq2wbs5s/s72-c/Screen+Shot+2016-04-15+at+3.53.30+PM.png" height="72" width="72"/><thr:total>1</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-5796893949844149177</guid><pubDate>Tue, 12 Apr 2016 17:11:00 +0000</pubDate><atom:updated>2016-04-12T22:43:59.302+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">database</category><category domain="http://www.blogger.com/atom/ns#">etl</category><category domain="http://www.blogger.com/atom/ns#">graph</category><category domain="http://www.blogger.com/atom/ns#">java</category><category domain="http://www.blogger.com/atom/ns#">orient-db</category><title>Orient DB - A simple ETL use case note</title><description>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
#orientdb #graph #etl #java #database
&lt;br /&gt;
&lt;br /&gt;
Anyone who is familiar with graph data structures would like to know how we can map real-world models to a graph and process them. If you are trying to build them programmatically and approach them using traversal algorithms, you are going to have a hard time. If your application uses a relational database to store data mapped to these models, then it will become complex while trying to link them with more relationships. How will you design the relationships between domains in a more semantic way? How would you query them using a SQL-like language or a DSL? Graph databases should be the right candidate. Here I am trying to test out&amp;nbsp;&lt;b&gt;Orient DB&lt;/b&gt;.&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;http://www.silverstripe.org/assets/blog/_resampled/resizedimage20083-OrientdbLogo.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;http://www.silverstripe.org/assets/blog/_resampled/resizedimage20083-OrientdbLogo.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
In relational databases, we have primary and foreign-key column references that support joins, which are computed at query time and are memory and compute intensive. Also, we use junction tables for many-to-many relationships with highly normalized tables, which will increase the query execution time and complexity. Graph databases are like relational databases, but with first class support for “relationships” defined by edges (stored as a list) connecting nodes (vertex/entity). Whenever you run a join operation, the database just uses this materialized list and has direct access to the connected nodes, eliminating the need for an expensive search / match computation.&lt;br /&gt;
&lt;br /&gt;
Consider following tables,&lt;br /&gt;
&lt;br /&gt;
&lt;u&gt;Author Table&lt;/u&gt;&lt;br /&gt;
&lt;table border=&quot;1&quot;&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;id&lt;/th&gt;
&lt;th&gt;name&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;Stephen King&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;George R. R. Martin&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;John Grisham&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;u&gt;Book Table&lt;/u&gt;&lt;br /&gt;
&lt;table border=&quot;1&quot;&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;id&lt;/th&gt;
&lt;th&gt;author_id&lt;/th&gt;
&lt;th&gt;title&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;Carrie&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;The Shining&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;u&gt;Buyer Table&lt;/u&gt;&lt;br /&gt;
&lt;table border=&quot;1&quot;&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;id&lt;/th&gt;
&lt;th&gt;name&lt;/th&gt;
&lt;th&gt;knows_id&lt;/th&gt;
&lt;th&gt;book_id&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;Hary&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;Mary&lt;/td&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;br /&gt;
In a graph database like OrientDB, we can define the relationships in a more semantic way. Graph databases operate on 3 structures: Vertex (sometimes called Node), Edge (or Arc) and Property (sometimes called Attribute).&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;&lt;b&gt;Vertex&lt;/b&gt;. It’s data: Author,  Book etc&lt;/li&gt;
&lt;li&gt;&lt;b&gt;Edge&lt;/b&gt; is a physical relation between Vertices. Each Edge connects two different vertices, no more, no less. Additionally, an Edge has a label and a direction, so if you label your edge as Bought you know that Hary bought the book The Shining. The direction of the relationship can be either Out or In.&lt;/li&gt;
&lt;li&gt;&lt;b&gt;Property&lt;/b&gt; - it’s a value related to Vertex or Edge.&lt;/li&gt;
&lt;/ul&gt;
OrientDB comes with an &lt;a href=&quot;https://github.com/orientechnologies/orientdb-etl/wiki&quot;&gt;ETL&lt;/a&gt; tool to import data. Also, you can use the libraries and write your own code to create nodes in the database. A generic framework for graph databases is available. More on &lt;a href=&quot;http://tinkerpop.incubator.apache.org/&quot;&gt;Apache TinkerPop&lt;/a&gt; later.&lt;br /&gt;
You have to define configuration files for loading certain data into the graph store.&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/dotsomeone/0fa4bf35d482eb4f89afb8067b8ec835.js&quot;&gt;&lt;/script&gt;

In the above sample configuration, you are defining,&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;“source”: { “file”: { “path”: “csv file location” } } // &lt;em&gt;the source of file input for a model/entity&lt;/em&gt;&lt;/li&gt;
&lt;li&gt;in transformer
&lt;ul&gt;
&lt;li&gt;vertex as the model or table&lt;/li&gt;
&lt;li&gt;edge will define the edges in and out of the table&lt;/li&gt;
&lt;li&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;In the loader definition we define all the entities and constraints&lt;/li&gt;
&lt;/ul&gt;
More about the transformation definition can be read &lt;a href=&quot;http://orientdb.com/docs/last/ETL-Introduction.html&quot;&gt;here&lt;/a&gt;&lt;br /&gt;
Import the csv files and configuration from the &lt;a href=&quot;https://github.com/dotsomeone/orientdb-test/tree/master/samples/bookstore&quot;&gt;github&lt;/a&gt; repo. Please change the location of the files and conf according to your environment.&lt;br /&gt;
&lt;br /&gt;
Simply execute the &lt;b&gt;oetl.sh&lt;/b&gt; tool from $ORIENTDB_HOME as  sh oetl.sh ‘location of conf file’&lt;br /&gt;
&lt;br /&gt;
&lt;img alt=&quot;loading author data&quot; height=&quot;88&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh_Fo6z2rqGICTsIsE_HgAc6DMC0GkTVEAl5Sd7M_4VQLfQvopK75dJXE26yixnlSV6Qj-BnrMQ8w4V0-dqG6RzPr8I-_zptMuZzArUUpaLWaFTB6uGXBNVBsTTX_Ly74FRUihqpLuvFrY/w995-h138-no/&quot; width=&quot;640&quot; /&gt;&lt;br /&gt;
You have to execute all the configurations to load all the data.&lt;br /&gt;
After loading all the data you can query out and visualize them in the Orient DB’s web based console.&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhpAg-Y3gLK0EyRPM3A3OQG3kFApRa_oOKQPGNYpNjdB_qrH2SjMojxaVaKCZ5-j5Mtt3pGoghDXoe52RAHgTapAyXXBp1zxCYlIfgJhgnr5Avo8JL3cD1184LKXVvBkMh3uQuu5kvDmS4/s1600/Screen+Shot+2016-04-12+at+6.13.15+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;306&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhpAg-Y3gLK0EyRPM3A3OQG3kFApRa_oOKQPGNYpNjdB_qrH2SjMojxaVaKCZ5-j5Mtt3pGoghDXoe52RAHgTapAyXXBp1zxCYlIfgJhgnr5Avo8JL3cD1184LKXVvBkMh3uQuu5kvDmS4/s640/Screen+Shot+2016-04-12+at+6.13.15+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Here you can see the links between the entities.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;How do you find the books bought by your friends?&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;pre&gt;&lt;code&gt;select expand( both(&#39;Knows&#39;).out(&#39;Bought&#39;)) from Buyer where name = &#39;Hary&#39;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgNU4UparVLxTeB2pZA6l1F7B5UsDnhsWNdYI-lZCdYB-vTE0Tf9DLzqQbK3nv6zsuBiftwfFNfcUE79vbwrd3nXBqbHa1gNKuzEHr2lhY-ZJFpneqbGyPUFMBlHbCXKMy_OLRxzzbBk1A/s1600/Screen+Shot+2016-04-12+at+6.02.12+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;238&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgNU4UparVLxTeB2pZA6l1F7B5UsDnhsWNdYI-lZCdYB-vTE0Tf9DLzqQbK3nv6zsuBiftwfFNfcUE79vbwrd3nXBqbHa1gNKuzEHr2lhY-ZJFpneqbGyPUFMBlHbCXKMy_OLRxzzbBk1A/s640/Screen+Shot+2016-04-12+at+6.02.12+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;/code&gt;&lt;/pre&gt;
&lt;/div&gt;
</description><link>http://bytescrolls.blogspot.com/2016/04/orient-db-simple-etl-use-case-note.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh_Fo6z2rqGICTsIsE_HgAc6DMC0GkTVEAl5Sd7M_4VQLfQvopK75dJXE26yixnlSV6Qj-BnrMQ8w4V0-dqG6RzPr8I-_zptMuZzArUUpaLWaFTB6uGXBNVBsTTX_Ly74FRUihqpLuvFrY/s72-w995-h138-c-no/" height="72" width="72"/><thr:total>5</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-4718056570452964530</guid><pubDate>Fri, 04 Dec 2015 13:25:00 +0000</pubDate><atom:updated>2015-12-04T18:55:22.555+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">scala</category><category domain="http://www.blogger.com/atom/ns#">spark</category><category domain="http://www.blogger.com/atom/ns#">zeppelin</category><title>Analytics by SQL and Spark using Apache Zeppelin</title><description>&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhMgqn92RpJDwimWZtIR_L81OiolzXu2s8ny5V3-Hewzl_Tj7N8JEfsz_YjTnrUi4JhgJUG6tjQ1K44fInIPITjOSeN9dmf0t33FOq8V1L1GR58SP4NMhW41CILYwi4Wccg47o9aLjG3kD3/s1600/zeppelin-bl.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhMgqn92RpJDwimWZtIR_L81OiolzXu2s8ny5V3-Hewzl_Tj7N8JEfsz_YjTnrUi4JhgJUG6tjQ1K44fInIPITjOSeN9dmf0t33FOq8V1L1GR58SP4NMhW41CILYwi4Wccg47o9aLjG3kD3/s1600/zeppelin-bl.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
#spark #hadoop #analytics #apache #zeppelin #scala&lt;br /&gt;
&lt;br /&gt;
I was looking for a cool dashboard based query interface for analytics. I stumbled upon a cool open source project called &lt;a href=&quot;https://zeppelin.incubator.apache.org/&quot; target=&quot;_blank&quot;&gt;Apache Zeppelin&lt;/a&gt;,&lt;br /&gt;
&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
Zeppelin is a modern web-based tool for the data scientists to collaborate over large-scale data exploration and visualization projects. It is a notebook style interpreter that enable collaborative analysis sessions sharing between users. Zeppelin is independent of the execution framework itself. Current version runs on top of Apache Spark but it has pluggable interpreter APIs to support other data processing systems. More execution frameworks could be added at a later date i.e Apache Flink, Crunch as well as SQL-like backends such as Hive, Tajo, MRQL.&lt;/blockquote&gt;
&lt;br /&gt;
As their &lt;a href=&quot;https://wiki.apache.org/incubator/ZeppelinProposal&quot; target=&quot;_blank&quot;&gt;apache proposal&lt;/a&gt; mentions, it has good support for pluggable interpreters (a lot of them), i.e. you can query files, databases, hadoop etc. using this interface seamlessly. This application is easily executable on your workstation, if you want to try it out. Download it from the project site and follow the &lt;a href=&quot;https://zeppelin.incubator.apache.org/docs/0.5.5-incubating/install/install.html&quot; target=&quot;_blank&quot;&gt;installation guide&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
Run the zeppelin server daemon, and access the UI at http://localhost:8088&lt;br /&gt;
&lt;br /&gt;
We can use different interpreters in notebooks and display the results in dashboard. I was interested in plain simple SQL db, like postgre. &lt;br /&gt;
&lt;br /&gt;
Create a table named sales and insert some sample data.&lt;br /&gt;
&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
create table sales(category varchar, units integer);&lt;br /&gt;
insert into sales values(&#39;Men-Shirts&#39;, 134344);&lt;br /&gt;
insert into sales values(&#39;Men-Shoes&#39;, 56289);&lt;br /&gt;
insert into sales values(&#39;Men-Wallets&#39;, 19377);&lt;br /&gt;
insert into sales values(&#39;Men-Watches&#39;, 345673);&lt;br /&gt;
insert into sales values(&#39;Women-Shirts&#39;, 87477);&lt;br /&gt;
insert into sales values(&#39;Women-Skirts&#39;, 140533);&lt;br /&gt;
insert into sales values(&#39;Women-Shoes&#39;, 77301);&lt;br /&gt;
insert into sales values(&#39;Electronics-Mobile&#39;, 67457);&lt;br /&gt;
insert into sales values(&#39;Electronics-Tablets&#39;, 21983);&lt;br /&gt;
insert into sales values(&#39;Electronics-Accessories&#39;, 865390);&lt;/blockquote&gt;
&lt;br /&gt;
Create a notebook,&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgNzLkWJY1CbKyNOSCPzB7lnZjSSDqQ2Ce4vdWmMp4eMCh1uQP-dzAVDOu9Q784-Su9RsNt8lHg114bZFdxF7qA9WYWGpOuPtjl63zbPQOqPWVzyrOmnK1A7lHT1WXdlTjBczf51dRqC-c/s1600/Screen+Shot+2015-12-04+at+4.28.12+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;56&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgNzLkWJY1CbKyNOSCPzB7lnZjSSDqQ2Ce4vdWmMp4eMCh1uQP-dzAVDOu9Q784-Su9RsNt8lHg114bZFdxF7qA9WYWGpOuPtjl63zbPQOqPWVzyrOmnK1A7lHT1WXdlTjBczf51dRqC-c/s640/Screen+Shot+2015-12-04+at+4.28.12+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Set up the connection properties in the psql interpreter configuration.&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjgVxEgeJqQKuwrP0q6B2iC0fIY8m_RJx5q3bv2ERRL4KxB1suyGf4evJmdTFsXo3pWfxU1beOO5r8_Hyzqw2-Ej29NmUlgeVkqbh2bsgEkEzaOAEDTyuaUokTkrrP3naJZ3oL26Bef_iQ/s1600/Screen+Shot+2015-12-04+at+4.28.27+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;233&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjgVxEgeJqQKuwrP0q6B2iC0fIY8m_RJx5q3bv2ERRL4KxB1suyGf4evJmdTFsXo3pWfxU1beOO5r8_Hyzqw2-Ej29NmUlgeVkqbh2bsgEkEzaOAEDTyuaUokTkrrP3naJZ3oL26Bef_iQ/s640/Screen+Shot+2015-12-04+at+4.28.27+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
and run with %psql interpreter. In the notebook, type in,&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
%psql&amp;nbsp; select * from sales&lt;/blockquote&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhFrqZPv3WRAy6oPIkA5_6uaRH2XX5grGmkhb5yVkazieTgiqABzgWZa7fU6PIQg7Aeqt0wGLM6mj0jV1_yNc9f6BeKTGZb54cb5mZKnFv2GoQJwXD9nqvSNNGQLt6A4MI7gfKFZj-ikL8/s1600/Screen+Shot+2015-12-04+at+6.23.30+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;324&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhFrqZPv3WRAy6oPIkA5_6uaRH2XX5grGmkhb5yVkazieTgiqABzgWZa7fU6PIQg7Aeqt0wGLM6mj0jV1_yNc9f6BeKTGZb54cb5mZKnFv2GoQJwXD9nqvSNNGQLt6A4MI7gfKFZj-ikL8/s640/Screen+Shot+2015-12-04+at+6.23.30+PM.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
You have the dashboard ready. You can share the graph as a link and schedule the notebook to run periodically.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhlc_oItdrnGxO86UKWPK2igfq5hg1ydP0CB6HRkno1oQwMA4xVVWlmB8lVCi6YzBbnLgN3618dqZTOcnW-0vGbf_QBplSShhfFlpEEQUzzO-jFO63H7SyqhKyrHaLS1elju-SffborPok/s1600/Screen+Shot+2015-12-04+at+5.31.34+PM.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;124&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhlc_oItdrnGxO86UKWPK2igfq5hg1ydP0CB6HRkno1oQwMA4xVVWlmB8lVCi6YzBbnLgN3618dqZTOcnW-0vGbf_QBplSShhfFlpEEQUzzO-jFO63H7SyqhKyrHaLS1elju-SffborPok/s1600/Screen+Shot+2015-12-04+at+5.31.34+PM.png&quot; width=&quot;840&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Then I decided to use the spark code. As it supports a jdbc source, use that in the spark context. In Spark, JdbcRDD can be used to connect with a relational data source. RDDs are a unit of compute and storage in Spark but lack any information about the structure of the data, i.e. the schema. Dataframes combine RDDs with a schema. To support postgre as a source, you need the driver loaded to execute the queries or build the schema. Copy the driver to &lt;i&gt;$ZEPPELIN_HOME/interpreter/spark&lt;/i&gt; and restart the daemon. If you don&#39;t do this, you will not be able to source postgre and may get jdbc connection errors like &quot;No suitable driver found&quot; etc.&lt;br /&gt;
&lt;br /&gt;
Use the notebook to provide the spark code,&lt;br /&gt;
&lt;br /&gt;
In the %sql (to be noted, it&#39;s not %psql) interpreter provide,&lt;br /&gt;
&lt;br /&gt;
&lt;blockquote&gt;
%sql select * from sales&lt;/blockquote&gt;
&lt;br /&gt;
You have to schedule the %sql notebook only and the dashboard is updated based on the data inserts when the cron job is &lt;a href=&quot;http://www.quartz-scheduler.org/documentation/quartz-1.x/tutorials/crontrigger&quot; target=&quot;_blank&quot;&gt;triggered&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;</description><link>http://bytescrolls.blogspot.com/2015/12/analytics-by-sql-and-spark-using-apache.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhMgqn92RpJDwimWZtIR_L81OiolzXu2s8ny5V3-Hewzl_Tj7N8JEfsz_YjTnrUi4JhgJUG6tjQ1K44fInIPITjOSeN9dmf0t33FOq8V1L1GR58SP4NMhW41CILYwi4Wccg47o9aLjG3kD3/s72-c/zeppelin-bl.png" height="72" width="72"/><thr:total>2</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-1535928072494861350</guid><pubDate>Thu, 10 Sep 2015 12:34:00 +0000</pubDate><atom:updated>2015-09-10T18:04:04.588+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">json</category><category domain="http://www.blogger.com/atom/ns#">scala</category><title>Json parsing, Scala way</title><description>&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjszJvPOM1MSVFNStCZ8xgLiZPUkkRWPvWYKflPpZkhNOXsFcHMbSs9Lni47Tj4RCpgu-L18w9at0uwl6R5AV9_qXczQ2FXTE2c_cxlZwjxNUe10bDs77py8pa3bLn1XAo3ua9K9kj3dGc/s1600/scala-api.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;145&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjszJvPOM1MSVFNStCZ8xgLiZPUkkRWPvWYKflPpZkhNOXsFcHMbSs9Lni47Tj4RCpgu-L18w9at0uwl6R5AV9_qXczQ2FXTE2c_cxlZwjxNUe10bDs77py8pa3bLn1XAo3ua9K9kj3dGc/s320/scala-api.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Most java developers are familiar with json parsing and object mapping using &lt;a href=&quot;http://wiki.fasterxml.com/JacksonHome&quot; target=&quot;_blank&quot;&gt;Jackson library&lt;/a&gt;&#39;s &lt;a href=&quot;http://fasterxml.github.io/jackson-databind/javadoc/2.5/com/fasterxml/jackson/databind/ObjectMapper.html&quot; target=&quot;_blank&quot;&gt;object mapper &lt;/a&gt;functionality that enables serializing POJOs to json string and back. In scala, using the play&#39;s json &lt;a href=&quot;http://mandubian.com/2012/11/11/JSON-inception/&quot; target=&quot;_blank&quot;&gt;inception&lt;/a&gt; mechanism provides a subtle way to serialize json. Using the powerful &lt;a href=&quot;http://docs.scala-lang.org/overviews/macros/overview.html&quot; target=&quot;_blank&quot;&gt;Scala macros&lt;/a&gt; (a macro is a piece of Scala code, executed at compile-time, which manipulates and modifies the AST of a Scala program, i.e. compile-time metaprogramming), it is able to introspect code at compile-time based on the Scala reflection API, access all imports and implicits in the current compile context, and generate code. This means the case classes are automatically serialized to json. Also, you can explicitly provide the path to a json key and map the value to the object&#39;s field. But, for simple case classes, that is just more boiler-plate code. Use it when we need more powerful mapping and logic for serialized fields. So how does this mapping work? The compiler will inject code into the compiled &lt;a href=&quot;http://docs.scala-lang.org/overviews/reflection/symbols-trees-types.html&quot; target=&quot;_blank&quot;&gt;scala AST&lt;/a&gt; (Abstract Syntax Tree) as the &lt;a href=&quot;http://docs.scala-lang.org/overviews/macros/paradise.html&quot; target=&quot;_blank&quot;&gt;macro-compiler&lt;/a&gt; replaces, say, Json.reads[T] by injecting into the compile chain and eventually writes out the code for mapping fields in json to object. 
Internally, play&#39;s json module uses Jackson&#39;s object mapper (ref: play.api.libs.json.jackson.JacksonJson).&amp;nbsp;&lt;/div&gt;
&lt;br /&gt;
You can add dependency in build.sbt in a minimal-scala project which will provide Json APIs from play framework -&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&amp;nbsp; &quot;com.typesafe.play&quot; %% &quot;play-ws&quot; % &quot;2.4.2&quot; withSources()&lt;/blockquote&gt;
&lt;br /&gt;
For example, if we have two classes (case classes, in this case),&lt;br /&gt;
&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
case class Region(name: String, state: Option[String])&lt;br /&gt;
case class Sales(count: Int, region: Region)&lt;/blockquote&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
You have to add the implicit &amp;nbsp;methods for reading and writing to and from json and objects. The methods marked implicit will be inserted for you by the compiler and type is inferred from the context. Any compilation will fail if no implicit value of the right type is available in scope.&lt;/div&gt;
&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
implicit val readRegion = Json.reads[Region]&lt;br /&gt;
implicit val readSales = Json.reads[Sales]&lt;br /&gt;
implicit val writeRegion = Json.writes[Region]&lt;br /&gt;
implicit val writeSales = Json.writes[Sales]&lt;/blockquote&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
If you interchange the order of readRegion and readSales, you will get a compilation error. As the compiler creates a Reads[T] by resolving case class fields &amp;amp; required implicits at COMPILE-time, if any missing implicit is discovered, the compiler will break with the corresponding error.&lt;/div&gt;
&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&amp;nbsp;Error:(12, 38) No implicit format for test.Region available.&lt;br /&gt;
&amp;nbsp; &amp;nbsp;implicit val readSales = Json.reads[Sales]&lt;/blockquote&gt;
&amp;nbsp; &lt;br /&gt;
&lt;br /&gt;
An interesting method to try while converting json to object is the validate() method, which will help to pinpoint the path of the error.&lt;br /&gt;
&lt;br /&gt;
Executing the following program:&lt;br /&gt;
&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/devhary/091e9556aa1b7c0d67b2.js&quot;&gt;&lt;/script&gt;

&lt;br /&gt;
&lt;br /&gt;
Results:&lt;br /&gt;
&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
This is testing json..&lt;br /&gt;
Test 1&lt;br /&gt;
-------&lt;br /&gt;
Result:Some(Sales(123,Region(West,None)))&lt;br /&gt;
Test 2&lt;br /&gt;
-------&lt;br /&gt;
Error at JsPath: /region/name&lt;br /&gt;
error.path.missing&lt;br /&gt;
()&lt;br /&gt;
Result:None&lt;br /&gt;
Test 3&lt;br /&gt;
------&lt;br /&gt;
Error at JsPath: /count&lt;br /&gt;
error.expected.jsnumber&lt;br /&gt;
Error at JsPath: /region/name&lt;br /&gt;
error.expected.jsstring&lt;br /&gt;
()&lt;br /&gt;
Result:None&lt;br /&gt;
Test 4&lt;br /&gt;
------&lt;br /&gt;
Result:{&quot;count&quot;:123,&quot;region&quot;:{&quot;name&quot;:&quot;West&quot;,&quot;state&quot;:&quot;California&quot;}}&lt;br /&gt;
Process finished with exit code 0&lt;/blockquote&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;</description><link>http://bytescrolls.blogspot.com/2015/09/json-parsing-scala-way.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjszJvPOM1MSVFNStCZ8xgLiZPUkkRWPvWYKflPpZkhNOXsFcHMbSs9Lni47Tj4RCpgu-L18w9at0uwl6R5AV9_qXczQ2FXTE2c_cxlZwjxNUe10bDs77py8pa3bLn1XAo3ua9K9kj3dGc/s72-c/scala-api.png" height="72" width="72"/><thr:total>2</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-8120047376639120073</guid><pubDate>Sun, 05 May 2013 11:32:00 +0000</pubDate><atom:updated>2013-05-05T17:02:49.964+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">hive</category><title>Simple metastore creation for Hive in MySQL</title><description>&lt;br /&gt;
For Hive, the&amp;nbsp;meta-store&amp;nbsp;is like the system catalog which contains metadata about the tables stored in Hive. This metadata is specified during table creation and reused every time the table is referenced in HiveQL. The database is a namespace for tables, where ‘default’ is used for tables with no user supplied database name. The metadata for a table contains the list of columns and their types, owner, storage and SerDe information (which I can detail in future posts). It can also contain any user supplied key and value data, which can be used for table statistics. Storage information includes location of the table’s data in the underlying file system, data formats and bucketing information. SerDe (which controls how Hive serializes/deserializes the data in a row) metadata includes the implementation class of serializer and deserializer methods and any supporting information required by that implementation. The partitions can have their own columns and SerDe and storage information which can be used in the future to evolve Hive schema. The metastore uses either a traditional relational database (like MySQL, Oracle) or file system (and not HDFS, since it is optimized for&amp;nbsp;sequential&amp;nbsp;scans only); thus HiveQL statements which only access metadata objects are executed with low latency.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
It&#39;s simple to install the metastore.&lt;br /&gt;
&lt;br /&gt;
-install mysql-connector&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
$ sudo yum install mysql-connector-java&lt;/blockquote&gt;
-create a symbolic link in the Hive 
directory&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
$ ln -s /usr/share/java/mysql-connector-java.jar /usr/lib/hive/lib/mysqlconnector-java.jar&lt;/blockquote&gt;
&lt;br /&gt;
-create the database for the Hive metastore. cdh4 ships with scripts for derby, mysql, oracle and postgre&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
$ mysql -u root -p&lt;br /&gt;
mysql&amp;gt; CREATE DATABASE hivemetastoredb;&lt;br /&gt;
mysql&amp;gt; USE hivemetastoredb;&lt;br /&gt;
mysql&amp;gt; SOURCE /usr/lib/hive/scripts/metastore/upgrade/mysql/hive-schema-0.9.0.mysql.sql;&lt;/blockquote&gt;
&lt;br /&gt;
-create a user for the metastore &lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
mysql&amp;gt;CREATE USER &#39;hive&#39;@&#39;%&#39; IDENTIFIED BY &#39;hive&#39;;&lt;/blockquote&gt;
&lt;br /&gt;
-grant access for all hosts in the network&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
mysql&amp;gt; GRANT ALL PRIVILEGES ON hivemetastoredb.* TO hive@&#39;&amp;lt;ip-mask&amp;gt;%&#39; WITH GRANT OPTION;&lt;br /&gt;
mysql&amp;gt; FLUSH PRIVILEGES;&lt;/blockquote&gt;
&lt;br /&gt;
Add the following&amp;nbsp;entries&amp;nbsp;in&amp;nbsp;the&amp;nbsp;file&lt;span class=&quot;Apple-tab-span&quot; style=&quot;white-space: pre;&quot;&gt; &lt;/span&gt;/etc/hive/conf/hive-site.xml, if you are trying a jdbc connection&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&amp;lt;property&amp;gt;&lt;br /&gt;
&amp;lt;name&amp;gt;javax.jdo.option.ConnectionURL&amp;lt;/name&amp;gt;&lt;br /&gt;
&amp;lt;value&amp;gt;jdbc:mysql://localhost/hivemetastoredb&amp;lt;/value&amp;gt;&lt;br /&gt;
&amp;lt;/property&amp;gt;&lt;br /&gt;
&amp;lt;property&amp;gt;&lt;br /&gt;
&amp;lt;name&amp;gt;javax.jdo.option.ConnectionDriverName&amp;lt;/name&amp;gt;&lt;br /&gt;
&amp;lt;value&amp;gt;com.mysql.jdbc.Driver&amp;lt;/value&amp;gt;&lt;br /&gt;
&amp;lt;/property&amp;gt;&lt;br /&gt;
&amp;lt;property&amp;gt;&lt;br /&gt;
&amp;lt;name&amp;gt;javax.jdo.option.ConnectionUserName&amp;lt;/name&amp;gt;&lt;br /&gt;
&amp;lt;value&amp;gt;hive&amp;lt;/value&amp;gt;&lt;br /&gt;
&amp;lt;/property&amp;gt;&lt;br /&gt;
&amp;lt;property&amp;gt;&lt;br /&gt;
&amp;lt;name&amp;gt;javax.jdo.option.ConnectionPassword&amp;lt;/name&amp;gt;&lt;br /&gt;
&amp;lt;value&amp;gt;hive&amp;lt;/value&amp;gt;&lt;br /&gt;
&amp;lt;/property&amp;gt;&lt;br /&gt;
&amp;lt;property&amp;gt;&lt;br /&gt;
&amp;lt;name&amp;gt;datanucleus.autoCreateSchema&amp;lt;/name&amp;gt;&lt;br /&gt;
&amp;lt;value&amp;gt;false&amp;lt;/value&amp;gt;&lt;br /&gt;
&amp;lt;/property&amp;gt;&lt;br /&gt;
&amp;lt;property&amp;gt;&lt;br /&gt;
&amp;lt;name&amp;gt;datanucleus.fixedDatastore&amp;lt;/name&amp;gt;&lt;br /&gt;
&amp;lt;value&amp;gt;true&amp;lt;/value&amp;gt;&lt;br /&gt;
&amp;lt;/property&amp;gt;&lt;/blockquote&gt;
&lt;br /&gt;
&lt;br /&gt;</description><link>http://bytescrolls.blogspot.com/2013/05/simple-metastore-creation-for-hive-in.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>5</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-464704610320775785</guid><pubDate>Tue, 18 Dec 2012 18:32:00 +0000</pubDate><atom:updated>2012-12-19T00:02:49.638+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">bigdata</category><category domain="http://www.blogger.com/atom/ns#">cloud computing</category><category domain="http://www.blogger.com/atom/ns#">data integration</category><title>Data and Brain</title><description>&lt;p&gt;#bigdata&lt;/p&gt;
&lt;p&gt;Came across an interesting presentation on Using Data to Understand the Brain.&lt;/p&gt;
&lt;p&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;iframe width=&quot;427&quot; height=&quot;356&quot; scrolling=&quot;no&quot; marginheight=&quot;0&quot; marginwidth=&quot;0&quot; frameborder=&quot;0&quot; src=&quot;http://www.slideshare.net/slideshow/embed_code/12321459&quot;&gt; &lt;/iframe&gt;&lt;/p&gt;
&lt;div style=&quot;margin-bottom: 5px;&quot;&gt;&lt;strong&gt; &lt;a title=&quot;Using Data to Understand the Brain&quot; href=&quot;http://www.slideshare.net/jakehofman/using-data-to-understand-the-brain&quot; target=&quot;_blank&quot;&gt;Using Data to Understand the Brain&lt;/a&gt; &lt;/strong&gt; from &lt;strong&gt;&lt;a href=&quot;http://www.slideshare.net/jakehofman&quot; target=&quot;_blank&quot;&gt;jakehofman&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;p&gt;&lt;br /&gt; &lt;iframe width=&quot;560&quot; height=&quot;315&quot; frameborder=&quot;0&quot; src=&quot;http://www.youtube.com/embed/uPt4SzH6eeI&quot;&gt;&lt;/iframe&gt;&lt;/p&gt;
&lt;p&gt;Is it possible to read your brain? &lt;a href=&quot;http://gizmodo.com/5922208/scientists-invent-mind+reading-system-that-lets-you-type-with-your-brain&quot;&gt;hmmm &lt;/a&gt;&lt;/p&gt;
&lt;p&gt;I am a little two-faced with these riddles....&lt;/p&gt;
&lt;p&gt;&lt;br /&gt; &lt;iframe width=&quot;480&quot; height=&quot;270&quot; frameborder=&quot;0&quot; src=&quot;http://www.youtube.com/embed/VzPXtZiQXSc?fs=1&quot;&gt;&lt;/iframe&gt;&lt;/p&gt;</description><link>http://bytescrolls.blogspot.com/2012/12/data-and-brain.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://img.youtube.com/vi/uPt4SzH6eeI/default.jpg" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-5215268655157942034</guid><pubDate>Mon, 17 Dec 2012 19:19:00 +0000</pubDate><atom:updated>2012-12-18T00:49:24.911+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">algorithms</category><category domain="http://www.blogger.com/atom/ns#">distributed systems</category><category domain="http://www.blogger.com/atom/ns#">nosql</category><category domain="http://www.blogger.com/atom/ns#">programming</category><title>Eventual Consistency</title><description>#distributed #nosql

&lt;br /&gt;
&lt;iframe height=&quot;780&quot; src=&quot;http://docs.google.com/viewer?url=http%3A%2F%2Fwww.cs.brown.edu%2Fcourses%2Fcsci2950-u%2Ff10%2Fpapers%2Fp40-vogels.pdf&amp;amp;embedded=true&quot; style=&quot;border: none;&quot; width=&quot;600&quot;&gt;&lt;/iframe&gt;

&lt;br /&gt;
&lt;iframe height=&quot;780&quot; src=&quot;http://docs.google.com/viewer?url=http%3A%2F%2Fwiki.ubc.ca%2Fimages%2Fe%2Fec%2FEventualConsistency.pdf&amp;amp;embedded=true&quot; style=&quot;border: none;&quot; width=&quot;600&quot;&gt;&lt;/iframe&gt;

</description><link>http://bytescrolls.blogspot.com/2012/12/eventual-consistency.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-36199754222380298</guid><pubDate>Sun, 16 Dec 2012 20:38:00 +0000</pubDate><atom:updated>2012-12-17T02:08:53.863+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">programming</category><category domain="http://www.blogger.com/atom/ns#">unicode</category><title>Unicode features in various languages</title><description>Here’s what each language natively supports &lt;i&gt;in its standard distribution&lt;/i&gt;.&lt;br /&gt;
&lt;br /&gt;
&lt;span style=&quot;font-size: small;&quot;&gt;
&lt;table&gt;
&lt;/table&gt;
&lt;table align=&quot;center&quot; border-style=&quot;solid&quot; border=&quot;1&quot; cellpadding=&quot;4&quot; cellspacing=&quot;2&quot;&gt;

&lt;tbody&gt;
&lt;tr&gt;
&lt;th&gt;&lt;i&gt;Unicode&lt;/i&gt;&lt;/th&gt;
&lt;td&gt;&lt;span style=&quot;font-size: small;&quot;&gt;J&lt;/span&gt;avascript&lt;/td&gt;
&lt;td align=&quot;center&quot;&gt;&lt;b&gt;ᴘʜᴘ&lt;/b&gt;&lt;/td&gt;
&lt;th&gt;Go&lt;/th&gt;
&lt;td&gt;&amp;nbsp;&lt;b&gt;Ruby&lt;/b&gt;&lt;/td&gt;
&lt;td&gt;&amp;nbsp;&lt;b&gt;Python&lt;/b&gt;&lt;/td&gt;
&lt;td&gt;☕  &lt;b&gt;Java&lt;/b&gt;
&lt;/td&gt;&lt;td&gt;&amp;nbsp;&lt;b&gt;Perl&lt;/b&gt;
&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Internally&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;UCS‐2 &lt;i&gt;or&lt;/i&gt;&lt;br /&gt;
UTF‐16&lt;/td&gt;      
&lt;td align=&quot;center&quot;&gt;UTF‐8⁻&lt;/td&gt;     
&lt;td align=&quot;center&quot;&gt;UTF‐8&lt;/td&gt;      
&lt;td align=&quot;center&quot;&gt;varies&lt;/td&gt;     
&lt;td align=&quot;center&quot;&gt;UCS‐2 &lt;i&gt;or&lt;/i&gt;&lt;br /&gt;
UCS‐4&lt;/td&gt;      
&lt;td align=&quot;center&quot;&gt;UTF‐16&lt;/td&gt;     
&lt;td align=&quot;center&quot;&gt;UTF‐8⁺&lt;/td&gt;     
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Identiﬁers&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✅&lt;/span&gt;&lt;sup&gt;∓&lt;/sup&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Casefolding&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;none&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;simple&lt;/td&gt;       
&lt;td align=&quot;center&quot;&gt;simple&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;full&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;none&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;simple&lt;/td&gt;       
&lt;td align=&quot;center&quot;&gt;full&lt;/td&gt;         
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Casemapping&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;simple&lt;/td&gt;       
&lt;td align=&quot;center&quot;&gt;simple&lt;/td&gt;       
&lt;td align=&quot;center&quot;&gt;simple&lt;sup&gt;∓&lt;/sup&gt;&lt;/td&gt;       
&lt;td align=&quot;center&quot;&gt;full&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;simple&lt;/td&gt;       
&lt;td align=&quot;center&quot;&gt;full&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;full&lt;/td&gt;         
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Graphemes&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✅&lt;/span&gt;&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Normalization&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─⁺&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;&lt;/td&gt;         
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;UCA Collation&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;✔⁺&lt;/td&gt;        
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Named Characters&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✅&lt;/span&gt;&lt;/td&gt;            
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;&lt;span style=&quot;font-family: Symbola;&quot;&gt;✔&lt;/span&gt;⁺&lt;/td&gt;        
&lt;/tr&gt;
&lt;tr&gt; &lt;th&gt;Properties&lt;/th&gt;
&lt;td align=&quot;center&quot;&gt;─&lt;/td&gt;        
&lt;td align=&quot;center&quot;&gt;two&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;(non‐regex)⁻&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;three&lt;/td&gt;         
&lt;td align=&quot;center&quot;&gt;(non‐regex)⁻&lt;/td&gt;                 
&lt;td align=&quot;center&quot;&gt;two⁺&lt;/td&gt;    
&lt;td align=&quot;center&quot;&gt;every⁺&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;/span&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://www.oscon.com/oscon2012/public/schedule/detail/24252&quot; target=&quot;_blank&quot;&gt;from &lt;b&gt;Tom Christiansen Unicode Support Shootout: The Good, the Bad, the Mostly Ugly&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Grapheme &lt;/b&gt;-&amp;nbsp; A grapheme is the smallest semantically distinguishing unit in a written language, analogous to the phonemes of spoken languages.&lt;br /&gt;
&lt;br /&gt;
&lt;iframe allowfullscreen=&quot;allowfullscreen&quot; frameborder=&quot;0&quot; height=&quot;315&quot; src=&quot;http://www.youtube.com/embed/BiJR5UOBueY&quot; width=&quot;560&quot;&gt;&lt;/iframe&gt;

&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Casefolding &lt;/b&gt;- Unicode defines case folding through the three case-mapping properties of each character: uppercase, lowercase and titlecase. These properties relate all characters in scripts with differing cases to the other case variants of the character.&lt;br /&gt;
&lt;br /&gt;
&lt;iframe allowfullscreen=&quot;allowfullscreen&quot; frameborder=&quot;0&quot; height=&quot;315&quot; src=&quot;http://www.youtube.com/embed/4KMlJgGrfzQ&quot; width=&quot;560&quot;&gt;&lt;/iframe&gt;

&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Case mapping&lt;/b&gt; - is used to handle the mapping of upper-case, lower-case, and title case characters for a given language.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;What is the difference between case mapping and case folding&lt;/b&gt;?

Case mapping or case conversion is a process whereby strings are converted to a particular form—uppercase, lowercase, or titlecase—possibly for display to the user. Case folding is primarily used for caseless comparison of text, such as identifiers in a computer program, rather than actual text transformation. Case folding in Unicode is based on the lowercase mapping, but includes additional changes to the source text to help make it language-insensitive and consistent. As a result, case-folded text should be used solely for internal processing and generally should not be stored or displayed to the end user.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;table align=&quot;center&quot; cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;margin-left: auto; margin-right: auto; text-align: center;&quot;&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiViaf7LYv_PODqB8lSRaQFUCzvgUmpHtxAHtnZSd4XCRPnPZ4avEQ0UEhblFLRfihOQqE9FNjXTQTN4FEqCcpbD-Yfp-oE3dtcvm0ehspDSre0MVPWIaU_RgGk_H5U3F9pa9vVE1TjsIc/s1600/Unicode-6-Normalization.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: auto; margin-right: auto;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;213&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiViaf7LYv_PODqB8lSRaQFUCzvgUmpHtxAHtnZSd4XCRPnPZ4avEQ0UEhblFLRfihOQqE9FNjXTQTN4FEqCcpbD-Yfp-oE3dtcvm0ehspDSre0MVPWIaU_RgGk_H5U3F9pa9vVE1TjsIc/s320/Unicode-6-Normalization.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;Normalization - &lt;a href=&quot;http://wiki.secondlife.com/wiki/Unicode_In_5_Minutes&quot; target=&quot;_blank&quot;&gt;courtesy&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;br /&gt;
&lt;b&gt;Normalization &lt;/b&gt;- Unicode has encoded many entities that are really variants of existing nominal characters. The visual representations of these characters are typically a subset of the possible visual representations of the nominal character.
&lt;a href=&quot;http://www.unicode.org/reports/tr15/tr15-26.html&quot; target=&quot;_blank&quot;&gt;more &lt;/a&gt;-&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;UCA Collation&lt;/b&gt; - Collation is the general term for the process and function of determining the sorting order of strings of characters. It is a key function in computer systems; whenever a list of strings is presented to users, they are likely to want it in a sorted order so that they can easily and reliably find individual strings. Thus it is widely used in user interfaces. It is also crucial for databases, both in sorting records and in selecting sets of records with fields within given bounds. The Unicode collation algorithm (UCA) is an algorithm defined in Unicode Technical Report #10, which defines a customizable method to compare two strings. These comparisons can then be used to collate or sort text in any writing system and language that can be represented with Unicode. &lt;a href=&quot;http://www.unicode.org/reports/tr10/&quot; target=&quot;_blank&quot;&gt;more&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Named Characters&lt;/b&gt; - Unicode characters are assigned a unique Name (na). The name, in English, is composed of A-Z capitals, 0-9 digits, - (hyphen-minus) and &amp;lt;space&amp;gt;. The Unicode Standard specifies notational conventions for referring to sequences of characters (or code points) treated as a unit, using angle brackets surrounding a comma-delimited list of code points, code points plus character names, and so on. For example, both of the designations in Table 1 refer to a combining character sequence consisting of the letter “a” with a circumflex and an acute accent applied to it. &lt;a href=&quot;http://www.unicode.org/reports/tr34/&quot; target=&quot;_blank&quot;&gt;more&lt;/a&gt;&amp;nbsp; &lt;a href=&quot;http://www.unicode.org/charts/charindex.html&quot; target=&quot;_blank&quot;&gt;more&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;

&lt;br /&gt;

&lt;b&gt;Properties&lt;/b&gt; - Each Unicode character belongs to a certain category. Unicode assigns character properties to each code point. These properties can be used to handle &quot;characters&quot; (code points) in processes such as line-breaking, right-to-left script direction, or applying controls. &lt;a href=&quot;http://www.unicode.org/reports/tr44/&quot;&gt;more&lt;/a&gt;
&lt;br /&gt;
Perl looks cool!&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;/space&gt;&lt;br /&gt;
&lt;space&gt;&lt;/space&gt;&lt;br /&gt;
&lt;space&gt;&lt;iframe height=&quot;780&quot; src=&quot;http://docs.google.com/viewer?url=http%3A%2F%2F98.245.80.27%2Ftcpc%2FOSCON2011%2Fgbu.pdf&amp;amp;embedded=true&quot; style=&quot;border: none;&quot; width=&quot;600&quot;&gt;&lt;/iframe&gt;&lt;/space&gt;
&lt;br /&gt;
&lt;br /&gt;

&lt;iframe allowfullscreen=&quot;allowfullscreen&quot; frameborder=&quot;0&quot; height=&quot;315&quot; src=&quot;http://www.youtube.com/embed/74Ls6CmJJSE&quot; width=&quot;420&quot;&gt;&lt;/iframe&gt;</description><link>http://bytescrolls.blogspot.com/2012/12/unicode-features-in-various-languages.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://img.youtube.com/vi/BiJR5UOBueY/default.jpg" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-1886440465376439322</guid><pubDate>Tue, 31 Jul 2012 19:11:00 +0000</pubDate><atom:updated>2012-08-01T00:42:58.031+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">big data</category><title>Machine generated data</title><description>At first, the term &quot;&lt;a href=&quot;http://en.wikipedia.org/wiki/Machine-generated_data&quot;&gt;machine-generated&amp;nbsp; data&lt;/a&gt;&quot; can be confusing. One would think,&amp;nbsp; &lt;a href=&quot;http://www.guardian.co.uk/news/datablog/2010/jul/16/data-plural-singular&quot;&gt;every data is (or are?)&amp;nbsp;&lt;/a&gt; generated from one device or another is provided by an innocent mortal in this so called era of social media and big data. Then, there should be a clear distinction to such definitions. If a user enters some data in a form, then it is not considered machine generated. At the same time, the same application can track the user&#39;s location and log it in a remote server. That becomes machine-generated data.&lt;br /&gt;
&lt;br /&gt;
Wikipedia says,&lt;br /&gt;
&lt;br /&gt;
&lt;i&gt;&lt;b&gt;Machine-generated data&lt;/b&gt; (MGD) is the generic term for &lt;a href=&quot;http://en.wikipedia.org/wiki/Information&quot; title=&quot;Information&quot;&gt;information&lt;/a&gt; which was automatically created from a &lt;a class=&quot;mw-redirect&quot; href=&quot;http://en.wikipedia.org/wiki/Computer_process&quot; title=&quot;Computer process&quot;&gt;computer process&lt;/a&gt;, &lt;a class=&quot;mw-redirect&quot; href=&quot;http://en.wikipedia.org/wiki/Computer_application&quot; title=&quot;Computer application&quot;&gt;application&lt;/a&gt;, or other machine without the intervention of a human.&amp;nbsp;&lt;/i&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;i&gt;&lt;/i&gt;&lt;br /&gt;
According to Monash Research,&lt;br /&gt;
&lt;br /&gt;
&lt;i&gt;In classical &lt;b&gt;human-generated data,&lt;/b&gt; what’s recorded is the &lt;b&gt;direct result of human choices. &lt;/b&gt;Somebody
 buys something, makes an inquiry about it, fills an order from 
inventory, makes a payment in return for the object, makes a bank 
deposit to have funds for the next purchase, or promotes a manager who’s
 been particularly successful at selling stuff. Database updates ensue. 
Computers memorialize these human actions more quickly and cheaply than 
humans carry them out. Plenty of difficulties can occur with that kind 
of automation — applications are commonly too inflexible or confusing — 
but keeping up with data volumes is generally the least of the problems.&lt;/i&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
So what are they? Are they streams of logs flowing through the information super waterway?&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;http://upload.wikimedia.org/wikipedia/commons/thumb/5/5b/Joensuun_kanava2.jpg/320px-Joensuun_kanava2.jpg&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;http://upload.wikimedia.org/wikipedia/commons/thumb/5/5b/Joensuun_kanava2.jpg/320px-Joensuun_kanava2.jpg&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Maybe, until they are churned into some books or toilet rolls!&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://www.webopedia.com/TERM/A/application_log.html&quot;&gt;Application Logs&lt;/a&gt; - Logs generated by web or desktop applications. The server side logs used for debugging and support tickets!&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://en.wikipedia.org/wiki/Call_detail_record&quot;&gt;Call Detail Records&lt;/a&gt; - The ones recorded by your telecom company. They contain useful details of the call or service that passed through the switch, such as the phone numbers of the call, its duration, etc. Needed for billing.&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://httpd.apache.org/docs/2.2/logs.html&quot;&gt;Web logs&lt;/a&gt; - used to count visitors and for similar web analytics done on this data &lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://docs.oracle.com/cd/B10501_01/server.920/a96521/audit.htm&quot;&gt;Database Audit Logs&lt;/a&gt; - Enable auditing to audit for suspicious database activity, it is common that not much information is available to target specific users or schema objects&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://en.wikipedia.org/wiki/Event_Viewer&quot;&gt;OS logs&lt;/a&gt; - track crashes or errors &lt;br /&gt;
&lt;br /&gt;
There is much similar data generated by different applications and systems like RFIDs, sensors etc. These messages can then be mashed up. Machine data will have a structure or format, and semantics based on the domain it relies on.&lt;br /&gt;
&lt;br /&gt;
The growth of such data is fast and continuous. It is a stream of data and, like history, it is not changed afterwards. It is like a record of events.&lt;br /&gt;
&lt;br /&gt;
&lt;table align=&quot;center&quot; cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;margin-left: auto; margin-right: auto; text-align: center;&quot;&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgYejrqvIWjZEms_JOiP1k9Cs_llo-7qiER95t1Jwj1QPXAkpRaYqcQE0vopnWZOoNVMOAHYHU7hGs7Nkl1grswQBxJvlRVdjzbvUWkbSG1iQjC5bUyRBoavVgdhEieDEyf35tmz4ObOf8/s1600/Machine-Generated-Data.jpg&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: auto; margin-right: auto;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;164&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgYejrqvIWjZEms_JOiP1k9Cs_llo-7qiER95t1Jwj1QPXAkpRaYqcQE0vopnWZOoNVMOAHYHU7hGs7Nkl1grswQBxJvlRVdjzbvUWkbSG1iQjC5bUyRBoavVgdhEieDEyf35tmz4ObOf8/s320/Machine-Generated-Data.jpg&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;courtesy- &lt;a href=&quot;http://www.infobright.org/images/uploads/blogs/community/Machine-Generated-Data.jpg&quot;&gt;link&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;Anyone tried &lt;a href=&quot;https://github.com/petewarden/iPhoneTracker&quot;&gt;iPhonetracker&lt;/a&gt;?&lt;br /&gt;
&lt;br /&gt;
&lt;table align=&quot;center&quot; cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;margin-left: auto; margin-right: auto; text-align: center;&quot;&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiVcyjzferQ2YUrAKYkGknuYHX9O9pXM2QCIW8v7aD9ByqTBsubnVC8hj78OISmN3diHEa-8Dlo6jBxsMgPBfgd6n2wc81vruTmGbbWhILOSpzKN7F6_pRNaQgANJMQuqUFInG8pRG6tH8/s1600/iPhoneTracker-2.jpg&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: auto; margin-right: auto;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;237&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiVcyjzferQ2YUrAKYkGknuYHX9O9pXM2QCIW8v7aD9ByqTBsubnVC8hj78OISmN3diHEa-8Dlo6jBxsMgPBfgd6n2wc81vruTmGbbWhILOSpzKN7F6_pRNaQgANJMQuqUFInG8pRG6tH8/s320/iPhoneTracker-2.jpg&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;courtesy- &lt;a href=&quot;http://www.coolinfographics.com/blog/2011/8/18/the-power-of-data-visualization-iphone-tracking.html&quot;&gt;link&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;br /&gt;
&lt;a href=&quot;http://dev.w3.org/geo/api/spec-source.html&quot;&gt;Geolocation &lt;/a&gt;and LBS do push a load of data. HTML5 does have geolocation functionality (even though you have the choice not to be tracked). Following is some sample code to test it.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/3219304.js?file=geo.html&quot;&gt;
&lt;/script&gt;</description><link>http://bytescrolls.blogspot.com/2012/08/machine-generated-data.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgYejrqvIWjZEms_JOiP1k9Cs_llo-7qiER95t1Jwj1QPXAkpRaYqcQE0vopnWZOoNVMOAHYHU7hGs7Nkl1grswQBxJvlRVdjzbvUWkbSG1iQjC5bUyRBoavVgdhEieDEyf35tmz4ObOf8/s72-c/Machine-Generated-Data.jpg" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-2688981492982817834</guid><pubDate>Mon, 23 Jul 2012 14:11:00 +0000</pubDate><atom:updated>2012-07-23T19:41:42.441+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">cloud computing</category><category domain="http://www.blogger.com/atom/ns#">hadoop</category><title>Nodeable - Realtime Insights</title><description>&lt;a href=&quot;http://www.nodeable.com/&quot;&gt;#Nodeable&lt;/a&gt; is a good example of generating #insights from #bigdata or the real time trickle feeds. It uses&lt;a href=&quot;https://github.com/nathanmarz/storm/&quot;&gt; Twitter&#39;s Storm&lt;/a&gt; for the processing engine Stream reduce. I signed up for a trial account to play around.&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEglZaCSKg4iRHZXyQy0g2aCvtftq0Bav33g0bnHRS9jQSTCar2anWdoSo6sVFeXdOpBfbLRrEM9Fkz7pQ-2HDabTJilCkNI2nzoa2sLxS5Msnitl7cP95W4HChXzewCNhtVCCRwOWTcQsI/s1600/streamreduce.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;243&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEglZaCSKg4iRHZXyQy0g2aCvtftq0Bav33g0bnHRS9jQSTCar2anWdoSo6sVFeXdOpBfbLRrEM9Fkz7pQ-2HDabTJilCkNI2nzoa2sLxS5Msnitl7cP95W4HChXzewCNhtVCCRwOWTcQsI/s320/streamreduce.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
The insights like &quot;Most Active&quot; metrics are generated for &lt;a href=&quot;http://aws.amazon.com/&quot;&gt;Amazon Web services&lt;/a&gt; status. The reports are generated and tagged in real time. The twitter follow counts are displayed.&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiOS86pr6O8qhk5rmbHIIbOSOd6FaeEkPPnPEvgZ1NizfqM0eIX5hZVHm8bkb4SuR0eosan0a08GPfEIXHWMxahpKXRLQeBDHKPtK3CTDBfOcKGTgJgOOVUSpcFh1pnx0hCSsZJ-KmfdYo/s1600/home.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;229&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiOS86pr6O8qhk5rmbHIIbOSOd6FaeEkPPnPEvgZ1NizfqM0eIX5hZVHm8bkb4SuR0eosan0a08GPfEIXHWMxahpKXRLQeBDHKPtK3CTDBfOcKGTgJgOOVUSpcFh1pnx0hCSsZJ-KmfdYo/s320/home.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
It has only some basic set of connectors, but one can create custom connectors using its JSON Schema. The outbound data can be pushed to your own Amazon s3 or &lt;a href=&quot;http://hadoop.apache.org/common/docs/r1.0.3/webhdfs.html&quot;&gt;Hadoop WebHDFS&lt;/a&gt;, which is good for private companies.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjR5INQdqDlgftWk-PASNZhXiq33_8nNgBu8GN2YKNrr_RlIkU2HB8HRKECqyrwe1TBN7J1l3odecn4Iq91v0Cj-hvp5v1VM8ICWJA0OTujGvv7y_bH11iBScmzHPTEUK3GfnqrVnQaKio/s1600/connector.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;215&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjR5INQdqDlgftWk-PASNZhXiq33_8nNgBu8GN2YKNrr_RlIkU2HB8HRKECqyrwe1TBN7J1l3odecn4Iq91v0Cj-hvp5v1VM8ICWJA0OTujGvv7y_bH11iBScmzHPTEUK3GfnqrVnQaKio/s320/connector.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
The github/rss stream is shown as an activity stream.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhyZ_k1ngtYBn5rUSP3amsLLXKBk8gL3KpE-uPUA2vtUZXJHPvVR2nWeXooR-pDSkSERSBIQmQdrYhESY3sJIGxV_sQKhctZFUhWDPjn3I13Myilg8NFeXSYitXU-Gow76AEuwUjNfUB_s/s1600/github.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;214&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhyZ_k1ngtYBn5rUSP3amsLLXKBk8gL3KpE-uPUA2vtUZXJHPvVR2nWeXooR-pDSkSERSBIQmQdrYhESY3sJIGxV_sQKhctZFUhWDPjn3I13Myilg8NFeXSYitXU-Gow76AEuwUjNfUB_s/s320/github.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Sharing an interesting presentation of&amp;nbsp; Storm real-time computation.&lt;iframe allowfullscreen=&quot;&quot; frameborder=&quot;0&quot; height=&quot;375&quot; mozallowfullscreen=&quot;&quot; src=&quot;http://player.vimeo.com/video/40972420?title=0&amp;amp;byline=0&amp;amp;portrait=0&quot; webkitallowfullscreen=&quot;&quot; width=&quot;500&quot;&gt;&lt;/iframe&gt; &lt;br /&gt;
&lt;a href=&quot;http://vimeo.com/40972420&quot;&gt;ETE 2012 - Nathan Marz on Storm&lt;/a&gt; from &lt;a href=&quot;http://vimeo.com/chariottechcast&quot;&gt;Chariot Solutions&lt;/a&gt; on &lt;a href=&quot;http://vimeo.com/&quot;&gt;Vimeo&lt;/a&gt;.</description><link>http://bytescrolls.blogspot.com/2012/07/nodeable-realtime-insights.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEglZaCSKg4iRHZXyQy0g2aCvtftq0Bav33g0bnHRS9jQSTCar2anWdoSo6sVFeXdOpBfbLRrEM9Fkz7pQ-2HDabTJilCkNI2nzoa2sLxS5Msnitl7cP95W4HChXzewCNhtVCCRwOWTcQsI/s72-c/streamreduce.png" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-447262574394719167</guid><pubDate>Fri, 20 Apr 2012 18:08:00 +0000</pubDate><atom:updated>2012-04-20T23:38:25.661+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">hadoop</category><title>Hadoop meetup @inmobi Bangalore</title><description>Had a chance to attend the #hadoop #meetup today at #&lt;a href=&quot;http://www.inmobi.com/&quot;&gt;Inmobi &lt;/a&gt;Bangalore.&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;&lt;a href=&quot;http://people.apache.org/%7Eacmurthy/ArunCMurthy.html&quot;&gt;Arun Murthy&lt;/a&gt; and &lt;a href=&quot;http://www.linkedin.com/in/sureshsri&quot;&gt;Suresh Srinivasan&lt;/a&gt; from &lt;a href=&quot;http://hortonworks.com/&quot;&gt;Hortonworks &lt;/a&gt;made presentations on next gen Hadoop and HDFS Namenode High Availability respectively.&lt;br /&gt;
&lt;br /&gt;
From Inmobi, they had presentations on Real time analytics done on HBase and &lt;a href=&quot;https://github.com/sriksun/Ivory&quot;&gt;Ivory&lt;/a&gt;, an opensource&amp;nbsp; feed processing platform by &lt;a href=&quot;http://www.linkedin.com/in/sriksun&quot;&gt;Srikanth&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjM6w-p2atq4Z6AzBhorIe1UnX-3KN6aG9BmIsY2PC6LYqvauEpp38VZWBiJPsH976jV4rFIjWYY6Vn7i9ftX7-ep5kaAdbqaLphRgdEcJ-Jub8lJ80TefGtr-f5xRuydNarxJ2vY9QdHg/s1600/001.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEimoBhRBPhdqvIZXZbWLOJwpQ7s8oPgNzY26vEOE-rTPpq5OyShgQf_DRDNInJQHZ-09Jl5Z_c811Qo0hz4qo3kcJiKWnyY9PdL3U7Ovayjv2ZodqLNsI3fmHloF652BgFVUOaSgrvAVu8/s1600/002.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEimnmNciSuKLA7ykBGFs65u4yEE1u3aAbUlOSMpR4XfjeqX0zQrrsfSsex8214i4frhaF0dIMsRdltMz5bQTP16QOihaWofC3z9xG5LsfSvWfO8mNH1vfCWfpNcyLyQ_BIgj6dyiQmxwE0/s1600/003.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;320&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEimnmNciSuKLA7ykBGFs65u4yEE1u3aAbUlOSMpR4XfjeqX0zQrrsfSsex8214i4frhaF0dIMsRdltMz5bQTP16QOihaWofC3z9xG5LsfSvWfO8mNH1vfCWfpNcyLyQ_BIgj6dyiQmxwE0/s320/003.png&quot; width=&quot;307&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhEbScQpWwGdxOVFDvimNe0ippEaEw7uKbOjtW_I_P4013SExXZ2V2hW4MdKqDlsayGoM_BUsul8rpPRa11_QkSE8HO8eAnqZggDmkSW9SoeL7DBLtC7DkxcwPbwHGtCF1eHp6fuBHNZaU/s1600/004.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;268&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhEbScQpWwGdxOVFDvimNe0ippEaEw7uKbOjtW_I_P4013SExXZ2V2hW4MdKqDlsayGoM_BUsul8rpPRa11_QkSE8HO8eAnqZggDmkSW9SoeL7DBLtC7DkxcwPbwHGtCF1eHp6fuBHNZaU/s320/004.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhriY7gizUdce9gy4y5HupqjAd40gdfnc2_auz6Cu-CitMWoetq8f_YgdY_n0yFhaDN5Z8HqPmWbhUUr-JzMbQPo-fiO_pmGdWpDnKjjOfELwU4oIVYarZS7Hgl1r_pVZsTcSR6vY__Csg/s1600/005.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;320&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhriY7gizUdce9gy4y5HupqjAd40gdfnc2_auz6Cu-CitMWoetq8f_YgdY_n0yFhaDN5Z8HqPmWbhUUr-JzMbQPo-fiO_pmGdWpDnKjjOfELwU4oIVYarZS7Hgl1r_pVZsTcSR6vY__Csg/s320/005.png&quot; width=&quot;309&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg039tolqnpMjo5fk9b5sLyKVXe1uxNPT2R7x4MSTJ4vaMmOhoPxGns-ANEbDbWZ8JpDqLGk_VMwLhEEKgEuDZUJjBfuVLHOfk24OFk0XeoE5DPS90oOWeAhBlUPM5CIw8b0-fGj28mg-4/s1600/006.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;320&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg039tolqnpMjo5fk9b5sLyKVXe1uxNPT2R7x4MSTJ4vaMmOhoPxGns-ANEbDbWZ8JpDqLGk_VMwLhEEKgEuDZUJjBfuVLHOfk24OFk0XeoE5DPS90oOWeAhBlUPM5CIw8b0-fGj28mg-4/s320/006.png&quot; width=&quot;275&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhoyTSGhLYuAlcvy-6OFhg6BkE6wtgkI_5d4prTJSZO_x1U-tcmR1cJFA29wfB1D8vfurlsWkWwbLO_wbs0EWbVABHf7h-LFaS1A5nSBGbnEZmk1-ew_riZoVeS-j42QWsrfMAjpHQA7RA/s1600/007.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;320&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhoyTSGhLYuAlcvy-6OFhg6BkE6wtgkI_5d4prTJSZO_x1U-tcmR1cJFA29wfB1D8vfurlsWkWwbLO_wbs0EWbVABHf7h-LFaS1A5nSBGbnEZmk1-ew_riZoVeS-j42QWsrfMAjpHQA7RA/s320/007.png&quot; width=&quot;253&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEja4dssBKMcc9TJ5IYupEKNhiCSGDlQNnxNWSJAy0OJ9p1uOtZ9uIMzUPwbS5dUgXsu45ZrWbVDaMgz8rvGriXadGBWz3r00oqznhhcMdeBG-bUFTJ7WXDF2SMGhH5jUnnOosZKJeRrXLo/s1600/008.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;296&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEja4dssBKMcc9TJ5IYupEKNhiCSGDlQNnxNWSJAy0OJ9p1uOtZ9uIMzUPwbS5dUgXsu45ZrWbVDaMgz8rvGriXadGBWz3r00oqznhhcMdeBG-bUFTJ7WXDF2SMGhH5jUnnOosZKJeRrXLo/s320/008.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Dream On!</description><link>http://bytescrolls.blogspot.com/2012/04/hadoop-meetup-inmobi-bangalore.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEimnmNciSuKLA7ykBGFs65u4yEE1u3aAbUlOSMpR4XfjeqX0zQrrsfSsex8214i4frhaF0dIMsRdltMz5bQTP16QOihaWofC3z9xG5LsfSvWfO8mNH1vfCWfpNcyLyQ_BIgj6dyiQmxwE0/s72-c/003.png" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-5220825718601396986</guid><pubDate>Thu, 08 Mar 2012 18:40:00 +0000</pubDate><atom:updated>2012-03-09T00:35:32.853+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">big data</category><category domain="http://www.blogger.com/atom/ns#">database</category><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">hive</category><category domain="http://www.blogger.com/atom/ns#">indexing</category><category domain="http://www.blogger.com/atom/ns#">java</category><category domain="http://www.blogger.com/atom/ns#">programming</category><category domain="http://www.blogger.com/atom/ns#">tech</category><category domain="http://www.blogger.com/atom/ns#">technology</category><title>Creating index in Hive</title><description>&lt;div class=&quot;posterous_autopost&quot;&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;http://hive.apache.org/images/hive_logo_medium.jpg&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;http://hive.apache.org/images/hive_logo_medium.jpg&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Simple:&lt;br /&gt;
&lt;blockquote class=&quot;posterous_short_quote&quot;&gt;
CREATE INDEX idx ON TABLE tbl(col_name) AS &#39;Index_Handler_QClass_Name&#39; IN TABLE tbl_idx;&lt;/blockquote&gt;
To make indexing algorithms pluggable, one has to mention the associated class name that handles indexing, e.g. &lt;a href=&quot;http://hive.apache.org/docs/r0.7.0/api/org/apache/hadoop/hive/ql/index/compact/CompactIndexHandler.html&quot;&gt;org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler&lt;/a&gt;&lt;br /&gt;
The index handler classes implement &lt;a href=&quot;http://hive.apache.org/docs/r0.7.0/api/org/apache/hadoop/hive/ql/index/HiveIndexHandler.html&quot;&gt;HiveIndexHandler&lt;/a&gt;&lt;br /&gt;
Full Syntax:&lt;br /&gt;
&lt;blockquote class=&quot;posterous_medium_quote&quot;&gt;
CREATE INDEX index_name &lt;br /&gt;
ON TABLE base_table_name (col_name, ...)&lt;br /&gt;
AS &#39;index.handler.class.name&#39;&lt;br /&gt;
[WITH DEFERRED REBUILD]&lt;br /&gt;
[IDXPROPERTIES (property_name=property_value, ...)]&lt;br /&gt;
[IN TABLE index_table_name]&lt;br /&gt;
[PARTITIONED BY (col_name, ...)]&lt;br /&gt;
[&lt;br /&gt;
&amp;nbsp;&amp;nbsp; [ ROW FORMAT ...] STORED AS ...&lt;br /&gt;
&amp;nbsp;&amp;nbsp; | STORED BY ...&lt;br /&gt;
]&lt;br /&gt;
[LOCATION hdfs_path]&lt;br /&gt;
[TBLPROPERTIES (...)]&lt;br /&gt;
[COMMENT &quot;index comment&quot;]&lt;/blockquote&gt;
&lt;ul&gt;
&lt;li&gt;WITH DEFERRED REBUILD - the newly created index is initially empty. REBUILD can be used to bring the index up to date.&lt;/li&gt;
&lt;/ul&gt;
&lt;ul&gt;
&lt;li&gt;IDXPROPERTIES/TBLPROPERTIES - declaring keyspace properties &lt;/li&gt;
&lt;/ul&gt;
&lt;ul&gt;
&lt;li&gt;PARTITIONED BY - table columns by which the index gets partitioned; if not specified, the index spans all table partitions&lt;/li&gt;
&lt;/ul&gt;
&lt;ul&gt;
&lt;li&gt;ROW FORMAT&amp;nbsp; - custom SerDe or using native &lt;a href=&quot;https://cwiki.apache.org/confluence/display/Hive/SerDe&quot;&gt;SerDe&lt;/a&gt;(Serializer/Deserializer for Hive read/write). A native SerDe is used if ROW FORMAT is not specified &lt;/li&gt;
&lt;/ul&gt;
&lt;ul&gt;
&lt;li&gt;STORED AS&amp;nbsp; - index table storage format like &lt;a href=&quot;http://en.wikipedia.org/wiki/RCFile%20&quot;&gt;RCFILE &lt;/a&gt;or SEQUENCEFILE. The user has to specify a unique tbl_idx name, which is required for a qualified index name across tables; otherwise they are named automatically. STORED BY - can be HBase (I haven&#39;t tried it)&lt;/li&gt;
&lt;/ul&gt;
&lt;br /&gt;
The index can be stored in a Hive table or as an RCFILE in an HDFS path etc. In this case, the implemented&amp;nbsp; index handler class&#39;s usesIndexTable() method will return false. When the index is created, generateIndexBuildTaskList(...) in the index handler class will generate a plan for building the index.&lt;br /&gt;
&lt;br /&gt;
Consider CompactIndexHandler from Hive distribution,&lt;br /&gt;
&lt;br /&gt;
It&amp;nbsp; only stores the addresses of HDFS blocks containing that value. The index is stored in hive metastore FieldSchema as _bucketname and _offsets in the index table.&lt;br /&gt;
&lt;br /&gt;
ie the index table contains 3 columns, with _unparsed_column_names_from_field schema (indexed columns), _bucketname(table partition hdfs file having columns),[&quot; _blockoffsets&quot;,...&quot;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
See the code from CompactIndexHandler,&lt;br /&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/2002551.js&quot;&gt;
 
&lt;/script&gt;
&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2012/03/creating-index-in-hive.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>19</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-3017890108747476052</guid><pubDate>Tue, 06 Mar 2012 11:56:00 +0000</pubDate><atom:updated>2012-03-06T17:26:06.958+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">hadoop</category><title>What&#39;s it about Cascading?</title><description>&lt;table cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;float: left; margin-right: 1em; text-align: left;&quot;&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;a href=&quot;https://www.rapleaf.com/images/developers/open_source/cascading.gif&quot; imageanchor=&quot;1&quot; style=&quot;clear: left; margin-bottom: 1em; margin-left: auto; margin-right: auto;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;200&quot; src=&quot;https://www.rapleaf.com/images/developers/open_source/cascading.gif&quot; width=&quot;200&quot; /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;&lt;br /&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;br /&gt;
&lt;br /&gt;
Cascading helps in manipulating data in &lt;a href=&quot;http://hadoop.apache.org/&quot;&gt;Hadoop&lt;/a&gt;. It is a framework written in Java which abstracts map reduce and allows you to write scripts to read and modify data inside Hadoop. It provides a programming API for defining and executing fault tolerant data processing workflows and a query processing API in which the developers can go without map reduce. There are quite a number of DSLs built on top of Cascading, most notably &lt;a href=&quot;https://github.com/nathanmarz/cascalog&quot;&gt;Cascalog &lt;/a&gt;(written in Clojure) and &lt;a href=&quot;https://github.com/twitter/scalding&quot;&gt;Scalding &lt;/a&gt;(written in Scala). There is also the &lt;a href=&quot;http://pig.apache.org/&quot;&gt;Pig &lt;/a&gt;data processing API, which is similar but SQLy. &lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Terminology&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Taps &lt;/i&gt;- streams of source (input) and sink (output)&lt;br /&gt;&lt;i&gt;Tuple &lt;/i&gt;- can be considered as a result set. This is a single row with named columns of data being processed. A series of tuples makes a &lt;i&gt;stream&lt;/i&gt;. All tuples in a stream have the exact same fields.&lt;br /&gt;&lt;i&gt;Pipes &lt;/i&gt;- tie operations together when executed upon a Tap. A &lt;i&gt;Pipe Assembly&lt;/i&gt; is created when pipes are successively executed. Pipe assemblies are Directed Acyclic Graphs.&lt;br /&gt;&lt;i&gt;Flows &lt;/i&gt;- reusable combinations of source, sink and pipe assemblies.&lt;br /&gt;&lt;i&gt;Cascade &lt;/i&gt;- series of flows&lt;br /&gt;&lt;br /&gt;&lt;b&gt;What operations are possible?&amp;nbsp; &lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Relational &lt;/i&gt;- Join, Filter, Aggregate etc&lt;br /&gt;&lt;i&gt;Each &lt;/i&gt;- for each row result (tuple)&lt;br /&gt;&lt;i&gt;Group &lt;/i&gt;- Groupby&lt;br /&gt;&lt;i&gt;CoGroup &lt;/i&gt;- joins for tuples&lt;br /&gt;&lt;i&gt;Every &lt;/i&gt;- for every key in group or cogroup, like an aggregate function applied to all tuples in a group at once&lt;br /&gt;&lt;i&gt;SubAssembly &lt;/i&gt;- nesting reusable pipe assemblies into a Pipe&lt;br /&gt;&lt;br /&gt;Internally, Cascading employs an intelligent planner to convert the pipe assembly to a graph of dependent MapReduce jobs that can be executed on a Hadoop cluster. &lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
What are the advantages from a normal map reduce workflow do this Cascading have? (Need to investigate!)&lt;br /&gt;</description><link>http://bytescrolls.blogspot.com/2012/03/whats-it-about-cascading.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-7661555983637547632</guid><pubDate>Thu, 01 Mar 2012 17:16:00 +0000</pubDate><atom:updated>2012-03-01T22:46:00.117+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">cybernetics</category><category domain="http://www.blogger.com/atom/ns#">prometheus</category><category domain="http://www.blogger.com/atom/ns#">ted</category><title>O Blimey! TED Talk 2023</title><description>&lt;div class=&#39;posterous_autopost&#39;&gt;&lt;p&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;em&gt;&lt;span&gt;&lt;a href=&quot;http://www.imdb.com/title/tt1446714/&quot; target=&quot;_blank&quot;&gt;Prometheus&lt;/a&gt; &lt;/span&gt;&lt;/em&gt;&lt;span&gt;&lt;em&gt;&amp;nbsp;&lt;/em&gt;&lt;/span&gt;film, going viral... like a fire that danced at the end of the match.&lt;/p&gt;  &lt;p&gt;Aha! cybernetic life-forms...&lt;/p&gt;  &lt;blockquote&gt;  &lt;p&gt;The only &quot;purpose&#39;&#39; (in the biological sense) of this identity is to preserve its own existence in time, that is to survive in current, specific environmental conditions, as well as to produce as many copies of itself as possible. The entire network of negative feedback mechanisms is ultimately directed at the latter task. 
Within the cybernetic paradigm, however, reproduction is nothing but a positive feedback.&lt;/p&gt;  &lt;p&gt;&amp;nbsp;-from &lt;a href=&quot;http://scifunam.fisica.unam.mx/mir/defilife.pdf&quot; target=&quot;_blank&quot;&gt;Cybernetic Formulation of the Defnition of Life&lt;/a&gt;&lt;/p&gt;  &lt;/blockquote&gt;  &lt;p&gt;&amp;nbsp;&lt;/p&gt;  &lt;p&gt;&lt;iframe src=&quot;http://www.youtube.com/embed/GROrp3XBRrE&quot; frameborder=&quot;0&quot; height=&quot;315&quot; width=&quot;560&quot;&gt;&lt;/iframe&gt;&lt;/p&gt;  &lt;p&gt;&lt;img alt=&quot;&quot; style=&quot;border: 0px solid blue;&quot; /&gt;&lt;/p&gt;&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2012/03/o-blimey-ted-talk-2023.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://img.youtube.com/vi/GROrp3XBRrE/default.jpg" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-6094269740081983471</guid><pubDate>Tue, 28 Feb 2012 21:23:00 +0000</pubDate><atom:updated>2012-02-29T02:53:20.489+05:30</atom:updated><title>Tinker, Tailor, Soldier, Spy and The Perspicacious &amp;quot;Collusion&amp;quot;</title><description>&lt;div class=&#39;posterous_autopost&#39;&gt;&lt;p&gt;Collusion!&lt;/p&gt;  &lt;blockquote class=&quot;posterous_short_quote&quot;&gt;  &lt;p&gt;A secret agreement between two or more parties for a fraudulent, illegal, or deceitful purpose.&lt;/p&gt;  &lt;/blockquote&gt;  &lt;p&gt;In this battleground of privacy wars and illusionary consumer willpower, there comes another wizard to show you the goblins who steal your data.. 
&lt;a href=&quot;http://www.mozilla.org/en-US/collusion/demo/&quot; target=&quot;_blank&quot;&gt;Collusion from Mozilla&lt;/a&gt;.&lt;/p&gt;  &lt;p&gt;&lt;div class=&#39;p_embed p_image_embed&#39;&gt; &lt;a href=&quot;http://getfile5.posterous.com/getfile/files.posterous.com/temp-2012-02-28/mgDGmrfknefBakdgkulaccbspbJasclcIEfqnnJjbCeoGEmIAipczxqBfIJJ/collusion.png.scaled1000.png&quot;&gt;&lt;img alt=&quot;Collusion&quot; height=&quot;500&quot; src=&quot;http://getfile2.posterous.com/getfile/files.posterous.com/temp-2012-02-28/mgDGmrfknefBakdgkulaccbspbJasclcIEfqnnJjbCeoGEmIAipczxqBfIJJ/collusion.png.scaled500.png&quot; width=&quot;500&quot; /&gt;&lt;/a&gt; &lt;/div&gt; &lt;/p&gt;  &lt;blockquote class=&quot;posterous_medium_quote&quot;&gt;  &lt;p&gt;Collusion is an experimental add-on for Firefox and allows you to     see all the third parties that are tracking your movements across     the Web. It will show, in real time, how that data creates a     spider-web of interaction between companies and other trackers.&lt;/p&gt;  &lt;/blockquote&gt;  &lt;p&gt;Oh yeah, thanks mozilla for helping us to find the hooligans steal our cookies! Yeah we can now haplessly stare at the &lt;a href=&quot;http://privacychoice.org/companies/all&quot; target=&quot;_blank&quot;&gt;red devils&lt;/a&gt; and haloing thieves&lt;/p&gt;  &lt;p&gt;What the heck! We don&#39;t have time for tracking everything in our life. Anyway, the stuff looks cool... 
collusion, interesting word.&lt;/p&gt;  &lt;p&gt;&lt;img alt=&quot;&quot; style=&quot;border: 0px solid blue;&quot; /&gt;&lt;/p&gt;&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2012/02/tinker-tailor-soldier-spy-and.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-7715276159817365079</guid><pubDate>Tue, 28 Feb 2012 17:34:00 +0000</pubDate><atom:updated>2012-02-28T23:04:12.550+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">big data</category><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">semantic</category><title>The mythical unstructured data!</title><description>&lt;div class=&#39;posterous_autopost&#39;&gt;&lt;p&gt;As semantic web and big data integration gaining its &lt;a href=&quot;http://www.youtube.com/watch?v=0weD8iBsmqM&amp;amp;feature=related&quot; target=&quot;_blank&quot;&gt;fus-ro-dah&lt;/a&gt;, enterprises are finding a way to harness any available form of information swarming the web and the world&lt;/p&gt;  &lt;p&gt;I came across some interesting artcles which gives a concise idea of harnessing metadata from unstructured data....&lt;/p&gt;  &lt;p&gt;&lt;a href=&quot;http://bigmenoncontent.com/2010/09/21/the-myth-of-unstructured-data/&quot; target=&quot;_blank&quot;&gt;&lt;em class=&quot;info&quot;&gt;Lee Dallas&lt;/em&gt;&lt;/a&gt; says&lt;/p&gt;  &lt;blockquote class=&quot;posterous_medium_quote&quot;&gt;  &lt;p&gt;In some respects it&amp;nbsp;is analogous to hieroglyphics where&amp;nbsp;pictographs  carry abstract meaning.&amp;nbsp; The data may not be easily interpretable by  machines but document recognition and capture technologies improve  daily. 
The fact that an error rate still exists in recognition does not  mean that the content lacks structure.&amp;nbsp;&amp;nbsp;Simply that the form it takes is  too complex for simple processes to understand.&lt;/p&gt;  &lt;/blockquote&gt;  &lt;p&gt;more here : &lt;a href=&quot;http://bigmenoncontent.com/2010/09/21/the-myth-of-unstructured-data/&quot; target=&quot;_blank&quot;&gt;http://bigmenoncontent.com/2010/09/21/the-myth-of-unstructured-data/&lt;/a&gt;&lt;/p&gt;  &lt;p&gt;&lt;span class=&quot;post-author vcard&quot;&gt; &lt;span class=&quot;fn&quot;&gt; &lt;a href=&quot;https://profiles.google.com/116955183943966626962&quot; title=&quot;author profile&quot; rel=&quot;author&quot;&gt; &lt;/a&gt;&lt;em&gt;&lt;a href=&quot;http://www.linkedin.com/in/ramsub&quot; target=&quot;_blank&quot;&gt;Ram Subramanyam Gopalan&lt;/a&gt; &lt;/em&gt;says&lt;em&gt;&lt;br /&gt;&lt;/em&gt;&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;  &lt;blockquote class=&quot;posterous_medium_quote&quot;&gt;  &lt;p&gt;A lot of data growth is happening around these so-called unstructured  data types. 
Enterprises which manage to automate the collection,  organization and analysis of these data types, will derive competitive  advantage.&lt;br /&gt; Every data element does mean something, though what it means may not always be relevant for you.&lt;/p&gt;  &lt;/blockquote&gt;  &lt;p&gt;more here : &lt;a href=&quot;http://bigdataintegration.blogspot.in/2012/02/unstructured-data-is-myth.html&quot;&gt;http://bigdataintegration.blogspot.in/2012/02/unstructured-data-is-myth.html&lt;/a&gt;&lt;/p&gt;  &lt;p&gt;&amp;nbsp;&lt;/p&gt;  &lt;p&gt;&amp;nbsp;&lt;/p&gt;  &lt;p&gt;&lt;img alt=&quot;&quot; style=&quot;border: 0px solid blue;&quot; /&gt;&lt;/p&gt;&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2012/02/mythical-unstructured-data.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-4492126112876800994</guid><pubDate>Mon, 27 Feb 2012 11:56:00 +0000</pubDate><atom:updated>2012-02-27T17:26:53.378+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">distributed systems</category><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">nosql</category><title>Consistent Hashing</title><description>What is a consistent hash function? &lt;br /&gt;
&lt;br /&gt;
A consistent hash function is one which changes minimally as the
range of the function changes.&lt;br /&gt;
&lt;br /&gt;
What&#39;s the advantage of such functions?&lt;br /&gt;
&lt;br /&gt;
This is ideal when the set of buckets changes over time. Two users with
inconsistent but overlapping sets of buckets will map items to the
same bucket with high probability. So this eliminates the need of
&quot;maintaining&quot; a consistent &quot;state&quot; among all
nodes in a network. The algorithm can be used for making consistent
assignments or relationships between different sets of data in such a
way that if we add or remove items, the algorithm can be recalculated
on any machine and produce the same results.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Theory&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
A view V is the set of buckets of which a user is aware. A client uses a
consistent hash function,&lt;i&gt; f(V,i)&lt;/i&gt;, to map an object to one of the
buckets in the view. Say, assign each of the hash buckets to random
points on a mod &lt;i&gt;2^n&lt;/i&gt; circle (virtually!) where the hash
key &lt;i&gt;size = n.&lt;/i&gt; The hash of an object = the closest clockwise bucket. These
small sets of buckets lie near the object. In this case, all the
buckets get roughly the same number of items. When the &lt;i&gt;kth &lt;/i&gt;bucket is added,
only a &lt;i&gt;1/k&lt;/i&gt; fraction of items move. This means when a new node is added,
only a minimal reshuffle is needed, which is the advantage of having a
view. There can be a hash structure for the key lookup (a balanced
tree) which stores the hash of all nodes (in the view).&amp;nbsp; When a
new node is added its hash value is added to the table.&lt;br /&gt;
&lt;br /&gt;
Suppose there are two nodes A and B and three objects 1–3 (mapped to
a hash-function’s result range). The objects 3 and 1 are mapped to
node A, object 2 to node B. When a node leaves the system, data will
get mapped to their adjacent node (in clockwise direction) and when a
node enters the system it will get hashed onto the ring and will
overtake objects.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhPVa8ENOeklQ4aUU3rlpcRfAJ9YDAAPt65kkHwT5ey_TcAGWeAf4fNznPRzAdwoVAPjhClIpIwBgzvnEX8k8vA8gIHZer2djRj_GwwG4S8sZz-0Cbu1_N77rXFye0jspYUrbuMbjIxtJg/s1600/consistent+hashing.PNG&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhPVa8ENOeklQ4aUU3rlpcRfAJ9YDAAPt65kkHwT5ey_TcAGWeAf4fNznPRzAdwoVAPjhClIpIwBgzvnEX8k8vA8gIHZer2djRj_GwwG4S8sZz-0Cbu1_N77rXFye0jspYUrbuMbjIxtJg/s1600/consistent+hashing.PNG&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
As an example, (refer &lt;a href=&quot;http://www.tomkleinpeter.com/2008/03/17/programmers-toolbox-part-3-consistent-hashing/&quot;&gt;link1&lt;/a&gt;, &lt;a href=&quot;http://www.paperplanes.de/2011/12/9/the-magic-of-consistent-hashing.html&quot;&gt;link2&lt;/a&gt;), the circle denotes a range of&amp;nbsp; key values. Say, the points in circle represents 64 bit numbers. Hash the data to get the 64 bit number, which is a point in the circle. Take the IPs of nodes and hash them into 64 bit number and point in the circle. Associate the data to the nodes in the clockwise direction (ie. closest, which can be retrieved from the node in the hash structure).&amp;nbsp; When a new node is inserted into the hash tree, the data will always be assigned to the closest one only. Everything between this number and one that&#39;s next in the ring and that has
been picked by a different node previously, now belongs to this node.&lt;br /&gt;
&lt;br /&gt;
The basic idea of consistent hash function is to hash both objects
and buckets using the same function. It&#39;s one of the best ways to
implement APIs that can dynamically scale out and rebalance. The
client applications can calculate which node to contact in order to
request or write the data with no metadata server required.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Used by&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://memcached.org/&quot;&gt;memcached&lt;/a&gt; cluster.&lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&lt;i&gt;Typically, multiple memcached daemons are started, on different hosts. The clients are passed a list of memcached addresses (IP address and port) and pick one daemon for a given key. This is done via consistent hashing, which always maps the same key K to the same memcached server S. When a server crashes, or a new server is added, consistent hashing makes sure that the ensuing rehashing is minimal. Which means that most keys still map to the same servers, but keys hashing to a removed server are rehashed to a new server.&lt;/i&gt; - from &lt;a href=&quot;http://www.jgroups.org/memcached/memcached.pdf&quot;&gt;A memcached implementation in JGroups&lt;/a&gt;&lt;/blockquote&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://en.wikipedia.org/wiki/Dynamo_%28storage_system%29&quot;&gt;Amazon&#39;s Dynamo &lt;/a&gt;uses consistent hashing along with replication as a partitioning scheme. &lt;br /&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&lt;i&gt;Data is partitioned and replicated using consistent hashing [10], and consistency is facilitated by object versioning [12]. The consistency among replicas during updates is maintained by a quorum-like technique and a decentralized replica synchronization protocol. &lt;/i&gt;- from &lt;a href=&quot;http://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf&quot;&gt;Dynamo: Amazon&#39;s Highly Available Key-value Store&lt;/a&gt;&lt;/blockquote&gt;
&lt;br /&gt;
Data of a &lt;a href=&quot;http://cassandra.apache.org/&quot;&gt;Cassandra&lt;/a&gt; table gets partitioned and distributed among
the nodes by a consistent hashing function.&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&lt;i&gt;Cassandra partitions data across the cluster using consistent hashing [11] but uses an order preserving hash function to do so. In consistent hashing the output range of a hash function is treated as a circular space or &quot;ring&quot; (i.e. the largest hash value wraps around to the smallest hash value). Each node in the system is assigned a random value within this space which represents its position on the ring. Each data item identified by a key is assigned to a node by hashing the data item&#39;s key to yield its position on the ring, and then walking the ring clockwise to find the first node with a position larger than the item&#39;s position. This node is deemed the coordinator for this key. The application specifies this key and the Cassandra uses it to route requests. Thus, each node becomes responsible for the region in the ring between it and its predecessor node on the ring. The principal advantage of consistent hashing is that departure or arrival of a node only affects its immediate neighbors and other nodes remain unaffected.&lt;/i&gt; - from&amp;nbsp; &lt;a href=&quot;http://www.cs.cornell.edu/projects/ladis2009/papers/lakshman-ladis2009.pdf&quot;&gt;Cassandra - A Decentralized Structured Storage System&lt;/a&gt;&lt;/blockquote&gt;
&amp;nbsp;&lt;a href=&quot;http://project-voldemort.com/&quot;&gt;Voldemort &lt;/a&gt;automatic sharding&amp;nbsp; of&amp;nbsp; data. Nodes&amp;nbsp; can&amp;nbsp;
be added&amp;nbsp; or&amp;nbsp; removed&amp;nbsp; from&amp;nbsp; a&amp;nbsp; database&amp;nbsp;
cluster,&amp;nbsp; and&amp;nbsp; the system adapts automatically.&amp;nbsp;
Voldemort automatically detects and recovers failed nodes. [&lt;a href=&quot;http://qconsf.com/dl/qcon-sanfran-2009/slides/JayKreps_ProjectVoldemortScalingSimpleStorageAtLinkedIn.pdf&quot;&gt;refer&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;b&gt;References:&lt;/b&gt;&lt;br /&gt;
&lt;a href=&quot;http://www.akamai.com/dl/technical_publications/ConsistenHashingandRandomTreesDistributedCachingprotocolsforrelievingHotSpotsontheworldwideweb.pdf&quot;&gt;http://www.akamai.com/dl/technical_publications/ConsistenHashingandRandomTreesDistributedCachingprotocolsforrelievingHotSpotsontheworldwideweb.pdf&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;http://sharplearningcurve.com/blog/2010/09/27/consistent-hashing/&quot;&gt;http://sharplearningcurve.com/blog/2010/09/27/consistent-hashing/&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;http://weblogs.java.net/blog/tomwhite/archive/2007/11/consistent_hash.html&quot;&gt;http://weblogs.java.net/blog/tomwhite/archive/2007/11/consistent_hash.html&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;div style=&quot;margin-bottom: 0in;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2012/02/consistent-hashing.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhPVa8ENOeklQ4aUU3rlpcRfAJ9YDAAPt65kkHwT5ey_TcAGWeAf4fNznPRzAdwoVAPjhClIpIwBgzvnEX8k8vA8gIHZer2djRj_GwwG4S8sZz-0Cbu1_N77rXFye0jspYUrbuMbjIxtJg/s72-c/consistent+hashing.PNG" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-2722553167571766309</guid><pubDate>Sun, 26 Feb 2012 17:16:00 +0000</pubDate><atom:updated>2012-02-26T22:46:56.084+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">map reduce</category><category domain="http://www.blogger.com/atom/ns#">parallel</category><title>About Bulk Synchronous Parallel(BSP) model</title><description>As an alternative to mapreduce paradigm, there is another parallel computing model&amp;nbsp; called Bulk Synchronous Parallel(BSP). A BSP computer is defined as a set of processors with local memory, interconnected by a communication mechanism (e. g., a network or shared memory) capable of point-to-point communication, and a barrier synchronization mechanism. It differentiates/decouples the use of local memory from that of remote memory. A BSP program consists of a set of BSP processes and a sequence of super-steps—time intervals bounded by the barrier synchronization. Each processor has its own local memory module, and all other memories are non-local where they are accessed by networking. The communication between processors are non-blocking.The essence of the BSP model is super-step. At the start of super step computations are done locally. 
Then, using the messaging system in the network, the other processes can handle requests for further computation. The communication and synchronization are decoupled. There exists a barrier synchronization in which the processors wait and sync when all communications are completed. When all processes have invoked the sync method and all messages are delivered, the next super-step begins. Then the messages sent during the previous super-step can be accessed by their recipients. Data locality is an inherent part of this model, in which communication is made only when the peer data is necessary. This is different from mapreduce frameworks, which do not preserve data locality in consecutive operations. During mapreduce processing, input data is generally passed through either many passes of mapreduce or mapreduce iteration in order to derive final results, which adds communication cost on top of the processing cost. So BSP is useful with many programs requiring iterations and recursions.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi84lB-WJNRANmYY1JcdDukmj5yQZyfjefcxclpALsxgDSGWS4iQ01lNJH8Ezb8ipzsNOtFYUeqsDRpLkYEPIk3gN0VpyniBATDzpSYK0oV_bCFzD9netoFFYcrd9dgbIzyAWHrP8OihNU/s1600/bsp.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi84lB-WJNRANmYY1JcdDukmj5yQZyfjefcxclpALsxgDSGWS4iQ01lNJH8Ezb8ipzsNOtFYUeqsDRpLkYEPIk3gN0VpyniBATDzpSYK0oV_bCFzD9netoFFYcrd9dgbIzyAWHrP8OihNU/s1600/bsp.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
Apache Hama&amp;nbsp; is one such project enabling hadoop to leverage BSP. Google &lt;a href=&quot;http://googleresearch.blogspot.in/2009/06/large-scale-graph-computing-at-google.html?z&quot;&gt;Pregel &lt;/a&gt;uses BSP for large scale mining of graphs.&lt;br /&gt;
&lt;br /&gt;
reference:&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://en.wikipedia.org/wiki/Bulk_synchronous_parallel&quot;&gt;http://en.wikipedia.org/wiki/Bulk_synchronous_parallel&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;http://incubator.apache.org/hama/&quot;&gt;http://incubator.apache.org/hama/&lt;/a&gt;&lt;br/&gt;&lt;br/&gt;</description><link>http://bytescrolls.blogspot.com/2012/02/about-bulk-synchronous-parallelbsp.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi84lB-WJNRANmYY1JcdDukmj5yQZyfjefcxclpALsxgDSGWS4iQ01lNJH8Ezb8ipzsNOtFYUeqsDRpLkYEPIk3gN0VpyniBATDzpSYK0oV_bCFzD9netoFFYcrd9dgbIzyAWHrP8OihNU/s72-c/bsp.png" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-3294630419062203864</guid><pubDate>Fri, 15 Jul 2011 17:42:00 +0000</pubDate><atom:updated>2011-07-15T23:12:16.677+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">algorithms</category><title>Boyer and Moore&#39;s Linear Time Voting Algorithm</title><description>This is a simple linear time voting algorithm designed by Robert S Boyer and J Stother Moore in 1980 which is discussed in their paper &lt;i&gt;&lt;a href=&quot;http://www.cs.utexas.edu/users/boyer/mjrty.ps.Z&quot;&gt;MJRTY - A Fast Majority Vote Algorithm&lt;/a&gt;&lt;/i&gt;.&lt;br /&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;This algorithm decides which element of a sequence is in the majority, provided there is such an element.&amp;nbsp;&lt;/div&gt;&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;Suppose there are n characters (objects/candidates). When the ith element&amp;nbsp;is visited, the set can be divided into two groups: a group of k elements in favor of the currently selected candidate and&amp;nbsp;a group of elements that disagree. After processing all elements, we can conclude&amp;nbsp;that the selected candidate is the majority, if there is one!&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
When the pointer moves forward over an element e:&lt;br /&gt;
&lt;br /&gt;
If the counter is 0, we set the current candidate to e and we set the counter to 1.&lt;br /&gt;
If the counter is not 0, we increment or decrement the counter according to whether e is the current candidate.&lt;br /&gt;
When we are done, the current candidate is the majority element, if there is a majority.&lt;br /&gt;
&lt;br /&gt;
I have written a simple java implementation.&lt;br /&gt;
&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/1085114.js&quot;&gt; &lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
Sometimes ties may occur, and this algorithm does not handle that case. For assurance, if the vote count is greater than n/2, the candidate which is returned&amp;nbsp;as majority is announced to be the selected one. This counting&amp;nbsp;phase can be done when the increment for the candidate happens. This algorithm is really effective when the data is read from a tape. The algorithm only works when more than half of the elements constitute&amp;nbsp;the majority.&lt;br /&gt;
&lt;br /&gt;
Reference&lt;br /&gt;
&lt;a href=&quot;http://www.cs.utexas.edu/users/moore/best-ideas/mjrty/example.html&quot;&gt;http://www.cs.utexas.edu/users/moore/best-ideas/mjrty/example.html&lt;/a&gt;</description><link>http://bytescrolls.blogspot.com/2011/07/boyer-and-moores-linear-time-voting.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-7021673409515476499</guid><pubDate>Thu, 14 Jul 2011 15:38:00 +0000</pubDate><atom:updated>2012-02-28T02:11:49.646+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">java</category><title>Descending Iterator and Adapter pattern</title><description>There is a descending iterator in linked list implementation in Java SDK. A humble private class in LinkedList. A good example of adapter.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg8Lg8Ur0uKacImLomYHz3lOuZNcbJ0aOv93pfdHb0OKr1IN05UTYtNuKvMWIjkRgplHoDM3Y3pfH1HbJnR0CF2hyphenhyphen5hlvC6NXqwslrj5cNV16SkZ7g2yWpT5aR83Os8SwcMfP28JWrCqX0/s1600/300px-ClassAdapter.png&quot; /&gt;&lt;/div&gt;&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/1082669.js?file=link-1.java&quot;&gt;
&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
calls up &lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
public Iterator&amp;lt;E&amp;gt; descendingIterator() {&lt;br /&gt;
return new DescendingIterator();&lt;br /&gt;
}&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/1082671.js?file=desc.java&quot;&gt;
&lt;/script&gt;</description><link>http://bytescrolls.blogspot.com/2011/07/descending-iterator.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg8Lg8Ur0uKacImLomYHz3lOuZNcbJ0aOv93pfdHb0OKr1IN05UTYtNuKvMWIjkRgplHoDM3Y3pfH1HbJnR0CF2hyphenhyphen5hlvC6NXqwslrj5cNV16SkZ7g2yWpT5aR83Os8SwcMfP28JWrCqX0/s72-c/300px-ClassAdapter.png" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-5952776264752145965</guid><pubDate>Mon, 30 May 2011 12:42:00 +0000</pubDate><atom:updated>2011-05-30T20:14:31.212+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">avro</category><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">java</category><category domain="http://www.blogger.com/atom/ns#">programming</category><title>Using Avro to serialize logs in log4j</title><description>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj7WdyNoieN9hX9WEt2LetQuKIJtGk4jvgoUAoHh6-j1nXqIHiaY6Zj0aH3iF9h-xoTpd9t1DjB3-eH2Q8UlOonc9RMNpIZ1YgpOlhr5s2eqfezWlyHx6XJonvNGWHU2aFn-x7EWFmuiPg/s1600/logos.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;89&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj7WdyNoieN9hX9WEt2LetQuKIJtGk4jvgoUAoHh6-j1nXqIHiaY6Zj0aH3iF9h-xoTpd9t1DjB3-eH2Q8UlOonc9RMNpIZ1YgpOlhr5s2eqfezWlyHx6XJonvNGWHU2aFn-x7EWFmuiPg/s320/logos.png&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;I have written about serialization mechanism of Protocol Buffers previously.&amp;nbsp;Similarly,&amp;nbsp;&lt;a href=&quot;http://avro.apache.org/&quot;&gt;Apache Avro &lt;/a&gt;provides a better serialization framework.&amp;nbsp;&lt;/div&gt;&lt;br /&gt;
It provides features like:&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;- Independent Schema - &amp;nbsp;use different schemas for serialization and de-serialization&lt;br /&gt;
&amp;nbsp;- Binary serialization - compact data encoding, and faster data processing&lt;br /&gt;
&amp;nbsp;- Dynamic typing - serialization and deserialization without code generation&lt;br /&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;&amp;nbsp;We can encode data in two ways when serializing with Avro: binary or JSON. In the binary file, the schema is&amp;nbsp;included at the beginning of the file. In JSON, the type is defined along with the data.&amp;nbsp;Switching from the JSON protocol to a binary format in order to achieve better performance is&amp;nbsp;pretty straightforward with Avro. This means less type information needs to be sent with the data, and&amp;nbsp;because Avro stores data with its schema, any program can de-serialize the encoded data, which&amp;nbsp;makes it a good candidate for &lt;a href=&quot;http://en.wikipedia.org/wiki/Remote_procedure_call&quot;&gt;RPC&lt;/a&gt;.&lt;/div&gt;&lt;br /&gt;
&amp;nbsp;In Avro 1.5 we have to use (this is different from previous versions which had no factory for encoders)&lt;br /&gt;
&amp;nbsp;- &lt;i&gt;&lt;a href=&quot;http://avro.apache.org/docs/current/api/java/org/apache/avro/io/EncoderFactory.html#binaryEncoder(java.io.OutputStream, org.apache.avro.io.BinaryEncoder)&quot;&gt;org.apache.avro.io.EncoderFactory.binaryEncoder(OutputStream out, BinaryEncoder reuse)&lt;/a&gt;&lt;/i&gt; for binary&lt;br /&gt;
&amp;nbsp;- &lt;i&gt;&lt;a href=&quot;http://avro.apache.org/docs/1.5.1/api/java/org/apache/avro/io/EncoderFactory.html#jsonEncoder(org.apache.avro.Schema, org.codehaus.jackson.JsonGenerator)&quot;&gt;org.apache.avro.io.EncoderFactory.jsonEncoder(Schema schema, OutputStream out)&lt;/a&gt;&lt;/i&gt; for JSON&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;The values (Avro supported value types) are put for the schema field name as the key&lt;br /&gt;
&amp;nbsp;in a set of name-value pairs called &lt;a href=&quot;http://avro.apache.org/docs/1.5.1/api/java/org/apache/avro/generic/GenericData.Record.html&quot;&gt;&amp;nbsp;&lt;i&gt;GenericData.Record&lt;/i&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp;Avro supported value types are&lt;br /&gt;
&amp;nbsp; &lt;i&gt;Primitive Types&lt;/i&gt; -&lt;span class=&quot;Apple-tab-span&quot; style=&quot;white-space: pre;&quot;&gt; &lt;/span&gt;null, boolean, int,&lt;span class=&quot;Apple-tab-span&quot; style=&quot;white-space: pre;&quot;&gt; &lt;/span&gt;long, float, double, bytes,&lt;span class=&quot;Apple-tab-span&quot; style=&quot;white-space: pre;&quot;&gt; &lt;/span&gt;string&lt;span class=&quot;Apple-tab-span&quot; style=&quot;white-space: pre;&quot;&gt; &lt;/span&gt;&lt;br /&gt;
&amp;nbsp; &lt;i&gt;Complex Types&lt;/i&gt; - Records, Enums, Arrays, Maps, Unions, Fixed&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&amp;nbsp; you can read more about them &amp;nbsp;&lt;a href=&quot;http://avro.apache.org/docs/current/spec.html#schemas&quot;&gt;here&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp; An encoded schema definition has to be provided for the record instance. To read/write data,&amp;nbsp;just use the get/put methods.&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;/div&gt;&amp;nbsp; &amp;nbsp;I have used this serialization mechanism to provide a layout for log4j. The logs will&amp;nbsp;be serialized using the Avro mechanism.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;github &lt;/b&gt;project is here -&amp;nbsp;&lt;a href=&quot;https://github.com/harisgx/avro-log4j&quot;&gt;https://github.com/harisgx/avro-log4j&lt;/a&gt;&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&amp;nbsp; &amp;nbsp;Add the libraries to your project and add new properties to log4j.properties&lt;br /&gt;
&lt;br /&gt;
&lt;i&gt;&amp;nbsp; &amp;nbsp;log4j.appender.logger_name.layout=&lt;b&gt;com.avrolog.log4j.layout.AvroLogLayout&lt;/b&gt;&lt;/i&gt;&lt;br /&gt;
&lt;i&gt;&amp;nbsp; &amp;nbsp;log4j.appender.logger_name.layout.Type=&lt;b&gt;json&lt;/b&gt;&lt;/i&gt;&lt;br /&gt;
&lt;i&gt;&amp;nbsp; &amp;nbsp;log4j.appender.logger_name.layout.MDCKeys=mdcKey&lt;/i&gt;&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&amp;nbsp;Provide the MDC keys as comma-separated values &lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&amp;nbsp; &amp;nbsp;This is the schema&lt;br /&gt;
&lt;script src=&quot;https://gist.github.com/998835.js&quot;&gt;
 
&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&lt;br /&gt;
&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2011/05/using-avro-to-serialize-logs-in-log4j.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj7WdyNoieN9hX9WEt2LetQuKIJtGk4jvgoUAoHh6-j1nXqIHiaY6Zj0aH3iF9h-xoTpd9t1DjB3-eH2Q8UlOonc9RMNpIZ1YgpOlhr5s2eqfezWlyHx6XJonvNGWHU2aFn-x7EWFmuiPg/s72-c/logos.png" height="72" width="72"/><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-7995675645290100262</guid><pubDate>Sun, 22 May 2011 19:28:00 +0000</pubDate><atom:updated>2011-07-14T20:42:11.579+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">algorithms</category><category domain="http://www.blogger.com/atom/ns#">bloomfilter</category><category domain="http://www.blogger.com/atom/ns#">distributed systems</category><category domain="http://www.blogger.com/atom/ns#">hadoop</category><category domain="http://www.blogger.com/atom/ns#">hbase</category><title>Bloom Filters</title><description>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiIM-X_A8E99KrpjYJ_FV-w1ojpRvCHTXsKgEsxPaZwKb0tBRDulEOvb4oEqRfV7lGubnkzse_m35ayZ1Y0x_grATtL0cOZvwEf8JZUzqZ_k6meApSUDVhVYanGYFAgmHivYhqpMRtztp0/s1600/gbf.gif&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;br /&gt;
&lt;/a&gt;&lt;/div&gt;A Bloom filter is a probabilistic data structure. It can be used to store a set of data in a space-efficient manner. For example, in a distributed cache, &lt;i&gt;Cache Digests&lt;/i&gt; are shared as summaries between the nodes so that each has a global image.&amp;nbsp; &lt;br /&gt;
&lt;br /&gt;
The data structure can be used to answer membership queries, i.e. &lt;i&gt;checkIfDataPresentInStore()&lt;/i&gt;. If the element being checked was already inserted in the filter, the query will return true; there are no false negatives. But there is a chance that a query for an element that was not inserted may also return true (a false positive). In that case the check for that element can be done in the original store, i.e. the overhead is associated with the rate of false positives. This is different from a dictionary, in which the hit/miss is deterministic.&lt;br /&gt;
&lt;br /&gt;
For a set of n elements, a Bloom filter can be a bit vector of size m. Initially, all bits are set to 0. For each element e, k hash functions will set k bits in the bit vector to 1. When a membership query is executed, it checks whether the corresponding bit positions are set. If all of them match, the queried element is possibly present in the store; otherwise, it is surely not present. Each hash function returns the index to set. This means we have to store these m bits per key. So a total of m * N bits of space is required. The use of different hash functions results in fewer collisions.&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgePnYRFa34BFMGeL82pjvC7HFD7x8bUnprFXNOU3hmQYV7luGcr18x0tTSaxA6yXTf4-hhKhZl8GUJ24-NWiEoKwMcNNcw4Ih7ZqTto_RmGfSyEknExd2qglawgSAzpPCWZ3KTsalnnOs/s1600/gbf.gif&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;213&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgePnYRFa34BFMGeL82pjvC7HFD7x8bUnprFXNOU3hmQYV7luGcr18x0tTSaxA6yXTf4-hhKhZl8GUJ24-NWiEoKwMcNNcw4Ih7ZqTto_RmGfSyEknExd2qglawgSAzpPCWZ3KTsalnnOs/s320/gbf.gif&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;b&gt;Uses&lt;/b&gt;&lt;br /&gt;
&lt;ul style=&quot;text-align: left;&quot;&gt;&lt;li&gt;Design a &lt;a href=&quot;http://ipowerinfinity.wordpress.com/2008/03/02/bloom-filters-designing-a-spellchecker/&quot;&gt;spell checker&lt;/a&gt;.&amp;nbsp;&lt;/li&gt;
&lt;li&gt;Database join implementation (&lt;a href=&quot;http://www.dba-oracle.com/t_bloom_filter_hashing.htm&quot;&gt;Oracle&lt;/a&gt;) &amp;nbsp;&lt;/li&gt;
&lt;li&gt;Peer to peer (P2P) &lt;a href=&quot;http://www-math.mit.edu/~steng/18.996/MIT_Talk.ppt&quot;&gt;communication and routing &lt;/a&gt;&amp;nbsp;&lt;/li&gt;
&lt;li&gt;In &lt;a href=&quot;http://hbase.apache.org/&quot;&gt;HBase&lt;/a&gt;, the Bloom filter is stored as meta block in the &lt;a href=&quot;http://hbase.apache.org/docs/r0.20.4/api/org/apache/hadoop/hbase/io/hfile/HFile.html&quot;&gt;HFile&lt;/a&gt;. When a HFile is opened, the bloom filter is loaded into memory and used to determine if a given key is in that store file. This can avoid the scanning region for the key.&amp;nbsp;&lt;/li&gt;
&lt;li&gt;and &lt;a href=&quot;http://cs.unc.edu/~fabian/courses/CS600.624/slides/bloomslides.pdf&quot;&gt;more &lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;&lt;br /&gt;
I found a java implementation &lt;a href=&quot;http://wwwse.inf.tu-dresden.de/xsiena/bloom_filter&quot;&gt;here&lt;/a&gt;&lt;br /&gt;
Cassandra&#39;s java implementation &lt;a href=&quot;https://github.com/jbellis/cassandra-dev/tree/e284df7536ef32869b87d903a5f92f6a96c84801/src/com/facebook/infrastructure/utils&quot;&gt;here&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Reference&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://en.wikipedia.org/wiki/Bloom_filter&quot;&gt;http://en.wikipedia.org/wiki/Bloom_filter&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;https://issues.apache.org/jira/browse/HBASE-1200&quot;&gt;https://issues.apache.org/jira/browse/HBASE-1200&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;http://wiki.squid-cache.org/SquidFaq/CacheDigests&quot;&gt;http://wiki.squid-cache.org/SquidFaq/CacheDigests&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf&quot;&gt;http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2011/05/til-bloom-filters.html</link><author>noreply@blogger.com (Unknown)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgePnYRFa34BFMGeL82pjvC7HFD7x8bUnprFXNOU3hmQYV7luGcr18x0tTSaxA6yXTf4-hhKhZl8GUJ24-NWiEoKwMcNNcw4Ih7ZqTto_RmGfSyEknExd2qglawgSAzpPCWZ3KTsalnnOs/s72-c/gbf.gif" height="72" width="72"/><thr:total>3</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-7594619397377834317</guid><pubDate>Mon, 09 May 2011 18:23:00 +0000</pubDate><atom:updated>2011-08-19T14:47:05.570+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">API</category><category domain="http://www.blogger.com/atom/ns#">codes</category><category domain="http://www.blogger.com/atom/ns#">comet</category><category domain="http://www.blogger.com/atom/ns#">experiments</category><category domain="http://www.blogger.com/atom/ns#">github</category><category domain="http://www.blogger.com/atom/ns#">groovy</category><category domain="http://www.blogger.com/atom/ns#">java</category><category domain="http://www.blogger.com/atom/ns#">twitter</category><title>Labs</title><description>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;line-height: 16px;&quot;&gt;&lt;a href=&quot;https://github.com/harisgx/avro-log4j&quot;&gt;avro-log4j &lt;/a&gt;&amp;nbsp; -&amp;nbsp; &lt;/span&gt;serialization mechanism to provide a layout for log4j&lt;br /&gt;
&lt;br /&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;line-height: 16px;&quot;&gt;&lt;a href=&quot;http://code.google.com/p/firetester/&quot;&gt;&lt;span style=&quot;background-color: white;&quot;&gt;firetester&lt;/span&gt;&amp;nbsp;&lt;/a&gt;&amp;nbsp; - &amp;nbsp;A simple RESTful services testing tool written in&amp;nbsp;&lt;a href=&quot;http://en.wikipedia.org/wiki/Groovy_%28programming_language%29&quot;&gt;Groovy&lt;/a&gt;&amp;nbsp;&lt;a href=&quot;http://en.wikipedia.org/wiki/Griffon_%28framework%29&quot;&gt;Griffon&lt;/a&gt;&amp;nbsp;framework&lt;/span&gt;&lt;br /&gt;
&lt;div style=&quot;margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit; line-height: 16px;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;&lt;div style=&quot;margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;a href=&quot;https://github.com/harisgx/gitter&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;background-color: white;&quot;&gt;gitter&lt;/span&gt;&amp;nbsp;&lt;/a&gt;-&amp;nbsp;Publishes github activities to Twitter&lt;/span&gt;&lt;/div&gt;&lt;div style=&quot;margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;&lt;div style=&quot;margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;a href=&quot;https://github.com/harisgx/jfilemagic&quot; style=&quot;background-color: white;&quot;&gt;jfilemagic&lt;/a&gt;&lt;span style=&quot;background-color: white;&quot;&gt;&amp;nbsp;&lt;/span&gt;(jfm) is an utility for identifying files using&amp;nbsp;&lt;a href=&quot;http://en.wikipedia.org/wiki/Magic_number_%28programming%29&quot;&gt;magic numbers&lt;/a&gt;&amp;nbsp;or signatures&lt;/span&gt;&lt;/div&gt;&lt;div style=&quot;margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;&lt;div style=&quot;margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;&quot;&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;a href=&quot;http://bytescrolls-samples.googlecode.com/files/comet_sample.war&quot; style=&quot;background-color: white;&quot;&gt;cometd-chat&lt;/a&gt;&amp;nbsp;- a&amp;nbsp;&lt;a href=&quot;http://en.wikipedia.org/wiki/Comet_%28programming%29&quot;&gt;comet&lt;/a&gt;&amp;nbsp;based chatter for fun&lt;/span&gt;&lt;br /&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;br /&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2011/05/labs.html</link><author>noreply@blogger.com (Unknown)</author></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-818850914311213273.post-2362892539164626091</guid><pubDate>Wed, 13 Apr 2011 17:54:00 +0000</pubDate><atom:updated>2011-04-13T23:54:50.450+05:30</atom:updated><category domain="http://www.blogger.com/atom/ns#">java</category><category domain="http://www.blogger.com/atom/ns#">JVM</category><category domain="http://www.blogger.com/atom/ns#">programming</category><title>Interesting uses of sun.misc.Unsafe</title><description>Inspired from the &lt;a href=&quot;http://stackoverflow.com/questions/5574241/interesting-uses-of-sun-misc-unsafe&quot;&gt;question&lt;/a&gt; that found in stackoverflow, I started looking up for the uses. I found some pretty interesting ones...&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;div&gt;VM &quot;intrinsification.&quot; ie CAS (Compare-And-Swap) used in &lt;a href=&quot;http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf&quot;&gt;Lock-Free Hash Table&lt;/a&gt;s eg:&lt;a href=&quot;http://www.j7a.ru/classsun_1_1misc_1_1_unsafe.html#a9463edc21c5fb733b7f43bb02697fd8a&quot;&gt;sun.misc.Unsafe.compareAndSwapInt&lt;/a&gt; it can make real &lt;a href=&quot;http://en.wikipedia.org/wiki/Java_Native_Interface&quot;&gt;JNI&lt;/a&gt; calls into native code that contains special instructions for CAS&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;div&gt;What is intrinsification?&lt;/div&gt;&lt;div&gt;They are optimization done like compiler generating code directly for called method or JVM native optimizations. We know that there are VM downcalls from JDK like wait method etc. Its about low level programming. 
For eg:- the &lt;a href=&quot;http://download.oracle.com/javase/1.5.0/docs/api/java/util/concurrent/atomic/package-summary.html&quot;&gt;Atomic&lt;/a&gt; classes for numbers, they are pure numbers represented by objects but atomically modified in which the operations are managed natively.&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;read more about CAS here &lt;a href=&quot;http://en.wikipedia.org/wiki/Compare-and-swap&quot;&gt;http://en.wikipedia.org/wiki/Compare-and-swap&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;The &lt;a href=&quot;http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/src/dl/sun/misc/Unsafe.java?view=markup&quot;&gt;sun.misc.Unsafe&lt;/a&gt; functionality of the host VM can be used to allocate uninitialized objects and then interpret the constructor invocation as any other method call.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;One can track the data from the native address. It is possible to retrieve an object’s memory address using the sun.misc.Unsafe class, and operate on its fields directly via unsafe get/put methods!&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Compile time optimizations for JVM. High performance VM using &quot;magic&quot;, requiring low-level operations. 
eg: &lt;a href=&quot;http://en.wikipedia.org/wiki/Jikes_RVM&quot;&gt;http://en.wikipedia.org/wiki/Jikes_RVM&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Allocating memory, &lt;a href=&quot;http://www.j7a.ru/classsun_1_1misc_1_1_unsafe.html#a9463edc21c5fb733b7f43bb02697fd8a&quot;&gt;sun.misc.Unsafe.allocateMemory&lt;/a&gt; eg:- DirectByteBuffer constructor internally calls it when &lt;a href=&quot;http://download.oracle.com/javase/1.5.0/docs/api/java/nio/ByteBuffer.html&quot;&gt;ByteBuffer.allocateDirect&lt;/a&gt; is invoked&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Tracing the call stack and replaying with values instantiated by sun.misc.Unsafe, useful for instrumentation&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;a href=&quot;http://www.j7a.ru/classsun_1_1misc_1_1_unsafe.html#a779b0fead7abb6b98b17f1dfa3f77352&quot;&gt;sun.misc.Unsafe.arrayBaseOffset&lt;/a&gt; and &lt;a href=&quot;http://www.j7a.ru/classsun_1_1misc_1_1_unsafe.html#ac1d7700aabc2e524a8e5286ed8605e25&quot;&gt;arrayIndexScale&lt;/a&gt; can be used to develop &lt;a href=&quot;http://domino.watson.ibm.com/comm/research.nsf/pages/r.plansoft.innovation.html&quot;&gt;arraylets&lt;/a&gt;, a technique for efficiently breaking up large arrays into smaller objects to limit the real-time cost of scan, update or move operations on large objects&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;b&gt;References&lt;/b&gt;&lt;/div&gt;&lt;div&gt;&lt;b&gt;&lt;br /&gt;&lt;/b&gt;&lt;/div&gt;&lt;div&gt;&lt;a href=&quot;http://domino.watson.ibm.com/comm/research_people.nsf/pages/bacon.refereed-pubs.html/$FILE/Auerbach07DesignTR.pdf&quot;&gt;Design and implementation of a comprehensive real time java virtual machine&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;div&gt;&lt;a href=&quot;http://cs.anu.edu.au/~Robin.Garner/vee25-frampton.pdf&quot;&gt;Demystifying Magic: High-level Low-level 
Programming&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;a href=&quot;http://codespeak.net/svn/user/cfbolz/jitpl/doc/gal_papers/BebenitaGalFranz_JavainJava.pdf&quot;&gt;Implementing Fast JVM Interpreters Using Java Itself&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;a href=&quot;http://plrg.kaist.ac.kr/_media/activities/labseminars/taintdroid.pdf&quot;&gt;TaintDroid: An Information-Flow Tracking System for Realtime Privacy Monitoring on Smartphones&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;a href=&quot;http://www.cse.ohio-state.edu/~rountev/presto/pubs/fse07.pdf&quot;&gt;Efficient Checkpointing of Java Software Using Context-Sensitive Capture and Replay&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;a href=&quot;http://robaustin.wikidot.com/how-to-write-to-direct-memory-locations-in-java&quot;&gt;How To Write Directly to a Memory Locations In Java&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;/div&gt;</description><link>http://bytescrolls.blogspot.com/2011/04/interesting-uses-of-sunmiscunsafe.html</link><author>noreply@blogger.com (Unknown)</author><thr:total>1</thr:total></item></channel></rss>