<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/atom10full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:blogger="http://schemas.google.com/blogger/2008" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" gd:etag="W/&quot;DEAGRXg9fyp7ImA9WhBVFUQ.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631</id><updated>2013-04-21T22:12:04.667-04:00</updated><category term="clustering" /><category term="meetup" /><category term="data mining" /><category term="funny" /><category term="comic" /><category term="privacy" /><category term="structural models." /><category term="api" /><category term="bayesian" /><category term="FROC" /><category term="presentation" /><category term="academia" /><category term="psychology" /><category term="string matching" /><category term="adwords" /><category term="market for lemons" /><category term="spam" /><category term="web service" /><category term="wisdom of the crowds" /><category term="presidential elections 2008" /><category term="open access" /><category term="get-another-label" /><category term="lda" /><category term="probability" /><category term="fraud" /><category term="cfp" /><category term="humor" /><category term="frequentist" /><category term="power law" /><category term="deduplication" /><category term="visualization" /><category term="attack" /><category term="keyword bidding" /><category term="mechanical turk" /><category term="advice" /><category term="reviews" /><category term="odesk" /><category term="customer service" /><category term="outliers" /><category term="acm" /><category term="dagstuhl" /><category term="data cleaning" 
/><category term="honda" /><category term="hcomp" /><category term="prediction markets" /><category term="incentives" /><category term="human computation" /><category term="call for papers" /><category term="drm" /><category term="peer reviewing" /><category term="online advertising" /><category term="efficient markets" /><category term="interviews" /><category term="payment" /><category term="quality" /><category term="statistics" /><category term="crowdsourcing" /><category term="google" /><category term="pricing" /><category term="yahoo" /><category term="education" /><category term="aca" /><category term="newsweek" /><category term="slides" /><category term="skills" /><category term="Rudy Giuliani" /><category term="extreme value theory" /><category term="reputation" /><category term="youtube" /><category term="mind maps" /><category term="conference" /><category term="demo" /><category term="large datasets" /><category term="propublica" /><category term="ranked xml querying" /><category term="wikisynonyms" /><category term="evaluation" /><category term="cheating" /><category term="adsafe" /><category term="charity" /><category term="information extraction" /><category term="amazon" /><category term="ec2012" /><category term="wikis" /><category term="tagasauris" /><category term="industry analysis" /><category term="Mitt Romney" /><category term="embed" /><category term="teaching" /><category term="powerpoint" /><category term="computer science" /><category term="image classification" /><category term="research" /><category term="cloud computing" /><category term="trec" /><category term="reduced models" /><category term="mashape" /><category term="tutorial" /><category term="synonyms" /><category term="google spreadsheet" /><category term="wikipedia" /><category term="economics" /><category term="www2011" /><category term="minimum wage" /><category term="surveys" /><category term="csdm" /><category term="intellectual property" /><category term="search" 
/><category term="businessweek" /><category term="microsoft" /><category term="Hillary Clinton" /><category term="online labor" /><category term="independence" /><category term="readability" /><category term="machine learning" /><category term="ROC" /><category term="publishers" /><category term="cognitive dissonance" /><category term="gmail" /><category term="dirichlet" /><category term="merger" /><category term="assembly line" /><category term="typesetting" /><category term="intrade" /><title>A Computer Scientist in a Business School</title><subtitle type="html">Random thoughts of a computer scientist who is working behind the enemy lines; and lately turned into a double agent.</subtitle><link rel="http://schemas.google.com/g/2005#feed" type="application/atom+xml" href="http://www.behind-the-enemy-lines.com/feeds/posts/default" /><link rel="alternate" type="text/html" href="http://www.behind-the-enemy-lines.com/" /><link rel="next" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default?start-index=26&amp;max-results=25&amp;redirect=false&amp;v=2" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><generator version="7.00" uri="http://www.blogger.com">Blogger</generator><openSearch:totalResults>222</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/atom+xml" href="http://feeds.feedburner.com/AComputerScientistInABusinessSchool" /><feedburner:info uri="acomputerscientistinabusinessschool" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" 
/><geo:lat>40.72596</geo:lat><geo:long>-73.998345</geo:long><link rel="license" type="text/html" href="http://creativecommons.org/licenses/by/3.0/" /><logo>http://creativecommons.org/images/public/somerights20.gif</logo><feedburner:emailServiceId>AComputerScientistInABusinessSchool</feedburner:emailServiceId><feedburner:feedburnerHostname>http://feedburner.google.com</feedburner:feedburnerHostname><entry gd:etag="W/&quot;A04MQ307fCp7ImA9WhBWEE0.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6687465958846773715</id><published>2013-04-02T15:04:00.003-04:00</published><updated>2013-04-03T13:26:22.304-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-04-03T13:26:22.304-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="intrade" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="prediction markets" /><category scheme="http://www.blogger.com/atom/ns#" term="large datasets" /><title>Intrade Archive: Data for Posterity</title><content type="html">A few years back, I have done &lt;a href="http://www.ipeirotis.com/research/publications/detecting-important-events-using-prediction-markets-text-mining-and-volatility-modeling" target="_blank"&gt;some&lt;/a&gt; &lt;a href="http://www.ipeirotis.com/research/publications/modeling-dependency-in-prediction-markets" target="_blank"&gt;work on&lt;/a&gt; &lt;a href="http://www.ipeirotis.com/research/publications/modeling-volatility-in-prediction-markets" target="_blank"&gt;prediction markets&lt;/a&gt;. For this line of research, we have been collecting data from Intrade, to perform our experimental analysis. Some of the data is available through the &lt;a href="http://intrade-archive.appspot.com/" target="_blank"&gt;Intrade Archive&lt;/a&gt;, a web app that I wrote in order to familiarize myself with the Google App Engine.&lt;br /&gt;
&lt;br /&gt;
In the last few weeks, though, after the effective shutdown of Intrade, I started receiving requests for access to the data stored in the Intrade Archive. So, by popular demand, I gathered all the data from the Intrade Archive, and also all the past data that I had about all the Intrade contracts going back to 2003, and &lt;a href="https://github.com/ipeirotis/Intrade-Archive/tree/master/data" target="_blank"&gt;I put them all on GitHub&lt;/a&gt; for everyone to access and download. &lt;b&gt;The Excel file contains a description of the contracts, while the zip file contains information about all the individual trades and the daily opening and closing prices.&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
On purpose, I exclude all the Financial contracts, as the trading of these events has limited research interest. (Plus, there were too many of them.) The information from "official" stock and options exchanges has much higher volume and is a better source of information than the comparatively illiquid contracts on Intrade.&lt;br /&gt;
&lt;br /&gt;
The link to the &lt;a href="https://github.com/ipeirotis/Intrade-Archive/tree/master/data" target="_blank"&gt;GitHub repository&lt;/a&gt; is also now available from the &lt;a href="http://intrade-archive.appspot.com/" target="_blank"&gt;home page of the Intrade Archive&lt;/a&gt;. I hope that the resource-hungry crawlers can now be put to sleep, never to come back again :-)&lt;br /&gt;
&lt;br /&gt;
Enjoy!&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=5KHP8oRTM4M:Feu4BxTzW6Y:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=5KHP8oRTM4M:Feu4BxTzW6Y:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=5KHP8oRTM4M:Feu4BxTzW6Y:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=5KHP8oRTM4M:Feu4BxTzW6Y:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=5KHP8oRTM4M:Feu4BxTzW6Y:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=5KHP8oRTM4M:Feu4BxTzW6Y:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=5KHP8oRTM4M:Feu4BxTzW6Y:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/5KHP8oRTM4M" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6687465958846773715?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6687465958846773715?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/5KHP8oRTM4M/intrade-archive-data-for-posterity.html" title="Intrade Archive: Data for Posterity" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2013/04/intrade-archive-data-for-posterity.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkUGRXg-eip7ImA9WhBSGEg.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6964174497702160722</id><published>2013-02-25T22:38:00.002-05:00</published><updated>2013-02-25T22:43:44.652-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-02-25T22:43:44.652-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="wikisynonyms" /><category scheme="http://www.blogger.com/atom/ns#" term="wikipedia" /><category scheme="http://www.blogger.com/atom/ns#" term="synonyms" /><category scheme="http://www.blogger.com/atom/ns#" term="skills" /><category scheme="http://www.blogger.com/atom/ns#" term="web service" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>WikiSynonyms: Find synonyms using Wikipedia redirects</title><content type="html">&lt;div style="text-align: justify;"&gt;
Many, many years back, I &lt;a href="http://www.ipeirotis.com/research/publications/automatic-extraction-of-useful-facet-hierarchies-from-text-databases" target="_blank"&gt;worked with Wisam Dakka on a paper to create faceted interfaces for text collections&lt;/a&gt;. One of the requirements for that project was to discover synonyms for named entities. While we explored a variety of directions, the one that I liked most was Wisam's idea to use the Wikipedia &lt;b&gt;&lt;i&gt;redirects &lt;/i&gt;&lt;/b&gt;to discover terms that are mostly synonymous.&lt;/div&gt;
&lt;div&gt;
&lt;div&gt;
&lt;style type="text/css"&gt;
.gist .gist-file .gist-data {max-height: 200px;font-size:12px;line-height:13px;margin-bottom:0px;width:100%}
.gist pre{font-family:Menlo,Monaco,'Bitstream Vera Sans Mono','Courier New',monospace !important}
.gist-meta{font-family:Helvetica,Arial,sans-serif;font-size:13px !important}
.gist-meta a{color:#26a !important;text-decoration:none}
.gist-meta a:hover{color:#0e4071 !important}
&lt;/style&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
Did you know, for example, that &lt;i&gt;ISO/IEC 14882:2003&lt;/i&gt; and &lt;i&gt;X3J16 &lt;/i&gt;are synonyms of C++? Yes, me neither. However, Wikipedia reveals that through its redirect structure.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;div style="text-align: start;"&gt;
&lt;b&gt;The Wikisynonyms web service&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
What do we mean by redirects? Well, if you try to visit the Wikipedia page for &lt;a href="http://en.wikipedia.org/wiki/President_Obama" target="_blank"&gt;President Obama&lt;/a&gt;, you will be redirected to the canonical page &lt;a href="http://en.wikipedia.org/wiki/Barack_Obama" target="_blank"&gt;Barack Obama&lt;/a&gt;. Effectively "President Obama" is deemed by Wikipedians to be a close synonym of "Barack Obama", hence the redirect. Similarly, the term "Obama" is also a redirect, etc. (You can check the full list of redirects &lt;a href="http://toolserver.org/~dispenser/cgi-bin/rdcheck.py?page=Barack_Obama" target="_blank"&gt;here&lt;/a&gt;.)&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
While I was visiting oDesk, I felt that this service can be useful for a variety of purposes so, following the oDesk model, we hired a contractor to implement this synonym extraction as a web API and service. If you want to try it out please go to:&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://wikisynonyms.ipeirotis.com/search"&gt;&lt;span style="font-size: large;"&gt;http://wikisynonyms.ipeirotis.com/search&lt;/span&gt;&lt;/a&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
The &lt;a href="http://wikisynonyms.ipeirotis.com/page/api" target="_blank"&gt;API &lt;/a&gt;is very simple. Just issue a GET request like this:&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="background-color: white; color: #333333; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 14px; line-height: 20px;"&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="font-family: Courier New, Courier, monospace; font-size: small;"&gt;curl 'http://wikisynonyms.ipeirotis.com/api/&lt;b&gt;{TERM}&lt;/b&gt;'&amp;nbsp;&lt;/span&gt;&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
For example, to find synonyms for &lt;i&gt;Hillary Clinton&lt;/i&gt;:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="font-family: 'Courier New', Courier, monospace; font-size: small;"&gt;curl '&lt;a href="http://wikisynonyms.ipeirotis.com/api/Hillary_Clinton"&gt;http://wikisynonyms.ipeirotis.com/api/Hillary_Clinton&lt;/a&gt;&lt;/span&gt;&lt;span style="font-family: 'Courier New', Courier, monospace; font-size: small;"&gt;'&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="font-family: 'Courier New', Courier, monospace; font-size: small;"&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;script src="https://gist.github.com/ipeirotis/5033912.js"&gt;&lt;/script&gt;&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
and for &lt;i&gt;Obama&lt;/i&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="font-family: 'Courier New', Courier, monospace; font-size: small;"&gt;curl '&lt;a href="http://wikisynonyms.ipeirotis.com/api/Obama"&gt;http://wikisynonyms.ipeirotis.com/api/Obama&lt;/a&gt;&lt;/span&gt;&lt;span style="font-family: 'Courier New', Courier, monospace; font-size: small;"&gt;'&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;script src="https://gist.github.com/ipeirotis/5033845.js"&gt;&lt;/script&gt;&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Mashape integration&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Since we may change the URL of the service, I would recommend registering and using &lt;a href="https://www.mashape.com/" target="_blank"&gt;Mashape&lt;/a&gt; to access the &lt;a href="https://www.mashape.com/ipeirotis/wikisynonyms" target="_blank"&gt;WikiSynonyms service through Mashape&lt;/a&gt;&amp;nbsp;instead:&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="font-family: Courier New, Courier, monospace; font-size: small;"&gt;curl 'https://wikisynonyms.p.mashape.com/{TERM}' --header 'X-Mashape-Authorization: your_mashape_key'&lt;/span&gt;&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;You can &lt;i&gt;easily&lt;/i&gt; download Wikipedia&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
Interestingly enough, this synonym extraction technique remains little known, despite the ease of extracting these synonyms. And whenever I mention Wikipedia, most people are worried that they will need to scrape the HTML from Wikipedia, and nobody likes this monkey business.&amp;nbsp;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
Strangely, most people are unaware that you can download Wikipedia in a relational form and put it directly in a database. In fact, you can download only the parts that you need. Here are the basic links:&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;The Wikipedia schema is at &lt;a href="http://www.mediawiki.org/wiki/Manual:Database_layout"&gt;http://www.mediawiki.org/wiki/Manual:Database_layout&lt;/a&gt;&amp;nbsp;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;The files are at&amp;nbsp;&lt;a href="http://dumps.wikimedia.org/enwiki/latest/"&gt;http://dumps.wikimedia.org/enwiki/latest/&lt;/a&gt; and you can download each table individually.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;To implement the synonyms service, you only need to fetch the&amp;nbsp;&lt;a href="http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-redirect.sql.gz" target="_blank"&gt;redirect table&lt;/a&gt;&amp;nbsp;and the&amp;nbsp;&lt;a href="http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz" target="_blank"&gt;page table&lt;/a&gt;.&amp;nbsp;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;More instructions at &lt;a href="http://en.wikipedia.org/wiki/Wikipedia:Database_download"&gt;http://en.wikipedia.org/wiki/Wikipedia:Database_download&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
This redirect structure (as opposed, say, to the normal link structure and the related anchor text) is highly precise. By eyeballing the results, I would guess that precision is around 97% to 99%.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Application: Extracting synonyms of oDesk skills&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
One application for which we used the service was to extract synonyms for the set of skills that are used to annotate the jobs posted on oDesk. For example, you can find the synonyms for &lt;a href="http://wikisynonyms.ipeirotis.com/api/c%2B%2B" target="_blank"&gt;C++&lt;/a&gt;:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;script src="https://gist.github.com/ipeirotis/5033925.js"&gt;&lt;/script&gt;&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Or you can find the synonyms for &lt;a href="http://wikisynonyms.ipeirotis.com/api/python" target="_blank"&gt;Python&lt;/a&gt;:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;script src="https://gist.github.com/ipeirotis/5035590.js"&gt;&lt;/script&gt;&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Oops, as you see the term Python is actually ambiguous, and Wikipedia has a disambiguation page with the different 'senses' of the term. Since we are not doing any automatic disambiguation, we return a 300 HTTP response and ask the user to select one of the applicable terms. So, if we query now with the term '&lt;a href="http://wikisynonyms.ipeirotis.com/api/Python%20(programming%20language)" target="_blank"&gt;Python (programming language)&lt;/a&gt;' we get:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;script src="https://gist.github.com/ipeirotis/5035606.js"&gt;&lt;/script&gt;&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Open source and waiting for feedback&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The source code together with the installation instructions for the service is &lt;a href="https://github.com/ipeirotis/wikiSyno" target="_blank"&gt;available on GitHub&lt;/a&gt;. Feel free to point out any problems or suggestions for improvement. And thanks to &lt;a href="https://www.odesk.com/info/l/research/" target="_blank"&gt;oDesk Research&lt;/a&gt; for all the support in creating the service and making it open source for everyone to use.&lt;/div&gt;
&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Hd1tVW7URbk:8gKNbjT3DKM:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Hd1tVW7URbk:8gKNbjT3DKM:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Hd1tVW7URbk:8gKNbjT3DKM:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Hd1tVW7URbk:8gKNbjT3DKM:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Hd1tVW7URbk:8gKNbjT3DKM:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Hd1tVW7URbk:8gKNbjT3DKM:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Hd1tVW7URbk:8gKNbjT3DKM:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/Hd1tVW7URbk" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6964174497702160722?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6964174497702160722?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/Hd1tVW7URbk/wikisynonyms-find-synonyms-using.html" title="WikiSynonyms: Find synonyms using Wikipedia redirects" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2013/02/wikisynonyms-find-synonyms-using.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkUGQns-eCp7ImA9WhNaFE0.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-7521830930515273037</id><published>2013-01-28T11:26:00.002-05:00</published><updated>2013-01-28T15:17:03.550-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-28T15:17:03.550-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="api" /><category scheme="http://www.blogger.com/atom/ns#" term="mashape" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Towards a Market for Intelligence</title><content type="html">&lt;div style="text-align: justify;"&gt;
Last September, I was visiting CMU and a student asked me a question: "Do you know any crowdsourcing market, where we can assign tasks to people, as opposed to waiting for the workers to pick the tasks they want to work on?" &lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
Most crowdsourcing services do not satisfy this requirement. Mechanical Turk, oDesk, eLance, and all others typically expect the workers to express interest in a task. At most, you may be able to invite workers to participate in a task, but you cannot really assign a task to a worker.&amp;nbsp;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
A notable exception is Fiverr, which plays the supply side of the market; however, Fiverr has a different limitation: The tasks that can be performed are posted by the workers, and the employers pick from a set of existing tasks.&amp;nbsp;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
After thinking for a while, I realized that there are more such markets in which you can assign tasks to "workers". The main difference is that the "workers" are not necessarily humans, but APIs. Enter the world of &lt;a href="https://www.mashape.com/" target="_blank"&gt;Mashape&lt;/a&gt;, an API marketplace. Are you looking for someone to classify tweets according to their sentiment? &lt;a href="https://www.mashape.com/search?query=sentiment" target="_blank"&gt;Query the Mashape marketplace for sentiment analysis APIs&lt;/a&gt;&amp;nbsp;and then assign the task to the intelligence units of your choice.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
With the advent of the API marketplaces, we see the emergence of marketplaces for "intelligence units". All these intelligence units (APIs or human workers) have different levels of quality, various levels of pricing, and even various levels of capacity and responsiveness.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
With the proper abstraction, a task management platform can use and optimize these distributed intelligence units without having to worry about the "implementation details" of this intelligence.&lt;/div&gt;
&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Yu9ggPt6wcE:Ps3-VQ1jPD8:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Yu9ggPt6wcE:Ps3-VQ1jPD8:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Yu9ggPt6wcE:Ps3-VQ1jPD8:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Yu9ggPt6wcE:Ps3-VQ1jPD8:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Yu9ggPt6wcE:Ps3-VQ1jPD8:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Yu9ggPt6wcE:Ps3-VQ1jPD8:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Yu9ggPt6wcE:Ps3-VQ1jPD8:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/Yu9ggPt6wcE" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7521830930515273037?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7521830930515273037?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/Yu9ggPt6wcE/towards-market-for-intelligence.html" title="Towards a Market for Intelligence" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2013/01/towards-market-for-intelligence.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0MNQXsyfCp7ImA9WhNXF0s.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-7746216538671591721</id><published>2012-12-05T14:38:00.000-05:00</published><updated>2012-12-05T23:51:30.594-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-12-05T23:51:30.594-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="quality" /><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="reputation" /><title>Mechanical Turk changing the defaults: The game has changed</title><content type="html">&lt;div style="text-align: justify;"&gt;
Back in the summer of 2011, Mechanical Turk introduced a new type of qualification, &lt;a href="http://mechanicalturk.typepad.com/blog/2011/06/get-better-results-with-less-effort-with-mechanical-turk-masters-.html"&gt;the Mechanical Turk "Masters"&lt;/a&gt;. The Master qualification was assigned by Amazon to workers that have proven themselves in the marketplace. &lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
What exactly makes someone "proven"? This is, understandably, a well-kept secret by Amazon. The opacity of the qualification process annoys many workers: It is hard to prove that you are a Master and qualify for it, when you do not know how this qualification is granted. The rumor says that Amazon deploys decoy tasks on Mechanical Turk just to examine the performance of the workers and decide which ones to qualify as Masters. If this is correct, then it also explains why Amazon is rather secretive about the exact requirements: Workers would try to ace these test tasks, and let their guards down in others.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The existence of Masters was a good development towards creating a true reputation scheme for&amp;nbsp;Mechanical Turk.&amp;nbsp; However, an action taken by Amazon a month back has changed the dynamic of the market: &lt;b&gt;Now&amp;nbsp;&lt;/b&gt;&lt;b&gt;the &lt;i&gt;default&amp;nbsp;&lt;/i&gt;&lt;/b&gt;&lt;b&gt;requirement, for all tasks created through the UI interface,&amp;nbsp;&lt;/b&gt;&lt;b&gt;is to require using Masters workers&lt;/b&gt;&lt;b&gt;.&lt;/b&gt; Removing the requirement is done only through the "advanced" menu, and is followed by a warning that you may not get good results if you opt not to use Masters.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Tiny change? No. This is huge. Here are a few of the immediate, positive effects:&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;People who use the Web UI are typically newcomers who do not know how (or do not want) to implement sophisticated quality control schemes. They just want to execute some simple tasks. The task templates help a lot to create a usable interface, and the Masters requirement ensures that they are not going to get back crappy results. A happy customer is a long-term customer.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Masters will not touch badly designed and ambiguous tasks. This enforces discipline from the requester side, to get things designed properly. Otherwise the tasks are left untouched, which is a good signal that something is wrong with the task.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Masters will not touch offensively priced tasks, paying less than minimum wage, while demanding high-quality work. This (hurray!) removes the impression that Mechanical Turk is about dirt-cheap work and emphasizes what crowdsourcing is about: Dynamic allocation of labor on tasks, without the overhead of hiring, negotiations, etc.&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
There are, of course, a few downsides:&lt;/div&gt;
&lt;div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;There are much fewer Masters workers. A current search reveals&amp;nbsp;20,744 workers. This is at least an order of magnitude lower than the number of active workers that Amazon used to advertise. Of course, these Masters are much more active than the average worker, but still there are not enough of them for all the tasks that require them.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;There is now a significant lag in the task being picked by workers. Masters are much more careful about the requesters they work with, and a new requester will need to prove that they are not rejecting work unfairly, and that they pay on time. Until then, the task will get only a few workers willing to test it.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;The tasks now take much longer to complete. My current sense is that there is a 10x slowdown, (but the improvement in quality is definitely worth it).&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;There is an increased cost. Masters require decent wages (so no more 5 cents for 5-minutes of work), and there is an increased overhead from Amazon (30% overhead for Masters vs 10% for regular workers). My take? You get what you pay for.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;It is not clear in what tasks the Masters are tested and how a new worker can become a master. It would be great if Amazon also gets quality signals from a few reliable big requesters, but I can see many practical problems in implementing such a solution.&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
Overall though, this change in the defaults is showing that Amazon started acting on the criticism. It is clear that this is a risky move, as there will be a lot of work posted on&amp;nbsp;Mechanical Turk&amp;nbsp;that will not get done due to lack of interest for poorly paying or badly designed tasks.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
But on the other hand, it shows that Amazon is looking for the long term: Let newcomer requesters get guaranteed results, and if they want to get things done faster they can focus on pricing and better task design. If they want to get further and engage other Turkers, such requesters will be aware of the risks and benefits of such a move.&lt;br /&gt;
&lt;br /&gt;
So, effectively now we have the "novice" requesters, who get protected by default through the Masters qualification, and the "advanced" requesters that can implement their own qualification schemes to replace the Masters qualification. This default level of protection makes the life of wannabe-scammer workers very difficult: no obvious victims to attack. Just hunting down for a victim requester will become so difficult that it makes sense to just give up scamming and either convert into doing real work, or abandon the market.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
A tiny change in the defaults with short-term problems and many big, long-term benefits. Personally, I find this move exhilarating.&lt;/div&gt;
&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=B8TSkZ0kRs0:V6PJXtXpy1E:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=B8TSkZ0kRs0:V6PJXtXpy1E:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=B8TSkZ0kRs0:V6PJXtXpy1E:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=B8TSkZ0kRs0:V6PJXtXpy1E:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=B8TSkZ0kRs0:V6PJXtXpy1E:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=B8TSkZ0kRs0:V6PJXtXpy1E:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=B8TSkZ0kRs0:V6PJXtXpy1E:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/B8TSkZ0kRs0" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7746216538671591721?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7746216538671591721?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/B8TSkZ0kRs0/mechanical-turk-changing-defaults-game.html" title="Mechanical Turk changing the defaults: The game has changed" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/12/mechanical-turk-changing-defaults-game.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ck8NQHc5eyp7ImA9WhNQEks.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-7271497387596000551</id><published>2012-11-18T00:49:00.004-05:00</published><updated>2012-11-18T12:48:11.923-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-11-18T12:48:11.923-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><title>How big is Mechanical Turk?</title><content type="html">&lt;div style="text-align: justify;"&gt;
A question that people ask me very often is about the size of Mechanical Turk. How many tasks are being completed on the marketplace every day? What is the transaction volume? Let me give a quick answer: I have no idea. Since Amazon does not release any statistics about the marketplace, it is pretty much impossible to know for sure.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Mechanical Turk Tracker&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
However, I do have some estimates, mainly by using the data that I have been collecting through the Amazon &lt;a href="http://mturk-tracker.com/general/" target="_blank"&gt;Mechanical Turk Tracker&lt;/a&gt;. For those not familiar with the site, over the last four years, we have been crawling the Mechanical Turk site every few minutes and capturing the complete state of the market: What tasks are available, their prices, the number of HITs available, etc.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
One feature that we revamped lately is the ability to see the number of tasks that are posted and completed every day. &lt;a href="http://mturk-tracker.com/arrivals/" target="_blank"&gt;You can check the "Arrivals" tab&lt;/a&gt;&amp;nbsp;to see the details.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://mturk-tracker.com/arrivals/" target="_blank"&gt;&lt;img border="0" height="251" src="http://1.bp.blogspot.com/-0S_olSCGiyA/UKhuTvU1gkI/AAAAAAAAz7Y/YlZ9nwwDOnc/s400/mturk-arrivals.PNG" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Estimating HITs posted and completed&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
How do we estimate the number of tasks that get posted and completed?&amp;nbsp;The estimation is a little bit tricky and not 100% foolproof but it works reasonably well, based on my current observations.&lt;br /&gt;
&lt;br /&gt;
Since we can keep track of the history of a task over time, we can see the changes in the number of available HITs over time. For example, we may observe a task that has the following number of HITs in sequential crawls, over time:&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;div style="text-align: center;"&gt;
1000...700...500...2000...1000...100...[disappeared]&lt;/div&gt;
&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
For this task, we estimate that we have an initial posting of 1000 HITs. Then, we see 1000-700 = 300 HITs completed between the first and second crawl. Then, 700-500=200 HITs completed between the second and third crawls. However, between the third and fourth crawl we see a "refill" with 2000-500=1500 HITs, which have been posted. Then we see 2000-1000 = 1000 HITs being completed, then 1000-100=900 HITs completed, and finally the task disappears and the last 100 HITs are assumed to be completed. This generates a total of 1000+1500 HITs posted, and 300+200+1000+900+100 HITs completed.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
We do have some extra sanity tests but let's consider the current description as sufficient. For the record, I have checked with a few big requesters and my estimated numbers were pretty close to the actual ones, so I feel reasonably confident that I am not off completely.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Analyzing daily volumes&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Now, by looking at the current arrivals data, we can see that my tracker estimates approximately \$30K-\$40K of tasks completed per day. Given that I cannot observe redundancy, and that I may miss HITs that are getting posted and completed between my crawls, I may be underestimating. However, I may also be wrong by considering as "completed" tasks that were simply taken down, without being done. To be on the safe side, I will put my under-reporting factor somewhere between 1 and 10. In other words, I estimate the real daily volume to be somewhere between \$30K and \$400K. Yes, there is a huge difference between the two, but we get the order of magnitude, and you can be as pessimistic or as optimistic as you want.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;These numbers generate a yearly transaction volume for Mechanical Turk between \$10M and \$150M.&lt;/span&gt;&lt;/b&gt; Given that Mechanical Turk takes 10% to 20% as fees, this is a revenue for Amazon between \$1M (low estimate) and \$30M (high estimate) per year.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;What would be the value of Mechanical Turk as a startup?&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
I love that question. Not because it is sensible. But because I get to be completely tongue-in-cheek, and make fun of the absolutely ridiculous P/E ratio for the Amazon stock: Currently &lt;a href="http://finance.yahoo.com/q/ks?s=AMZN+Key+Statistics" target="_blank"&gt;the trailing P/E for Amazon is a wonderful 2,681&lt;/a&gt; (yep, not a typo). &lt;b&gt;Assuming that the Mechanical Turk division generates some earnings in the \$1M to \$5M range, the valuation of Mechanical Turk is somewhere between \$2 billion and \$10 billion! Not shabby for a 7-year-old startup :-p.&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
OK, getting more serious: The price-to-sales ratio for Amazon is somewhere in the 1.75 range. Therefore, given an estimated&amp;nbsp;yearly transaction volume for Mechanical Turk between \$10M and \$150M,&amp;nbsp;&lt;span style="color: #990000;"&gt;&lt;b&gt;the estimated valuation for Amazon Mechanical Turk is somewhere between \$15M (pathetic) to \$250M (respectable).&lt;/b&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;What is the growth?&lt;/b&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
While I am less certain about the numbers that have to do with the absolute transaction volume, I am much more confident about the growth numbers. Since my methodology remained the same over time, the growth of the sample should match reasonably well the growth of the overall market.&lt;br /&gt;
&lt;br /&gt;
If you go again to the Arrivals tab on Mechanical Turk Tracker, and change the date range to go back to 2009, you will be able to see how the arrivals and completions have changed over time.&lt;/div&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://mturk-tracker.com/arrivals/?date_from=01%2F01%2F2008&amp;amp;date_to=11%2F19%2F2012" target="_blank"&gt;&lt;img border="0" height="256" src="http://4.bp.blogspot.com/-8Qf_fF99eiA/UKhzYp7jQfI/AAAAAAAAz7o/jpZhsZsxeDg/s400/mturk-growth.PNG" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Forget about the absolute numbers. What is very clear is that the last few years were very good for Mechanical Turk. While the numbers were pretty low early on, there was a 3x to 6x YoY growth in terms of transaction volume. This was really healthy.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
One thing that puzzles me is what happened around March 2012. My tracker seems to detect a sudden stop in the growth. I am not quite sure what is going on there. Is there something about my crawler? Did something change on the Mechanical Turk site that caused a lower rate of completed jobs? I noticed, for example, that now Amazon puts the "Masters" qualification as a default option for all the HITs posted through the web interface. This can definitely decrease the rate of completing jobs but I am sure that it will also increase the overall level of satisfaction of the requesters with the answers submitted by the Turkers. Anyhoo, I do not have enough information, so I do not want to try to overanalyze that part.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Conclusion&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Mechanical Turk is an interesting experiment for Amazon. It is not clear how important the project is for the rest of the company and how much Jeff Bezos supports the effort after all these years. But Bezos is well-known for planning for the long term, and my (imperfect) statistics tend to confirm (tentatively) that the market is on a good path.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Let's see how things play out...&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=YQ5Cxn9y2xA:gDTCu1barfc:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=YQ5Cxn9y2xA:gDTCu1barfc:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=YQ5Cxn9y2xA:gDTCu1barfc:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=YQ5Cxn9y2xA:gDTCu1barfc:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=YQ5Cxn9y2xA:gDTCu1barfc:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=YQ5Cxn9y2xA:gDTCu1barfc:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=YQ5Cxn9y2xA:gDTCu1barfc:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/YQ5Cxn9y2xA" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7271497387596000551?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7271497387596000551?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/YQ5Cxn9y2xA/is-mechanical-turk-10-billion-dollar.html" title="How big is Mechanical Turk?" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-0S_olSCGiyA/UKhuTvU1gkI/AAAAAAAAz7Y/YlZ9nwwDOnc/s72-c/mturk-arrivals.PNG" height="72" width="72" /><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/11/is-mechanical-turk-10-billion-dollar.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Dk4DRXc6fip7ImA9WhNRGU8.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6227760719718951398</id><published>2012-11-13T23:31:00.001-05:00</published><updated>2012-11-14T15:29:34.916-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-11-14T15:29:34.916-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><title>Why I love crowdsourcing (the concept) and hate crowdsourcing (the term)</title><content type="html">The term crowdsourcing is in fashion. 
It is being used to describe pretty much everything under the sun today.&lt;br /&gt;
&lt;br /&gt;
Unfortunately, the word crowdsourcing is also getting increasingly associated with "getting things done for free", or at least at ultra-cheap prices. The "crowd" will generate the content for the website. The "crowd" will fix the mistakes. The "crowd" will do everything, and preferably for "points", for "badges", for a spot on the leaderboard, or maybe for a few pennies if we end up using Mechanical Turk.&lt;br /&gt;
&lt;br /&gt;
But this association of the term crowdsourcing with low-cost labor is now visibly turning people off. Everybody wants to "use" the crowd but the workers in the crowd feel stiffed. The &lt;a href="http://www.no-spec.com/" target="_blank"&gt;NoSpec movement&lt;/a&gt;&amp;nbsp;was an early warning. The angry tone of some of the threads in &lt;a href="http://turkernation.com/" target="_blank"&gt;Turker Nation&lt;/a&gt;&amp;nbsp;is also an indication that many workers are not very happy with the way that they are treated by some requesters.&lt;br /&gt;
&lt;br /&gt;
However, these negative associations are now endangering a very important concept: The idea that we can structure tasks in a way that makes them robust to the presence of imperfect workers, and that &lt;b&gt;anyone can participate, as long as there is work available&lt;/b&gt;. Well-structured tasks &lt;a href="http://www.behind-the-enemy-lines.com/2012/02/crowdsourcing-end-of-job-interviews.html" target="_blank"&gt;allow the on-the-task evaluation of the workers&lt;/a&gt;, and can automatically infer whether someone is a good fit for a task or not.&lt;br /&gt;
&lt;br /&gt;
This is not insignificant. It is well-known that one of the biggest barriers for breaking into the workforce is to have prior relevant experience. Students today &lt;a href="http://www.nytimes.com/2012/05/06/business/unpaid-internships-dont-always-deliver.html?pagewanted=all" target="_blank"&gt;often beg to get unpaid internships&lt;/a&gt;, just to have in their resume the lines with the coveted work experience. In online labor markets, newcomers often bid lower than what they would accept normally, just to build their feedback history. &lt;a href="http://www.behind-the-enemy-lines.com/2012/04/feedback-unemployment-and-crowdsourcing.html" target="_blank"&gt;Crowdsourcing can change that&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
But as long as crowdsourcing gets associated with low wages, nobody will see the real benefit: That work is within reach, immediately. That someone can experiment with different types of work easily (&lt;a href="http://www.collective2.com/" target="_blank"&gt;stock trading&lt;/a&gt;? &lt;a href="http://www.quirky.com/" target="_blank"&gt;product design&lt;/a&gt;?).&lt;br /&gt;
&lt;br /&gt;
Perhaps a new term can describe better the true value of crowdsourcing, and also get the stigmatizing term "crowd" out of the name. (Nobody wants to be part of a "crowd".)&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Personally, I favor the term "&lt;span style="color: #990000;"&gt;open work&lt;/span&gt;"&lt;/b&gt;. As in the case of "open access" and "open source software", it describes the opportunity to access work, without barriers. I also like the "&lt;a href="https://www.mobileworks.com/company/fair-trade-work/" target="_blank"&gt;fair trade work&lt;/a&gt;" motto from MobileWorks but this is more closely connected to work being offered to developing countries. But I think that "open work" captures better the essence of the advantages behind crowdsourcing.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;&lt;u&gt;&lt;i&gt;Update&lt;/i&gt;&lt;/u&gt;&lt;/b&gt;: The term open is indeed also associated with free-as-in-beer &lt;b&gt;&lt;i&gt;consumption&lt;/i&gt;&lt;/b&gt;. However, open can refer both to the supply-side (production) and the demand-side (consumption). For example:&lt;br /&gt;
&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;Linux is open, in the sense that anyone can take the source code, modify it, and contribute back (open production); open source software is also available, often, for free, for installation to any machine (open consumption).&amp;nbsp;&lt;/li&gt;
&lt;li&gt;In publishing, open access typically means accessing papers without paying (open consumption), but there are also journals (e.g., &lt;a href="http://www.plosone.org/" target="_blank"&gt;PLoS ONE&lt;/a&gt;) that accept pretty much any technically-valid paper (open production).&lt;/li&gt;
&lt;/ul&gt;
In the case of crowdsourcing, "open work" would refer mainly to the open production side. As in the production side of open source, and open access publishing, it does not mean that the participants are not paid for the generation of the artifacts.&lt;br /&gt;
&lt;br /&gt;
What do you think?&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=lhHBJlLm53U:4U7pxXjwxr8:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=lhHBJlLm53U:4U7pxXjwxr8:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=lhHBJlLm53U:4U7pxXjwxr8:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=lhHBJlLm53U:4U7pxXjwxr8:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=lhHBJlLm53U:4U7pxXjwxr8:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=lhHBJlLm53U:4U7pxXjwxr8:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=lhHBJlLm53U:4U7pxXjwxr8:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/lhHBJlLm53U" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6227760719718951398?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6227760719718951398?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/lhHBJlLm53U/why-i-love-crowdsourcing-concept-and.html" title="Why I love crowdsourcing (the concept) and hate crowdsourcing (the term)" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/11/why-i-love-crowdsourcing-concept-and.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEUDR344fCp7ImA9WhNTGEo.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6513120653647684166</id><published>2012-10-21T19:07:00.000-04:00</published><updated>2012-10-21T22:17:56.034-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-10-21T22:17:56.034-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="quality" /><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="get-another-label" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>New version of Get-Another-Label available</title><content type="html">I am often 
asked what type of technique I use for evaluating the quality of the workers on Mechanical Turk (or on oDesk, or ...). Do I use gold tests? Do I use redundancy?&lt;br /&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Well, the answer is that I use both. In fact, I use the code "Get-Another-Label" that I have developed together with my PhD students and a few other developers. The code is &lt;a href="https://github.com/ipeirotis/Get-Another-Label" target="_blank"&gt;publicly available on Github&lt;/a&gt;.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
We have updated the code recently, to add some useful functionality, such as the ability to pass (for evaluation purposes) the true answers for the different tasks, and get back answers about the quality of the estimates of the different algorithms.&amp;nbsp;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
So, now, if you have a task where the answers are discrete (e.g., "&lt;i&gt;is this comment spam or not?", &lt;/i&gt;or &lt;i&gt;"how many people in the photo? (a) none, (b) 1-2, (c) 3-5, (d) more than 5&lt;/i&gt;", etc) then you can use the Get-Another-Label code, which supports the following:&lt;/div&gt;
&lt;div&gt;
&lt;ul&gt;
&lt;li&gt;Allows any number of discrete categories, not just binary&lt;/li&gt;
&lt;li&gt;Allows the specification of arbitrary misclassification costs (e.g., "marking spam as legitimate has cost 1, marking legitimate content as spam has cost 5")&lt;/li&gt;
&lt;li&gt;Allows for seamless mixing of gold labels and redundant labels for quality control&lt;/li&gt;
&lt;li&gt;Estimates the quality of the workers that participate in your tasks. The metric is normalized to be between 0% for a worker that gives completely random labels, and 100% for a perfect worker.&lt;/li&gt;
&lt;li&gt;Estimates the quality of the data that are returned back by the algorithm. The metric is normalized to be 0% for data that have the same quality as unlabeled data, and 100% for perfectly labeled data.&lt;/li&gt;
&lt;li&gt;Allows the use of evaluation data, that are used to examine the accuracy of the quality control algorithms, both for the data and for the worker quality.&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
Currently, we support the vanilla majority voting, and the expectation-maximization algorithm to combine the labels assigned by the workers. We also support maximum likelihood, minimum cost, and "soft" classification schemes. In most cases, the expectation maximization together with the minimum cost classification approach tends to work best, but you can try it yourself.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
An important side-effect of reporting the estimated quality of the data, is that you can then &lt;b&gt;allocate further labeling resources in the data points that have the highest expected cost&lt;/b&gt;. Jing has done plenty of experiments and has concluded that, in the absence of any other information (e.g., who is the worker who will label the example), it is always best to focus the labeling efforts in the examples with the highest expected cost.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
I expect this version of the code to be the last iteration of the &lt;a href="https://github.com/ipeirotis/Get-Another-Label" target="_blank"&gt;GAL codebase&lt;/a&gt;. In our next step, we will transfer GAL into a web service environment, allowing for streaming, real-time estimation of worker and data quality, and also allowing for continuous labels, supporting quality-sensitive payment estimation, and many other tasks. Stay tuned: &lt;a href="http://www.project-troia.com/" target="_blank"&gt;Project-Troia&lt;/a&gt; is just around the corner.&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Vc2KS87kfpU:N8UTdcfr4Zk:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Vc2KS87kfpU:N8UTdcfr4Zk:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Vc2KS87kfpU:N8UTdcfr4Zk:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Vc2KS87kfpU:N8UTdcfr4Zk:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Vc2KS87kfpU:N8UTdcfr4Zk:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Vc2KS87kfpU:N8UTdcfr4Zk:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Vc2KS87kfpU:N8UTdcfr4Zk:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/Vc2KS87kfpU" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6513120653647684166?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6513120653647684166?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/Vc2KS87kfpU/new-version-of-get-another-label.html" title="New version of Get-Another-Label available" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/10/new-version-of-get-another-label.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkEGRnczcCp7ImA9WhNRGEs.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-808817123173713497</id><published>2012-10-20T13:46:00.001-04:00</published><updated>2012-11-13T22:43:47.988-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-11-13T22:43:47.988-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="market for lemons" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Why oDesk has no scammers</title><content type="html">&lt;div style="text-align: justify;"&gt;
So, in my last blog post, I described a brief outline on how to use oDesk to execute automatically a set of tasks, in a "Mechanical Turk" style (i.e., no interviews for hiring and completely computer-mediated process for posting a job, hiring, and ending a contract).&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
A legitimate question appeared in the comments:&lt;/div&gt;
&lt;blockquote class="tr_bq" style="text-align: justify;"&gt;
&lt;i&gt;"Well, the concept is certainly interesting. But is there a compelling reason to do microstasks on oDesk? Is it because oDesk has a rating system?&lt;/i&gt;"&lt;/blockquote&gt;
&lt;div style="text-align: justify;"&gt;
So, here is my answer: If you hire contractors on oDesk you will not run into any scammers, even without any quality control. Why is that? Is there a magic ingredient at oDesk? Short answer: Yes, there is an ingredient: &lt;b&gt;Lack of anonymity&lt;/b&gt;!&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
It is a &lt;a href="http://www.www2004.org/proceedings/docs/1p403.pdf" target="_blank"&gt;very&lt;/a&gt; &lt;a href="https://gnunet.org/sites/default/files/Tech%20Report%20-%20A%20Survey%20of%20Solutions%20to%20the%20Sybil%20Attack.pdf" target="_blank"&gt;well&lt;/a&gt;-&lt;a href="http://www.springerlink.com/content/3an0ek5gfan3dtx9/" target="_blank"&gt;known&lt;/a&gt; &lt;a href="http://onlinelibrary.wiley.com/doi/10.1111/j.1430-9134.2001.00173.x/abstract" target="_blank"&gt;fact&lt;/a&gt; that if a marketplace allows anonymous participants and cheap generation of new identities, the marketplace is going to fall victim to malicious participants. There are many examples of markets that allowed anonymity and easy generation of pseudonyms, that ultimately became "&lt;a href="http://www.behind-the-enemy-lines.com/2010/07/mechanical-turk-low-wages-and-market.html" target="_blank"&gt;market for lemons&lt;/a&gt;". Unfortunately, when you have cheap identity generation, the reputation system of the marketplace becomes extremely easy to manipulate.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
So, what is different with oDesk? oDesk has contractors that are &lt;i&gt;not anonymous&lt;/i&gt; and their userids are tied (strongly) to a real world identity (onymous?). For example, to withdraw money from oDesk into a bank account, the name in the bank account needs to match the name that is listed on oDesk. There are other mechanisms as well for verifying the identity of the contractors (e.g., when I listed myself as a contractor, I had to upload copies of my driving license, copies of my bank statements, etc), but the details of the implementation do not matter. &lt;b&gt;The key element is to make it difficult or costly to create new or false identities.&lt;/b&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
Strong identity verification pretty much eliminates any type of scam. Why? Because the scammers cannot simply shut down their account after being caught scamming and create a new one. Therefore, all the oDesk contractors with 99.9% probability will &lt;i&gt;not&lt;/i&gt; try to scam you. Now, do not get me wrong: you &lt;i&gt;are&lt;/i&gt; going to run into incompetent contractors. But there is a difference between an incompetent contractor and one that deliberately tries to scam you. &lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
As my colleague &lt;a href="http://john-joseph-horton.com/" target="_blank"&gt;John Horton&lt;/a&gt; says: "An incompetent worker who puts some effort in the task is like a bad bus driver: Very slow to take you to your destination but at least you are going towards the correct place, albeit slowly. The scammers are like the unlicensed cab drivers that take you to a random place in order to demand arbitrary fare amounts afterwards to take you to your correct destination".&lt;/div&gt;
&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=x_iszUkJj0s:RA6-7FFaSoc:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=x_iszUkJj0s:RA6-7FFaSoc:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=x_iszUkJj0s:RA6-7FFaSoc:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=x_iszUkJj0s:RA6-7FFaSoc:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=x_iszUkJj0s:RA6-7FFaSoc:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=x_iszUkJj0s:RA6-7FFaSoc:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=x_iszUkJj0s:RA6-7FFaSoc:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/x_iszUkJj0s" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/808817123173713497?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/808817123173713497?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/x_iszUkJj0s/why-odesk-has-no-spammers.html" title="Why oDesk has no scammers" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/10/why-odesk-has-no-spammers.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkQFSHY6fCp7ImA9WhNTF0Q.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-2093537147336349892</id><published>2012-10-14T23:38:00.000-04:00</published><updated>2012-10-21T00:38:39.814-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-10-21T00:38:39.814-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Using oDesk for microtasks</title><content type="html">&lt;div style="text-align: justify;"&gt;
Quite a few people keep asking me about Mechanical Turk. Truth be told, I have not used MTurk for my own work for quite some time. Instead I use oDesk to get workers for my tasks, and, increasingly, for my microtasks as well.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
When I mention that people can use oDesk for micro-tasks, people often get surprised: "oDesk cannot be used through an API, it is designed for human interaction, right?" Oh well, yes and no. Yes, most jobs require some form of interviewing, but there are certainly jobs where you do not need to manually interview a worker before engaging them. In fact, with most crowdsourcing jobs having both the training and the evaluation component built in the working process, the manual interview is often not needed.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
For such crowdsourcing-style jobs, you can use the oDesk API to automate the hiring of workers to work on your tasks. You can find the API at &lt;a href="http://developers.odesk.com/w/page/12364003/FrontPage"&gt;http://developers.odesk.com/w/page/12364003/FrontPage&lt;/a&gt; (Saying that the API page is, ahem, badly designed, is an understatement. Nevertheless, it is possible to figure out how to use it, relatively quickly, so let's move on.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Here are the typical steps for a crowdsourcing-style contract on oDesk:&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;First, post a job: Use the "Post a Job" call from the &lt;a href="http://developers.odesk.com/w/page/23873221/Jobs%20HR%20API" target="_blank"&gt;Jobs API&lt;/a&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Once the job is posted, poll the job openings to find who applied: Use the "List all the offers" call from the &lt;a href="http://developers.odesk.com/w/page/23881180/Offers%20API" target="_blank"&gt;Offers API&lt;/a&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Examine the details of the contractors that bid on the job: Use the "Get Offer" from the Offers API, to examine the details of each contractor. For example, for a task we had to have at most 10 people from a given country. So, the first 10 people from each country were hired, while subsequent applications from a country that had already 10 applicants were denied. Other people may decide not to hire contractors with less than 50 hours of prior work. It seems to be an interesting research topic to intelligently decide what aspects of the contractor matter most for a job, and hire/decline applications based on such info.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Make offers to the contractors:&lt;i&gt;&lt;span style="font-size: x-small;"&gt; [That is the stupid part: Apparently the API does not allow the buyer to simply "accept" the bid by the contractor, although this is trivially possible through the web interface].&lt;/span&gt;&lt;/i&gt; Use the "Post a Job" call, and &lt;i&gt;create a new job opening&lt;/i&gt; for the contractor. Then use the "Make Offer" call from the Offers API, to generate an offer for the contractor(s) that you want to hire.&lt;/li&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;If you &lt;b&gt;do not want to pay per hour&lt;/b&gt;, but rather per task, &lt;b&gt;create an hourly contract&lt;/b&gt;, but &lt;b&gt;set the maximum working hours per week at zero&lt;/b&gt;. Yes, this is not a mistake. You will be using the Custom Payments functionality to effectively submit "bonus payments" to the contractor.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Typically, it is better to have a mixture of both hourly wage and a fixed price component. You can have a no-hourly-wage policy by setting at 0 the maximum hours that can be charged, simulating MTurk. Or you can specify the hourly wage, and set the limit of how many hours can be charged per week.&lt;/li&gt;
&lt;/ul&gt;
&lt;li style="text-align: justify;"&gt;Direct the contractor how to work: For that use the &lt;a href="http://developers.odesk.com/w/page/12364005/Message%20Center%20API" target="_blank"&gt;Message Center API&lt;/a&gt;, to send a message to the contractor, with the URL where you host your task. &lt;i&gt;[Note: oDesk does not provide functionality for handling the task execution, so it is up to you to build that infrastructure. If you have ever built an "external HIT" on MTurk, you are ready to go. Just now you need to send the oDesk workers a url, where they can login to your website, and their username/password. You can go full force and allow an oDesk authentication, but this seems a little bit too much for me.]&lt;/i&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Whenever the contractor has completed enough tasks, use the &lt;a href="http://developers.odesk.com/w/page/25400171/Custom%20Payment%20API" target="_blank"&gt;Custom Payment API&lt;/a&gt; to submit the payment. Repeat as needed.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;When the task is done, end the contract using the &lt;a href="http://developers.odesk.com/w/page/46842954/Contracts%20API" target="_blank"&gt;contracts API&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
That's all folks! In the next few weeks, I will try to post the code for some of the crowdsourcing experiments that we conducted with oDesk.&lt;/div&gt;
&lt;/div&gt;
&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DUblxvfLTy8:hhFIFZsAZj4:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DUblxvfLTy8:hhFIFZsAZj4:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=DUblxvfLTy8:hhFIFZsAZj4:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DUblxvfLTy8:hhFIFZsAZj4:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=DUblxvfLTy8:hhFIFZsAZj4:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DUblxvfLTy8:hhFIFZsAZj4:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DUblxvfLTy8:hhFIFZsAZj4:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/DUblxvfLTy8" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2093537147336349892?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2093537147336349892?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/DUblxvfLTy8/using-odesk-for-microtasks.html" title="Using oDesk for microtasks" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/10/using-odesk-for-microtasks.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DEcARXkyfCp7ImA9WhJQFUs.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6009169204247696184</id><published>2012-07-29T08:29:00.000-04:00</published><updated>2012-07-29T08:47:24.794-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-29T08:47:24.794-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><category scheme="http://www.blogger.com/atom/ns#" term="efficient markets" /><title>The disintermediation of the firm: The feature belongs to individuals</title><content type="html">&lt;div style="text-align: justify;"&gt;&lt;b&gt;My experience with online outsourcing&lt;/b&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;I joined the Stern Business School, back in 2004. In my first couple of years, my research approach was pretty much a continuation of my PhD years: I was doing a lot of coding and experimentation myself. However, at some point I got tired of writing boring code: Crawlers, front-end websites, and other "non-research" pieces of code were not only uninteresting but were also a huge drain of time.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;So, I started experimenting with hiring coders. First locally at NYU. Unfortunately, non-research student coders turned out to be a bad choice. They were not experienced enough to write good code, and were doing this task purely for monetary reasons, not for learning. I got nothing useful out of this. Just expensive pieces of crap code.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;In summer of 2005, I started experimenting with online outsourcing. I tried eLance, Guru, and Rent-A-Coder. I tentatively started posting there programming projects that were not interesting conceptually (e.g., "crawl this website and store the data in a CSV file", "grab the CSV data from that website and put them in a database", "create a demo website that does that", etc) &lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Quickly, I realized that this was a win-win situation: The code was completed quickly, the quality of the websites was much better than what I could prepare myself, and I was free to focus on my research. Once I started getting PhD students, outsourcing non-research coding requirements became a key part of my research approach: PhD time was too valuable to waste on writing crawlers and dealing with HTML parsing peculiarities.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;&lt;b&gt;Seven years of outsourcing: Looking back&lt;/b&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Seven years have passed since my first outsourcing experiments. I thought it is now a good time to look back  and evaluate.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Across all outsourcing sites (excluding Mechanical Turk), I realized that I had posted and hired contractors for a total of 235 projects. Honestly, I was amazed by the number but amortized this is just one project per 10 days, which is reasonably close to my expectations. &lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Given the reasonably large number of projects, I thought that I may be able to do some basic quantitative analysis to figure out what patterns lead to my own, personal satisfaction with the result. I started coding the results, adding variables that were both personal (how much did I care about the project? how detailed were the specs? how much did I want to spend?) and contractor-specific (past history, country of origin, communication while bidding, etc).&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Quickly, even before I finished coding, a pattern emerged: All the "exceeded expectations" projects were done by individual contractors or small teams of 2-3 people. All the "disappointments" were with contractors that were employees of a bigger contracting firm.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;In retrospect, it is a matter of incentives:  The employees do not have the incentive to produce to the maximum of their labor power. In contrast, individuals with their own company, produce much closer to their maximum capacity; the contractor-owners are also connected to the product of their work, and they are better workers overall.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;I would not attribute causality to my observation but rather self-selection: Individuals that are knowledgeable understand that the bigger firm does not have much to offer. In the past, the bigger firm was fulfilling the role of being visible and, therefore, bringing projects; the firm also offers a stable salary but for talented individuals this quickly becomes a stagnating salary. &lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;With the presence of online marketplaces, the need to have a big firm to get jobs started decreasing. Therefore, the talented contractors do not really need a bigger firm to bring the work. &lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;The capable individuals disintermediate the firm.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;&lt;b&gt;The emergence of the individual&lt;/b&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Although the phenomenon is still in its infancy, I expect to see the rise of individuals and the emergence of small teams to be an important trend in the next few years. The bigger firms will feel the increased pressure from agile teams of individuals that can operate faster and get things done quicker. Furthermore, talented individuals, knowing that they can find good job prospects online, will start putting higher pressure on their employers: Either there is a more equitable share of the surplus, or the value-producing individuals will move into their own ventures.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Marx would have been proud: The value-generating labor is now getting into the position of reaping the value of the generated work. Ironically, this "emancipation" is happening through the introduction of capitalist free markets that connect the planet, and not through a communist revolution.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DZxuiTAMHd0:IKdLSlm0_so:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DZxuiTAMHd0:IKdLSlm0_so:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=DZxuiTAMHd0:IKdLSlm0_so:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DZxuiTAMHd0:IKdLSlm0_so:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=DZxuiTAMHd0:IKdLSlm0_so:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DZxuiTAMHd0:IKdLSlm0_so:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=DZxuiTAMHd0:IKdLSlm0_so:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/DZxuiTAMHd0" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6009169204247696184?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6009169204247696184?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/DZxuiTAMHd0/the-disintermediation-of-firm.html" title="The disintermediation of the firm: The feature belongs to individuals" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/07/the-disintermediation-of-firm.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DU8DQnY-cSp7ImA9WhJXEEw.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-2447525833509884189</id><published>2012-07-20T22:59:00.000-04:00</published><updated>2012-08-03T14:17:53.859-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-08-03T14:17:53.859-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="statistics" /><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>On Retention Rates</title><content type="html">&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;
&lt;div&gt;
&lt;span style="background-color: white;"&gt;I spent this week in Suncadia, a small resort near Seattle, in the amazing workshop on&amp;nbsp;&lt;/span&gt;&lt;a href="http://www.cs.washington.edu/mssi/2012/" style="background-color: white;" target="_blank"&gt;Crowdsourcing Personalized Education&lt;/a&gt;&lt;span style="background-color: white;"&gt;, organized&lt;/span&gt;&lt;span style="background-color: white;"&gt;&amp;nbsp;by&amp;nbsp;&lt;/span&gt;&lt;a href="http://www.cs.washington.edu/people/faculty/weld" style="background-color: white;"&gt;Dan Weld&lt;/a&gt;&lt;span style="background-color: white;"&gt;,&amp;nbsp;&lt;/span&gt;&lt;a href="http://www.cs.washington.edu/mssi/2012/www.cs.washington.edu/people/faculty/mausam" style="background-color: white;"&gt;Mausam&lt;/a&gt;&lt;span style="background-color: white;"&gt;,&amp;nbsp;&lt;/span&gt;&lt;a href="http://research.microsoft.com/~horvitz/" style="background-color: white;"&gt;Eric Horvitz&lt;/a&gt;&lt;span style="background-color: white;"&gt;, and&amp;nbsp;&lt;/span&gt;&lt;a href="http://research.microsoft.com/en-us/um/people/merrie/" style="background-color: white;"&gt;Meredith Ringel Morris&lt;/a&gt;&lt;span style="background-color: white;"&gt;. (The &amp;nbsp;slides should be available online soon.) It was an amazing workshop, and the quality of the projects that were presented was simply exceptional.&lt;/span&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Beyond the main topic of the workshop (online education and crowdsourcing), I noticed one measure of success being mentioned by multiple projects:&amp;nbsp;&lt;b&gt;Retention&lt;/b&gt;.&amp;nbsp;&lt;span style="background-color: white;"&gt;Retention is typically defined as the number of users that remain active, compared to the total number of users.&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: -webkit-auto;"&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
How exactly to define retention is a tricky issue. What is the definition of an "active" user? What is the total number of users? You can manipulate the number to give you back something that looks nice.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
For example, for online courses many people list the number of registered participants as users (e.g.,&amp;nbsp;&lt;a href="http://www.nytimes.com/2012/04/18/technology/coursera-plans-to-announce-university-partners-for-online-classes.html" target="_blank"&gt;160,000 students&lt;/a&gt;&amp;nbsp;enrolled for the AI class). Then, if you take the 22,000 students that graduated as active, you get a retention rate of 13.75%.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Of course, if you want to make the number higher (13.75% seems low) you can just change the definition of what counts as a user (e.g., "watched at least one video") and decrease the denominator, or change the definition of active user (e.g., "submitted an assignment") and increase the numerator.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
A relatively common definition is the number of users that come back at least once a week, divided by the number of users registered in that time period. At least Duolingo, Foldit, and a few other projects seemed to have a similar definition.&amp;nbsp;&lt;span style="background-color: white;"&gt;With this definition, a number of 20% and above is typically considered successful, as this was also noted to be the retention rate for Twitter.&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;b&gt;How to measure retention in online labor?&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;&lt;br /&gt;
So, I started wondering what is the appropriate measure of retention for online labor sites. The "come back at least once every week" definition is a weak one. We need people to engage with the available tasks.&lt;br /&gt;
&lt;br /&gt;
One idea is to measure the percentage of users that earn &amp;gt;X dollars per week. To avoid comparing workers with different lifetimes, it is a good practice to compare users that started at the same time (e.g., the "May 2012 cohort") and see the retention rates stratified by cohort.&amp;nbsp;The problem with the previous metric is that you need to examine it &amp;nbsp;not only for different cohorts but also for different values of X.&lt;br /&gt;
&lt;br /&gt;
An alternative approach is to examine the "hours worked per week". In that case, we need to examine what percentage of the 40-hour working week is captured by the labor site.&lt;br /&gt;
&lt;br /&gt;
Say that we have 500,000 registered users and we observe that at any given time we have 5,000 of them active on the site. (These are commonly quoted numbers for Mechanical Turk.) What does this 1% activity mean?&lt;br /&gt;
&lt;br /&gt;
First, we need to see what a good comparative metric is. Suppose that full success is that all 500,000 workers come and work full time. In that case, we can expect an average activity level of (40*50)/(24*365)=22.8% (40 is the total working hours in a week, 50 is the working weeks in a year, and 24*365 is the total number of hours in a year). So an average of 22.8% of activity is the maximum attainable; to keep things simpler, we can say that seeing on average 20% of the users working on the site is perfect.&lt;br /&gt;
&lt;br /&gt;
So, if a site has an average of 1% activity, it is not as bad as it sounds. It means that 1 out of 20 registered users actually work full time on the site.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=2y0bfcA_jGQ:IPgbnc_XSmw:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=2y0bfcA_jGQ:IPgbnc_XSmw:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=2y0bfcA_jGQ:IPgbnc_XSmw:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=2y0bfcA_jGQ:IPgbnc_XSmw:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=2y0bfcA_jGQ:IPgbnc_XSmw:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=2y0bfcA_jGQ:IPgbnc_XSmw:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=2y0bfcA_jGQ:IPgbnc_XSmw:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/2y0bfcA_jGQ" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2447525833509884189?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2447525833509884189?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/2y0bfcA_jGQ/on-retention-rates.html" title="On Retention Rates" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/07/on-retention-rates.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEQMSHczeip7ImA9WhNXFks.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-331320887937320166</id><published>2012-07-13T16:59:00.000-04:00</published><updated>2012-12-04T18:06:29.982-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-12-04T18:06:29.982-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="humor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Why is oDesk called oDesk?</title><content type="html">[This post has been removed after a request from a company. You will need to stay in the dark until the Singularity.]&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=L3dUDwRbn_A:AEB0hGpLp1s:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=L3dUDwRbn_A:AEB0hGpLp1s:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=L3dUDwRbn_A:AEB0hGpLp1s:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=L3dUDwRbn_A:AEB0hGpLp1s:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=L3dUDwRbn_A:AEB0hGpLp1s:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=L3dUDwRbn_A:AEB0hGpLp1s:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=L3dUDwRbn_A:AEB0hGpLp1s:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/L3dUDwRbn_A" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/331320887937320166?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/331320887937320166?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/L3dUDwRbn_A/why-odesk-is-called-odesk.html" title="Why is oDesk called oDesk?" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/07/why-odesk-is-called-odesk.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEMFQHo-eip7ImA9WhJSGEU.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-7106087953624743799</id><published>2012-07-09T20:59:00.000-04:00</published><updated>2012-07-09T21:06:51.452-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-09T21:06:51.452-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="quality" /><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Discussion on Disintermediating a Labor Channel</title><content type="html">&lt;div 
style="text-align: justify;"&gt;Last Friday, I wrote a short blog post with the title "&lt;a href="http://www.behind-the-enemy-lines.com/2012/07/disintermediating-labor-channel-does-it.html"&gt;Disintermediating a Labor Channel: Does it Make Sense?&lt;/a&gt;" where I argued that trying to bypass a labor channel (Mechanical Turk, oDesk, etc) in order to save on the extra fees does not make much sense.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Despite the fact that there was no discussion in the comments, that piece seemed to generate a significant amount of feedback, across various semi-private channels (fb/plus/twitter) and in many real-life discussions.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Fernando Pereira wrote on Google Plus:&lt;/div&gt;&lt;blockquote class="tr_bq" style="text-align: justify;"&gt;Your argument sounds right, but I'm wondering about quality: can I control quality/biases in the outside labor platform? How do I specify labor platform requirements to meet &lt;b&gt;&lt;i&gt;my &lt;/i&gt;&lt;/b&gt;requirements? It could be different from quality control for outsourced widgets because outsourced labor units might be interdependent, and thus susceptible to unwanted correlation between workers.?&lt;br /&gt;
&lt;/blockquote&gt;&lt;div style="text-align: justify;"&gt;Another friend wrote in my email:&lt;/div&gt;&lt;blockquote class="tr_bq" style="text-align: justify;"&gt;So, do you advocate that oDesk should be controlling the process? Actually, I'd rather have higher control over my employees and know who is doing what.&lt;br /&gt;
&lt;/blockquote&gt;&lt;div style="text-align: justify;"&gt;Both questions have similar flavor, and it indicates that I failed in expressing my true thoughts on the issue.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;I do &lt;b&gt;&lt;i&gt;not &lt;/i&gt;&lt;/b&gt;advocate giving up control of the "human computation" process. I advocate letting a third-party platform handle the "low level" recruiting and payment of the workers, preferably through an API-fied process. Payments, money transfer regulations, and immigration are big tasks that are best handled by specialized platforms. They are too much for most other companies. Handling such things on your own is as interesting as handling issues like air conditioning, electricity supply, and failed disks and motherboards when you are building a software application: Let someone else do these things for you.&lt;/div&gt;&lt;br /&gt;
&lt;hr style="background-color: #691f01; border-bottom-width: 1px; border-color: initial; border-left-width: 1px; border-right-width: 1px; border-style: initial; border-top-width: 1px; color: #691f01; display: block; height: 2px; text-align: justify;" width="50%" /&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;One useful classification that I think will clarify further my argument. Consider the different "service models" for crowdsourcing, which &lt;a href="http://www.behind-the-enemy-lines.com/2012/03/unofficial-nist-definition-of-cloud.html" target="_blank"&gt;I have adapted from the NIST definition of cloud services&lt;/a&gt;.&lt;/div&gt;&lt;blockquote class="tr_bq"&gt;&lt;ul&gt;&lt;li style="text-align: justify;"&gt;&lt;b&gt;Labor Applications/Software as a Service (LSaaS)&lt;/b&gt;. The capability provided to the client is to use the provider’s applications running on a cloud-labor infrastructure. [...] The client does not manage or control the underlying cloud labor, with the possible exception of limited user-specific application configuration settings. Effectively, the client only cares about the quality of the provided results of the labor and does not want to know about the underlying workflows, quality management, etc. [Companies like &lt;a href="http://castingwords.com/" target="_blank"&gt;CastingWords&lt;/a&gt; and &lt;a href="http://www.utest.com/" target="_blank"&gt;uTest&lt;/a&gt; fall into this category: They offer a vertical service, which is powered by the crowd, but the end client typically only cares about the result]&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Labor Platform as a Service (LPaaS)&lt;/b&gt;. The capability provided to the client is to deploy onto the labor pool consumer-created or acquired applications created using programming languages and tools supported by the provider. The client does not manage or control the underlying labor pool, but has control of the overall task execution, including workflows, quality control, etc. The platform provides the necessary infrastructure to support the generation and implementation of the task execution logic. [Companies like &lt;a href="https://gethumanoid.com/" target="_blank"&gt;Humanoid&lt;/a&gt; fall into this category: Creating a platform for other people to build their crowd-powered services on top.]&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Labor Infrastructure as a Service (LIaaS)&lt;/b&gt;. The capability provided to the client is to provision labor for the client, who then allocates workers to tasks. The consumer of labor services does not get involved with the recruiting process or the details of payment, but has full control over everything else. Much like the Amazon Web Services approach (use EC2, S3, RDS, etc. to build your app), the service provider just provides raw labor and guarantees that the labor force satisfies a particular SLA (e.g., response time within X minutes, has the skills that are advertised in the resume, etc) [Companies like &lt;a href="https://www.mturk.com/mturk/welcome" target="_blank"&gt;Amazon Mechanical Turk&lt;/a&gt;, &lt;a href="https://www.odesk.com/" target="_blank"&gt;oDesk&lt;/a&gt;, etc. fall into this category] &lt;/li&gt;
&lt;/ul&gt;&lt;/blockquote&gt;&lt;div style="text-align: justify;"&gt;From these definitions, I believe that it does not make sense to build your own "infrastructure" if you are going to rely on remote workers.&lt;span style="font-size: x-small;"&gt; (I have a very different attitude for creating an in-house, &lt;i&gt;&lt;b&gt;local&lt;/b&gt;&lt;/i&gt;, team of workers that provides the labor, but this gets very close to being a traditional temp agency, so I do not treat this as crowdsourcing.)&lt;/span&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;I have no formed opinion on the "platform as a service" or a "software as a service" model (yet). &lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;For the software as a service model, I think it is up to you to decide whether you like the output of the system (transcription, software testing, etc). The crowdsourcing part is truly secondary.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;For the platform as a service model, I do not have enough experience with existing offerings to know whether to trust the quality assurance scheme. &lt;span style="font-size: x-small;"&gt;(Usual cognitive bias of liking-best-what-you-built-yourself applies here.)&lt;/span&gt; Perhaps in a couple of years, it would make no sense to build your own quality assurance scheme. But at this point, I think that we are all still relying on bespoke, custom-made schemes, with no good argument to trust a standardized solution offered by a third-party.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=jUaqBtpuCMA:pTfEVjVcpBE:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=jUaqBtpuCMA:pTfEVjVcpBE:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=jUaqBtpuCMA:pTfEVjVcpBE:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=jUaqBtpuCMA:pTfEVjVcpBE:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=jUaqBtpuCMA:pTfEVjVcpBE:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=jUaqBtpuCMA:pTfEVjVcpBE:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=jUaqBtpuCMA:pTfEVjVcpBE:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/jUaqBtpuCMA" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7106087953624743799?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7106087953624743799?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/jUaqBtpuCMA/discussion-on-disintermediating-labor.html" title="Discussion on Disintermediating a Labor Channel" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/07/discussion-on-disintermediating-labor.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkICQng-fSp7ImA9WhJSFkQ.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-5973618259325710404</id><published>2012-07-06T21:03:00.000-04:00</published><updated>2012-07-07T18:02:43.655-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-07T18:02:43.655-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="amazon" /><category scheme="http://www.blogger.com/atom/ns#" term="advice" /><category scheme="http://www.blogger.com/atom/ns#" term="cloud computing" /><category 
scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Disintermediating a Labor Channel: Does it Make Sense?</title><content type="html">&lt;div style="text-align: justify;"&gt;
Over the years, I have talked with plenty of startups on building crowdsourcing services and platforms. Mechanical Turk (for the majority) and oDesk (for the cooler kids :-p) are common choices for recruiting workers. &lt;a href="http://www.behind-the-enemy-lines.com/2012/02/mturk-vs-odesk-my-experiences.html" target="_blank"&gt;(For a comparison of the two, based on my personal experiences, look here&lt;/a&gt;.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
A common aspiration of many startups is to be able to build their own labor force and channel. Through Facebook, through cell phone, through ads, everyone wants to have direct control of the labor.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;b&gt;My reaction: This is stupid!&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;
&lt;i&gt;&lt;span style="font-size: x-small;"&gt;(Usual disclaimer that I work for oDesk for this year, etc., applies here, but I will stand behind my opinion even without any relationship to any labor marketplace.)&lt;/span&gt;&lt;/i&gt;&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
A very short-sighted reason for this is cost savings: oDesk and Mechanical Turk have a 10% fee. Therefore by disintermediating the labor platform, the company can save 10% of the labor cost. Well, to immediately make the adjustment, y&lt;span style="background-color: white;"&gt;ou do not save 10%. You save maximum 7%. The other 3% is the fee that will be taken by the payment channel (credit card, paypal, etc). The fact that the cost is borne by the worker when using Paypal is not true savings. For foreign workers, you also have a 1%-2% hit when using Paypal or a credit card, which goes on top of the best FX exchange rates. Add extra overhead to handle fraud, mistakes, and other things-that-happen, and the true savings are at most 5% to 6%.&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
But even 5%, isn't that something worth saving? No.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Problem #1: &amp;nbsp;If you are a small startup&lt;/b&gt;, saving 5% in labor costs should not be the goal. Just the cost of developing, managing, and handling complaints about payment is going to cost much more of development time than the corresponding savings. Creating a payment network is typically not at the core of a crowdsourcing startup, and it should not be. Let others deal with the payment and build your product.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Problem #2: If you are a bigger company&lt;/b&gt;, saving 5% in labor costs may be more important. However, if we are talking about labor, then bigger companies start hitting compliance issues. Handling money laundering regulations, handling IRS regulations, and many other HR-related aspects are typically worth the 5% extra. Who wants to be in the HR business if they have a product that is doing something else?&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
So, why do people still obsess about this? Why does everyone want to build their own labor platform?&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Well, because VC's ask for this. "&lt;i&gt;If you are building on top of MTurk/oDesk/whatever, what is your competitive advantage? What prevents others from duplicating what you have done?&lt;/i&gt;"&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The knee-jerk reaction to this demand from VC's is to build a bespoke labor network. Which works fine, as long as you are talking about a relatively-small sized network. Once the size of the labor force becomes bigger, then other problems appear: Identity verification, compliance, regulations, immigration, are all tasks that are time consuming. (Especially when dealing with foreign contractors.) And they are never tasks that add value to the company. They are all pure overhead and solving such issues is absolutely non-trivial.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Do you think it is accidental that Amazon does not pay in cash the MTurk contractors outside India and US? Having seen from the inside at oDesk what is the overhead to build reliable and compliant solutions for handling international payments, I can easily say: Stay away, this is not something you want to do &lt;i&gt;at scale, &lt;/i&gt;having to deal with bureaucrats from all different countries around the world.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;The parallel with building your own data centers vs getting computing resources from the cloud is direct and should be evident&lt;/b&gt;. Unless there is a very good reason to handle your own machines (and space, and air conditioning, and handling electrical failures over the summer, etc.), you just build your infrastructure using the cloud. &amp;nbsp;&lt;span style="background-color: white;"&gt;Same thing with labor.&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Allocating resources to handle overhead tasks is taking away resources from the main goal: Building a better product! Let others take care of infrastructural issues.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=VPwfqpMzDEI:q9-ThqFbQBo:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=VPwfqpMzDEI:q9-ThqFbQBo:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=VPwfqpMzDEI:q9-ThqFbQBo:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=VPwfqpMzDEI:q9-ThqFbQBo:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=VPwfqpMzDEI:q9-ThqFbQBo:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=VPwfqpMzDEI:q9-ThqFbQBo:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=VPwfqpMzDEI:q9-ThqFbQBo:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/VPwfqpMzDEI" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/5973618259325710404?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/5973618259325710404?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/VPwfqpMzDEI/disintermediating-labor-channel-does-it.html" title="Disintermediating a Labor Channel: Does it Make Sense?" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/07/disintermediating-labor-channel-does-it.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CUMMQH8zeSp7ImA9WhJSEkg.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-8144257333915395071</id><published>2012-07-02T08:00:00.000-04:00</published><updated>2012-07-02T14:24:41.181-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-02T14:24:41.181-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="visualization" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Visualizations of the oDesk "oConomy"</title><content type="html">&lt;em&gt;[&lt;a href="https://www.odesk.com/blog/2012/07/visualizationsoftheoconomy/" target="_blank"&gt;Crossposted from the oDesk Blog&lt;/a&gt;. 
Blog post written together with&amp;nbsp;&lt;a href="https://sites.google.com/site/johnjosephhorton/" target="_blank" title="John J. Horton"&gt;John Horton&lt;/a&gt;.]&lt;/em&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
A favorite pastime of the &lt;a href="http://research.odesk.com/" target="_blank" title="oDesk Research"&gt;oDesk Research Team&lt;/a&gt; is to run analyses using data from oDesk’s database in order to provide a better understanding of oDesk’s online workplace and the way the world works. Some of these analyses were so interesting we started sharing them with the &lt;a href="http://news.ycombinator.com/item?id=3794600" target="_blank"&gt;general&lt;/a&gt; &lt;a href="http://news.ycombinator.com/item?id=3609445" target="_blank"&gt;public&lt;/a&gt;, and &lt;a href="http://onlinelabor.blogspot.com/2012/02/high-wage-skills-on-odesk-or-why-you.html" target="_blank"&gt;posted them online&lt;/a&gt; &lt;a href="http://www.behind-the-enemy-lines.com/2012/04/when-is-world-working-odesk-edition-or.html" target="_blank"&gt;for the world&lt;/a&gt; &lt;a href="http://www.behind-the-enemy-lines.com/2012/05/emergence-of-teams-in-online-work.html" target="_blank"&gt;to see&lt;/a&gt;.&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
Deep inside, however, we were not happy with our current approach. All our analyses and plots were static. We wanted to share something more interactive, using one of the newer javascript-based visualization packages. So, we posted a job on oDesk looking for d3.js developers and found &lt;a href="https://www.odesk.com/users/Javascript-and-Viking_~~3855ac374aa9a2ed" target="_blank"&gt;Zack Meril&lt;/a&gt;, a tremendously &lt;a href="http://zacharymaril.com/" target="_blank"&gt;talented&lt;/a&gt; Javascript developer. Zack took our ideas and built a great tool for everyone to use:&lt;/div&gt;
&lt;br /&gt;
&lt;div dir="ltr" style="text-align: center;"&gt;
&lt;strong&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/" target="_blank" title="The oDesk Country Dashboard"&gt;The oDesk Country Dashboard&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div dir="ltr" style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard"&gt;&lt;img alt="The oDesk Country Dashboard" class="aligncenter size-medium wp-image-25426" height="317" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/frontpage-480x317.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
This dashboard allows you to interactively explore the world of work based upon oDesk’s data. We list below some of our favorite discoveries from playing with its visualizations. Do let us know if you find something interesting. Note that the tool supports “deep linking,” which means that the URL in your address bar fully encodes the view that you see.&lt;/div&gt;
&lt;br /&gt;
&lt;strong&gt;Visualization #1: Global Activity &lt;/strong&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
The first interactive visualization shows the level of contractor activity of different countries across different days of the week and times of day. The pattern seems pretty “expected”:&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/63" rel="attachment wp-att-25377" target="_blank"&gt;&lt;img alt="" class="aligncenter size-medium wp-image-25377" height="186" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/weekly-activity-480x186.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
On second thought, though, we started wondering. Why do we see such regularity? The x-axis is GMT time. Given that oDesk is a global marketplace, shouldn’t the contractor activity be smoother? Furthermore, oDesk has a relatively small number of contractors from Western Europe, so it seems kind of strange that our contractor community generally follows the waking and sleeping patterns of the UK. Investigating closer, if you hover around the visualization, you get a closer look at what contractors are doing throughout the world:&lt;/div&gt;
&lt;br /&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/56" target="_blank" title="Activity Map for Wed, 8:00-9:00 GMT"&gt;At 8am GMT on Wednesday morning&lt;/a&gt;: Russia, India, and China are awake and their activity is increasing.&lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/56" rel="attachment wp-att-25390" target="_blank" title="Activity Map for Wed, 8:00-9:00 GMT"&gt;&lt;img alt="" class="size-medium wp-image-25390 aligncenter" height="319" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/watch-wed-8am-480x319.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
As we move towards &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/63" target="_blank" title="Activity Map for Wed, 15:00-16:00 GMT"&gt;the peak of the global activity at 3pm&lt;/a&gt;, the activity of the Asian countries has already started declining. However, at the same time North and Latin America start waking up, compensating for the decrease in activity in Asia, and leading to the world peak.&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/63" target="_blank" title="Activity Map for Wed, 15:00-16:00 GMT"&gt;&lt;img alt="" class="size-medium wp-image-25391 aligncenter" height="322" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/watch-wed-3pm-480x322.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
After 4pm GMT, Asia starts going to sleep, and the activity decreases. The activity continues to decline as America signs off, &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/76" target="_blank" title="Activity Map for Thu, 4:00-5:00 GMT"&gt;hitting the low point of activity at 4am GMT&lt;/a&gt; (but notice how China, Philippines, and Australia start getting active, preventing the activity level from going to zero).&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/watch/false/76" target="_blank" title="Activity Map for Thu, 4:00-5:00 GMT"&gt;&lt;img alt="" class="size-medium wp-image-25392 aligncenter" height="318" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/watch-thu-4am-480x318.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;strong&gt;Visualization #2: Country-Specific Activity&lt;/strong&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
A few weeks back, we also wrote about the rather &lt;a href="http://www.behind-the-enemy-lines.com/2012/04/when-is-world-working-odesk-edition-or.html" target="_blank"&gt;unusual working pattern of the Philippines&lt;/a&gt;: contractors from the Philippines tend to keep a schedule that mostly follows U.S. working hours, rather than a “normal” 9-5 day. Since then, we have realized that the Philippines is not the only country following this pattern. For example, &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Bangladesh/Philippines/Indonesia/" target="_blank" title="Activity patterns for Philippines, Indonesia, and Bangladesh"&gt;Bangladesh and Indonesia&lt;/a&gt; have similar activity patterns to the Philippines. So, we thought, why not make it easy to explore and find working patterns? They reveal something about the culture, habits, and even type of work that gets done in these countries. A few findings of interest:&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Bolivia/" target="_blank" title="Activity Pattern for Bolivia"&gt;Bolivia goes for lunch from 1pm-2pm and then comes back to work.&lt;/a&gt;&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Bolivia/" rel="attachment wp-att-25362" target="_blank" title="Work Activity Pattern for Bolivia"&gt;&lt;img alt="Work Activity Pattern for Bolivia" class="aligncenter size-medium wp-image-25362" height="171" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/activity-bolivia-480x171.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Spain/" target="_blank" title="Activity Pattern for Spain"&gt;Spain either goes for lunch a little later, or Spaniards have a siesta between 2pm-4pm.&lt;/a&gt;&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Spain/" target="_blank" title="Work Activity Pattern for Spain"&gt;&lt;img alt="Work Activity Pattern for Spain" class="aligncenter size-medium wp-image-25365" height="173" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/activity-spain-480x173.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Israel/" target="_blank" title="Activity Pattern for Israel"&gt;Israel has its weekend on Friday and Saturday, with activity going up on Sunday&lt;/a&gt;.&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Israel/" target="_blank" title="Activity Pattern for Israel"&gt;&lt;img alt="Activity Pattern for Israel" class="aligncenter size-medium wp-image-25364" height="175" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/activity-israel-480x175.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;br /&gt;
&lt;strong&gt;Visualization #3: Work Type By Country&lt;/strong&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
Finally, we wondered “What are the factors that influence these working patterns?” Why do some culturally similar countries have very similar working patterns (e.g., &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Russia/Ukraine/" target="_blank"&gt;Russia and Ukraine&lt;/a&gt;), while others have very different patterns (e.g., &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false//India/Bangladesh/" target="_blank"&gt;Pakistan, Bangladesh, and India&lt;/a&gt;)? So, with our third visualization we examine types of work completed on oDesk broken down by country. We used the &lt;a href="http://mbostock.github.com/d3/ex/bubble.html" target="_blank"&gt;bubble chart from d3.js&lt;/a&gt; to visualize the results. Here is, for example, the breakdown for U.S.:&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/United States" rel="attachment wp-att-25373" target="_blank"&gt;&lt;img alt="" class="aligncenter size-medium wp-image-25373" height="414" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-USA-480x414.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
U.S. contractors are mainly working in tasks related to writing. We do see many clients explicitly limit their search for writing contractors to U.S.-based only, both for English proficiency but also (and perhaps more importantly) for the cultural affinity of the writers to their audience. Take a look at Russia: Almost all the &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Russia" target="_blank"&gt;work done in Russia&lt;/a&gt; is Web programming and design, followed by mobile and desktop development.&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Russia" target="_blank" title="Types of projects completed on oDesk by Russian contractors"&gt;&lt;img alt="" class="aligncenter size-medium wp-image-25371" height="362" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-Russia-480x362.png" width="480" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
At the opposite end is the &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Philippines" target="_blank"&gt;Philippines&lt;/a&gt;, where few programming tasks are being completed, but significant amounts of data entry, graphic design, and virtual assistant work happen:&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Philippines" target="_blank" title="Types of projects completed on oDesk by Filipino contractors"&gt;&lt;img alt="" class="aligncenter  wp-image-25369" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-Philippines-480x380.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
Another interesting example is &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Kenya" target="_blank"&gt;Kenya&lt;/a&gt;. As you can see, most of the work done there (and there is a significant amount of work done in Kenya) is about blog and article writing:&lt;/div&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Kenya" rel="attachment wp-att-25368" target="_blank" title="Types of projects completed on oDesk by Kenyan contractors"&gt;&lt;img alt="" class="aligncenter  wp-image-25368" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-Kenya-480x367.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;strong&gt;Exploring Further: Activity Patterns and Types of Projects &lt;/strong&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
One pattern that was not directly obvious was the correlation between activity patterns and type of work. Countries that are engaging mainly in computer programming tend to have a larger fraction of users that use oDesk. For example, see the similarity in the activity patterns of &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Russia/Bolivia/Poland///Ukraine/" target="_blank"&gt;Bolivia, Poland, Russia, and Ukraine&lt;/a&gt;, and the corresponding project types that get completed in these countries:&lt;/div&gt;
&lt;br /&gt;
&lt;table class="aligncenter"&gt;&lt;tbody&gt;
&lt;tr&gt;  &lt;td&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Russia" target="_blank"&gt;&lt;img alt="" class="aligncenter  wp-image-25371" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-Russia-150x150.png" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;div dir="ltr" style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Russia" target="_blank"&gt;Russia&lt;/a&gt;&lt;/div&gt;
&lt;/td&gt;  &lt;td&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Bolivia" target="_blank"&gt;&lt;img alt="" class="aligncenter size-thumbnail wp-image-25366" height="150" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-.Bolivia-150x150.png" width="150" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;div dir="ltr" style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Bolivia" target="_blank"&gt;Bolivia&lt;/a&gt;&lt;/div&gt;
&lt;/td&gt;  &lt;td&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Ukraine" target="_blank"&gt;&lt;img alt="" class="aligncenter size-thumbnail wp-image-25372" height="150" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-Ukraine-150x150.png" width="150" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;div dir="ltr" style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Ukraine" target="_blank"&gt;Ukraine&lt;/a&gt;&lt;/div&gt;
&lt;/td&gt;  &lt;td&gt;&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Poland" target="_blank"&gt;&lt;img alt="" class="aligncenter  wp-image-25370" src="https://www.odesk.com/blog/wp-content/uploads/2012/06/bubble-chart-Poland-150x150.png" /&gt;&lt;/a&gt;&lt;br /&gt;
&lt;div dir="ltr" style="text-align: center;"&gt;
&lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/bubble/Poland" target="_blank"&gt;Poland&lt;/a&gt;&lt;/div&gt;
&lt;/td&gt;  &lt;/tr&gt;
&lt;/tbody&gt; &lt;/table&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
We should note, however, that the opposite does not hold: There are other countries that have similar activity patterns and a high degree of contractor stickiness (e.g., &lt;a href="http://research.odesk.com/visualizations/country-dashboard/#/compare/false/Argentina/Armenia/Uruguay/China/Belarus/Venezuela/" target="_blank"&gt;Argentina, Armenia, Bolivia, Belarus, China, Uruguay, and Venezuela&lt;/a&gt;) that have rather different project completion dates.&lt;br /&gt;
&lt;strong style="background-color: white;"&gt;&lt;br /&gt;&lt;/strong&gt;&lt;br /&gt;
&lt;strong style="background-color: white;"&gt;Source available on Github&lt;/strong&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;One thing that attracted me to spend my sabbatical at oDesk was the fact that oDesk has been pretty open with its data from the beginning. To this end, you will notice that the Country Explorer is an open source project, so you are welcome to &lt;/span&gt;&lt;a href="https://github.com/johnjosephhorton/gg2d3/tree/master/country-dashboard" style="background-color: white;"&gt;just fork us on Github&lt;/a&gt;&lt;span style="background-color: white;"&gt; and get the code for the visualizations.&lt;/span&gt;&lt;br /&gt;
&lt;strong style="background-color: white;"&gt;&lt;br /&gt;&lt;/strong&gt;&lt;br /&gt;
&lt;strong style="background-color: white;"&gt;New ideas and visualizations&lt;/strong&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;I am thinking of what other types of graphs would be interesting to create. Supply and demand of skills? Asking prices and transaction prices of contractors across countries and across skills? Of course, if you have specific ideas you’d like to see us work on, tell us in the comments! Happy to explore directions and data that you are interested in exploring.&lt;/span&gt;&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=zk4aJ62UQFU:u6Ucj_6Xrhg:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=zk4aJ62UQFU:u6Ucj_6Xrhg:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=zk4aJ62UQFU:u6Ucj_6Xrhg:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=zk4aJ62UQFU:u6Ucj_6Xrhg:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=zk4aJ62UQFU:u6Ucj_6Xrhg:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=zk4aJ62UQFU:u6Ucj_6Xrhg:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=zk4aJ62UQFU:u6Ucj_6Xrhg:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/zk4aJ62UQFU" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/8144257333915395071?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/8144257333915395071?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/zk4aJ62UQFU/visualizations-of-odesk-oconomy.html" title="Visualizations of the oDesk &quot;oConomy&quot;" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><georss:featurename>oDesk</georss:featurename><georss:point>37.488364 -122.224906</georss:point><georss:box>37.486788999999995 -122.2273735 37.489939 -122.22243850000001</georss:box><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/07/visualizations-of-odesk-oconomy.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0MESH8yeSp7ImA9WhJTE0w.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6320597158018161517</id><published>2012-06-21T16:43:00.000-04:00</published><updated>2012-06-21T16:43:29.191-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-06-21T16:43:29.191-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="visualization" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><category 
scheme="http://www.blogger.com/atom/ns#" term="large datasets" /><title>The oDesk Flower: Playing with Visualizations</title><content type="html">In the past couple of weeks, while at oDesk, I have been trying to learn the data stored in the database, and I have been creating random plots to understand what is happening in the market.&amp;nbsp;My absolute favorite source of data is the data about the micro-level activity of the workers (when they work, how much they type, how much they move the mouse, etc.).&lt;br /&gt;
&lt;br /&gt;
A few weeks back, I posted a blog post about the activity levels of different countries, with the basic observation that the activity in the Philippines fluctuates much less within the 24-hr day compared to all other countries.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-EFFS0_KafCE/T8AC1F-ZLiI/AAAAAAAAxSE/H1YG5r-8xmk/s1600/odesk-active-workers-over-time.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="176" src="http://4.bp.blogspot.com/-EFFS0_KafCE/T8AC1F-ZLiI/AAAAAAAAxSE/H1YG5r-8xmk/s400/odesk-active-workers-over-time.PNG" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;b&gt;You are doing it wrong: The use of radar plots&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
After I posted that plot, I received the following email:&lt;br /&gt;
&lt;blockquote style="background-color: #eeeeee; border-radius: 2px; padding: 5px 5px 5px 5px;"&gt;This is periodic data,&amp;nbsp;which means &lt;a href="http://en.wikipedia.org/wiki/Modular_arithmetic"&gt;modular&lt;/a&gt; thinking. When you visualize periodic data using a linear plot, you necessarily have a cutting point for the x-axis, which can affect the perception of various trends in the data. You should use something similar to the &lt;a href="http://hint.fm/projects/flickr/" target="_blank"&gt;Flickr Flow&lt;/a&gt;, e.g., a radar plot in Excel.&lt;/blockquote&gt;So, following the advice of people that really understand visualization, I transformed the activity plot into a radar plot (in Excel):&lt;br /&gt;
&lt;br /&gt;
&lt;table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style="text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-7IdqJPsSjnM/T8AC1r1hoMI/AAAAAAAAxSM/TtfzJwOtYAc/s1600/odesk-flower.PNG" imageanchor="1" style="margin-left: auto; margin-right: auto;"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/-7IdqJPsSjnM/T8AC1r1hoMI/AAAAAAAAxSM/TtfzJwOtYAc/s640/odesk-flower.PNG" width="600" /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class="tr-caption" style="text-align: center;"&gt;The oDesk Flower&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;&lt;br /&gt;
As you can see, indeed the comment was correct. Given the periodicity of the data, having a cyclical display is better than having a single horizontal line display. Beautiful to look at? Check. I called this visualization "The oDesk Flower" :-)&lt;br /&gt;
&lt;br /&gt;
Unfortunately, it is not truly informative due to the huge number of countries in the plot. But I think it works well to give the global pace of activity over the week and across countries.&lt;br /&gt;
&lt;br /&gt;
One thing that I did not like in this plot was the fact that I could not really compare the level of activity from one country to another. So, I normalized the values to be the percentage of contractors from that country that are active. A new flower emerged:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-ESm8DI8Pxus/T8AHV09GW1I/AAAAAAAAxSY/x1a24Fk8i7I/s1600/odesk-flower-2.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="600" src="http://3.bp.blogspot.com/-ESm8DI8Pxus/T8AHV09GW1I/AAAAAAAAxSY/x1a24Fk8i7I/s640/odesk-flower-2.PNG" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
For comparison, here is the corresponding linear plot, illustrating the percentage of contractors from various countries that are active at any given time:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-ZezJejcHznc/T8AJHTM3glI/AAAAAAAAxSg/JTWoZXunKG8/s1600/odesk-flower-flat.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="444" src="http://3.bp.blogspot.com/-ZezJejcHznc/T8AJHTM3glI/AAAAAAAAxSg/JTWoZXunKG8/s640/odesk-flower-flat.PNG" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Fighting overplotting using kernel smoothing and heatmaps&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
The plot above is kind of interesting and indeed it shows the pattern of activity. However, we have a lot of "overplotting", which makes the plot busy. It is hard to understand where the majority of the lines are falling. &lt;br /&gt;
&lt;br /&gt;
To understand better the flow of the lines, I decided to play a little bit with R. I loaded the data set with the activity line from each country, and then used kernel based smoothing (&lt;a href="http://stat.ethz.ch/R-manual/R-patched/library/KernSmooth/html/bkde2D.html" target="_blank"&gt;bkde2D&lt;/a&gt;) to find the regions of the space that had the highest density. To plot the result, I used a contour plot (&lt;a href="http://stat.ethz.ch/R-manual/R-patched/library/graphics/html/filled.contour.html"&gt;filled.contour&lt;/a&gt;), which allows for the easy generation of heatmaps. Here is the R code:&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/2968039.js"&gt;
 
&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
and here is the resulting plot:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-b0qBHkpUu9w/T-OFORaGdiI/AAAAAAAAyWM/ltwuB3ubqNQ/s1600/hourly-activity-percentages-odesk.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="396" src="http://1.bp.blogspot.com/-b0qBHkpUu9w/T-OFORaGdiI/AAAAAAAAyWM/ltwuB3ubqNQ/s640/hourly-activity-percentages-odesk.png" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;I like how this plot shows the typical activity across countries, which ranges from 2% to 6% of the total registered users. At the same time, we can see (the yellow-green "peaks") that there are also countries that have 8% to 10% of their users being active every week.&lt;/span&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Need for interactivity&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
So, what did I learn from all these exercises? While I could create nice plots, I felt that static visualizations are, in the end, of limited value. Other people cannot do any dynamic exploration of the data. Nobody can customize the plot to show a slightly different view and in general we lack the flexibility given by, say, the &lt;a href="https://developers.google.com/chart/interactive/docs/gadgetgallery"&gt;visualization gadgets of Google&lt;/a&gt;&amp;nbsp;or by the data driven documents created using&amp;nbsp;&lt;a href="http://d3js.org/"&gt;d3.js&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
I would love to be able to create some more interactive plots and let other people play with and explore the data that oDesk has. Perhaps I should hire a contractor on oDesk to do that :-)&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=qTpxJCvp-eM:JbOf1p89fCE:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=qTpxJCvp-eM:JbOf1p89fCE:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=qTpxJCvp-eM:JbOf1p89fCE:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=qTpxJCvp-eM:JbOf1p89fCE:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=qTpxJCvp-eM:JbOf1p89fCE:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=qTpxJCvp-eM:JbOf1p89fCE:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=qTpxJCvp-eM:JbOf1p89fCE:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/qTpxJCvp-eM" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6320597158018161517?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6320597158018161517?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/qTpxJCvp-eM/odesk-flower-playing-with.html" title="The oDesk Flower: Playing with Visualizations" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-EFFS0_KafCE/T8AC1F-ZLiI/AAAAAAAAxSE/H1YG5r-8xmk/s72-c/odesk-active-workers-over-time.PNG" height="72" width="72" /><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/06/odesk-flower-playing-with.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0cAR3k6fip7ImA9WhVbE0w.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-4551893600924080862</id><published>2012-05-25T20:07:00.000-04:00</published><updated>2012-05-29T15:17:26.716-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-29T15:17:26.716-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="statistics" /><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><category scheme="http://www.blogger.com/atom/ns#" term="online labor" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>The Emergence of 
Teams in Online Work</title><content type="html">&lt;div style="text-align: justify;"&gt;
When I started as an assistant professor, back in 2004, and I joined the NYU/Stern Business School, I got into a strange position. I had funding to spend, but no students to work with. I had work to be done (mainly writing crawlers) that was time-consuming, but not particularly novel, or intellectually rewarding. Semi-randomly, at the same time, I heard about the website Rent-A-Coder, which was being used by undergraduate students who were "outsourcing" their programming assignments. I started using Rent-A-Coder, tentatively at first, to get programming tasks done, and then, over time, I got fascinated by the concept of online work, and the ability to hire people online, and get things done. (My Mechanical Turk research, and my current appointment at oDesk are a natural evolution of these interests.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
As I started completing increasingly complicated projects using remote contractors, I started thinking about how we can best manage a diverse team of remote workers, each one being in a different location, working on different tasks, etc. The topic raises many interesting questions, both in terms of theory, and in terms of developing practical "best practices" guidelines.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
While trying to understand better the theoretical problems that arise in the space, I was reading the paper "&lt;a href="http://www2012.wwwconference.org/proceedings/proceedings/p839.pdf" target="_blank"&gt;Online Team Formation in Social Networks&lt;/a&gt;" that was published in WWW2012; the paper describes a technique for identifying teams of people in a social network (i.e., graph) that have complementary skills and can form a well-functioning unit, and tries to do so while preserving workload restrictions for individual workers.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Given my personal experience, from the practical side, and the existence of research papers that deal with the topic, I got curious to understand whether the topic of online team formation is a fringe topic, or something that deserves further attention.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Do we see teams being formed online? If yes, is this a phenomenon that increases in significance?&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
So, I pulled the oDesk data and tried to answer the question.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
How many teams have a given size? How does this distribution evolve over time? I plotted the number of projects in each week that had x contractors that were active in the project (i.e., billed some time).&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The results were revealing: Not only do we observe teams of people being formed online, but we also see an exponential increase in the number of teams of any given size.&amp;nbsp; &lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-CIjokHKFMHA/T8AYPMPHVTI/AAAAAAAAxS8/JTQkJv7cpqY/s1600/odesk-teams-size.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="350" src="http://3.bp.blogspot.com/-CIjokHKFMHA/T8AYPMPHVTI/AAAAAAAAxS8/JTQkJv7cpqY/s640/odesk-teams-size.PNG" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
In fact, in the above graph, if we account for the fact that bigger teams contain an (exponentially) larger number of people, we can see that the majority of the online workers today are not working as individuals but are now part of an online team. &lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-_9CCwRDJ2kk/T8AZt4jLNOI/AAAAAAAAxTI/UVcbX6qvbug/s1600/odesk-teams-size-contractors.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="354" src="http://3.bp.blogspot.com/-_9CCwRDJ2kk/T8AZt4jLNOI/AAAAAAAAxTI/UVcbX6qvbug/s640/odesk-teams-size-contractors.PNG" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Update&amp;nbsp;&lt;/b&gt;[&lt;i&gt;thanks for the question, Yannis!&lt;/i&gt;]&lt;b&gt;: &lt;/b&gt;Since the exponential growth of oDesk.com makes it difficult to understand the fraction of people working in teams and whether it is increasing/decreasing , here is the chart that shows what percentage of workers work in teams of a given size:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-WJH0SFtqRTA/T8Ue2YxB1AI/AAAAAAAAxZ4/4C2QcHuYRK8/s1600/odesk-teams-size-percentages.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="350" src="http://3.bp.blogspot.com/-WJH0SFtqRTA/T8Ue2YxB1AI/AAAAAAAAxZ4/4C2QcHuYRK8/s640/odesk-teams-size-percentages.PNG" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;
What is interesting is the consistent decrease in the fraction of people working alone (teams of one), and in teams of 2-3. Instead, we see a slow but consistent increase in teams with size 4-7 and 8-16, as an overall fraction of the population. As you can see, over the last year, the percentage of contractors in teams with size 4-7 is getting close to surpassing the percentage of contractors working alone. Similarly, the percentage of contractors in teams of 8-16 is getting close to surpassing the percentage of contractors in teams of 2-3. The trends for bigger teams &lt;i&gt;seem &lt;/i&gt;also to be increasing but there is still too much noise to be able to infer anything.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;What's coming?&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;
Given the trend for online work to be done in teams, &lt;b style="font-style: italic;"&gt;formed online,&lt;/b&gt; I&amp;nbsp;expect to see a change in the way that many companies are being formed in the future. At this point, it seems far fetched that a startup company can be formed online, being distributed across the globe, and operate on a common project. (Yes, there are such teams but they are more of an exception, rather than the norm.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
But if these trends continue, expect sooner rather than later to see companies naturally hiring online and working with remote collaborators, no matter where the talent is located. People have been talking about online work being an alternative to immigration, but this seemed to be a solution for the remote future.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
With the exponential increase that we observe, the future may come much sooner than expected.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=72PBAatvh-A:EYuAOEAy_-0:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=72PBAatvh-A:EYuAOEAy_-0:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=72PBAatvh-A:EYuAOEAy_-0:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=72PBAatvh-A:EYuAOEAy_-0:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=72PBAatvh-A:EYuAOEAy_-0:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=72PBAatvh-A:EYuAOEAy_-0:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=72PBAatvh-A:EYuAOEAy_-0:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/72PBAatvh-A" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/4551893600924080862?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/4551893600924080862?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/72PBAatvh-A/emergence-of-teams-in-online-work.html" title="The Emergence of Teams in Online Work" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-CIjokHKFMHA/T8AYPMPHVTI/AAAAAAAAxS8/JTQkJv7cpqY/s72-c/odesk-teams-size.PNG" height="72" width="72" /><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/05/emergence-of-teams-in-online-work.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkADSHg7fCp7ImA9WhVVFko.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-6671617875093882777</id><published>2012-05-10T15:26:00.000-04:00</published><updated>2012-05-10T15:39:39.604-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-10T15:39:39.604-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><category scheme="http://www.blogger.com/atom/ns#" term="trec" /><category 
scheme="http://www.blogger.com/atom/ns#" term="image classification" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>TREC 2012 Crowdsourcing Track</title><content type="html">&lt;h3&gt;


TREC 2012 Crowdsourcing Track -&amp;nbsp;Call for Participation&lt;/h3&gt;
&amp;nbsp;June 2012 – November 2012&lt;br /&gt;
&lt;a href="http://www.google.com/url?sa=D&amp;amp;q=https://sites.google.com/site/treccrowd/&amp;amp;usg=AFQjCNHjMQsQmCcq9dC6yrGUFE3VEdmoEA"&gt;https://sites.google.com/site/treccrowd/&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;h4&gt;


Goals&lt;/h4&gt;
As part of the &lt;a href="http://www.nist.gov/"&gt;National Institute of Standards and Technology (NIST)&lt;/a&gt;'s annual &lt;a href="http://trec.nist.gov/"&gt;Text REtrieval Conference (TREC)&lt;/a&gt;, the Crowdsourcing track investigates emerging crowd-based methods for search evaluation and/or developing hybrid automation and crowd search systems.&lt;br /&gt;
&lt;br /&gt;
This year, our goal is to evaluate approaches to crowdsourcing high quality relevance judgments for two different types of media:&lt;br /&gt;
&lt;div&gt;
&lt;ol&gt;
&lt;li&gt;textual documents&lt;/li&gt;
&lt;li&gt;images&lt;/li&gt;
&lt;/ol&gt;
For each of the two tasks, participants will be expected to crowdsource relevance labels for approximately 20k topic-document pairs (i.e., 40k labels when taking part in both tasks). In the first task, the documents will be from an English news text corpus, while in the second task the documents will be images from Flickr and from a European news agency.&lt;br /&gt;
&lt;br /&gt;
Participants may use any crowdsourcing methods and platforms, including home-grown systems. Submissions will be evaluated against a gold standard set of labels and against consensus labels over all participating teams.&lt;br /&gt;
&lt;br /&gt;
&lt;h4&gt;


Tentative Schedule&lt;/h4&gt;
&lt;ul&gt;
&lt;li&gt;Jun 1: Document corpora, training topics (for image task) and task guidelines available&lt;/li&gt;
&lt;li&gt;Jul 1: Training labels for the image task&lt;/li&gt;
&lt;li&gt;Aug 1: Test data released&lt;/li&gt;
&lt;li&gt;Sep 15: Submissions due&lt;/li&gt;
&lt;li&gt;Oct 1: Preliminary results released&lt;/li&gt;
&lt;li&gt;Oct 15: Conference notebook papers due&lt;/li&gt;
&lt;li&gt;Nov 6-9: TREC 2012 conference at NIST, Gaithersburg, MD, USA&lt;/li&gt;
&lt;li&gt;Nov 15: Final results released&lt;/li&gt;
&lt;li&gt;Jan 15, 2013: Final papers due&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4&gt;


 Participation&lt;/h4&gt;
To take part, please register by submitting a formal application directly to NIST (even if returning participant). See&amp;nbsp;the bottom part of the page at &lt;a href="http://www.google.com/url?sa=D&amp;amp;q=http://trec.nist.gov/pubs/call2012.html&amp;amp;usg=AFQjCNH0tBRTRhVBwSb5bv1i8_aR-iPoGw"&gt;http://trec.nist.gov/pubs/call2012.html&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
Participants should also join our &lt;a href="http://groups.google.com/group/trec-crowd"&gt;Google Group&lt;/a&gt; discussion list, where all track related communications will take place.&lt;br /&gt;
&lt;br /&gt;
&lt;h4&gt;


Organizers&lt;/h4&gt;
&lt;ul&gt;
&lt;li&gt;Gabriella Kazai, Microsoft Research&lt;/li&gt;
&lt;li&gt;Matthew Lease, University of Texas at Austin&lt;/li&gt;
&lt;li&gt;Panagiotis G. Ipeirotis, New York University&lt;/li&gt;
&lt;li&gt;Mark D. Smucker, University of Waterloo&lt;/li&gt;
&lt;li&gt;&lt;br /&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h4&gt;


 Further information&lt;/h4&gt;
For further information, please visit &lt;a href="http://www.google.com/url?sa=D&amp;amp;q=https://sites.google.com/site/treccrowd/&amp;amp;usg=AFQjCNHjMQsQmCcq9dC6yrGUFE3VEdmoEA"&gt;https://sites.google.com/site/treccrowd/&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
We welcome any questions you may have, either by emailing the organizers or by posting on the Google Group discussion page.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=7N8rCcPMSU4:p3KvvVEkvX8:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=7N8rCcPMSU4:p3KvvVEkvX8:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=7N8rCcPMSU4:p3KvvVEkvX8:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=7N8rCcPMSU4:p3KvvVEkvX8:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=7N8rCcPMSU4:p3KvvVEkvX8:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=7N8rCcPMSU4:p3KvvVEkvX8:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=7N8rCcPMSU4:p3KvvVEkvX8:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/7N8rCcPMSU4" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6671617875093882777?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/6671617875093882777?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/7N8rCcPMSU4/trec-2012-crowdsourcing-track.html" title="TREC 2012 Crowdsourcing Track" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/05/trec-2012-crowdsourcing-track.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkAEQX09eip7ImA9WhVVFko.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-9010225003686511438</id><published>2012-05-05T01:14:00.000-04:00</published><updated>2012-05-10T15:38:20.362-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-10T15:38:20.362-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="economics" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="ec2012" /><category scheme="http://www.blogger.com/atom/ns#" term="conference" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><category scheme="http://www.blogger.com/atom/ns#" term="acm" /><title>ACM EC 2012 schedule</title><content type="html">Schedule at a glance:&lt;br /&gt;
&lt;br /&gt;
&lt;iframe frameborder="0" height="750" src="https://docs.google.com/spreadsheet/pub?key=0AjX1e06EhsXSdEVIUVgyb2hGcV9fcmxJTy03RjBxNnc&amp;amp;single=true&amp;amp;gid=5&amp;amp;output=html&amp;amp;widget=true" width="620"&gt;&lt;/iframe&gt;&lt;br /&gt;
&lt;br /&gt;
And the papers within each session:&lt;br /&gt;
&lt;br /&gt;
&lt;iframe frameborder="0" height="300" src="https://docs.google.com/spreadsheet/pub?key=0AjX1e06EhsXSdEVIUVgyb2hGcV9fcmxJTy03RjBxNnc&amp;amp;single=true&amp;amp;gid=4&amp;amp;output=html&amp;amp;widget=true" width="620"&gt;&lt;/iframe&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=hjsHhK5HWtY:zQ1zq1Cm-18:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=hjsHhK5HWtY:zQ1zq1Cm-18:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=hjsHhK5HWtY:zQ1zq1Cm-18:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=hjsHhK5HWtY:zQ1zq1Cm-18:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=hjsHhK5HWtY:zQ1zq1Cm-18:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=hjsHhK5HWtY:zQ1zq1Cm-18:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=hjsHhK5HWtY:zQ1zq1Cm-18:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/hjsHhK5HWtY" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/9010225003686511438?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/9010225003686511438?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/hjsHhK5HWtY/acm-ec-2012-schedule.html" title="ACM EC 2012 schedule" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/05/acm-ec-2012-schedule.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0ECR3s7eCp7ImA9WhVWF0w.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-2229459526419117000</id><published>2012-04-25T14:09:00.000-04:00</published><updated>2012-04-29T11:01:06.500-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-29T11:01:06.500-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="attack" /><category scheme="http://www.blogger.com/atom/ns#" term="amazon" /><category scheme="http://www.blogger.com/atom/ns#" term="google spreadsheet" /><category scheme="http://www.blogger.com/atom/ns#" term="cloud computing" /><category scheme="http://www.blogger.com/atom/ns#" term="google" /><title>The Google attack: How I attacked myself using Google Spreadsheets and I ramped up a $1000 bandwidth bill</title><content type="html">&lt;div class="tr_bq"&gt;
&lt;div style="text-align: justify;"&gt;
It all started with an email.&lt;/div&gt;
&lt;/div&gt;
&lt;blockquote style="background-color: #eeeeee; border-radius: 2px; padding: 5px 5px 5px 5px;"&gt;
&lt;div style="text-align: justify;"&gt;
From: Amazon Web Services LLC&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Subject: Review of your AWS Account Estimated Month to Date Billing Charges of $720.85&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Greetings from AWS,&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
During a routine review of your AWS Account's estimated billing this month, we noticed that your charges thus far are a bit larger than previous monthly charges. We'd like to use this opportunity to explore the features and functionality of AWS that led you to rely on AWS for more of your needs.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
You can view your current estimated monthly charges by going here:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span style="font-size: x-small;"&gt;https://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&amp;amp;action=activity-summary&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
AWS Account ID: XXXXXXX27965&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;Current Estimated Charges: $720.85&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
If you have any feedback on the features or functionality of AWS that has helped enable your confidence in our services to begin ramping your usage we would like to hear about it. &amp;nbsp;Additionally, if you have any questions pertaining to your billing, please contact us by using the email address on your account and logging in to your account here:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
https://aws-portal.amazon.com/gp/aws/html-forms-controller/contactus/aws-account-and-billing&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Regards,&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
AWS Customer Service&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
This message was produced and distributed by Amazon Web Services LLC, 410 Terry Avenue North, Seattle, Washington 98109-5210&lt;/div&gt;
&lt;/blockquote&gt;
&lt;div style="text-align: justify;"&gt;
What? \$720 in charges? My usual monthly charges for Amazon Web Services were around \$100, so getting this email with a usage of \$720 after just two weeks within the month was a big alert. I login to my account to see what is going on, and I see this:&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-mYw6B-9pSlI/T48zyTtfgRI/AAAAAAAAwJY/oGoDH2BMzdk/s1600/aws-statement.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://2.bp.blogspot.com/-mYw6B-9pSlI/T48zyTtfgRI/AAAAAAAAwJY/oGoDH2BMzdk/s640/aws-statement.PNG" width="474" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
An even bigger number: \$1177.76 in usage charges! A thousand, one hundred, seventy seven dollars. Out of which \$1065 in outgoing bandwidth transfer costs. The scary part: 8.8&amp;nbsp;&lt;span style="color: #660000;"&gt;&lt;b&gt;Terabytes&lt;/b&gt;&amp;nbsp;&lt;/span&gt;of outgoing traffic! Tera. Not Giga. Terabytes.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
To make things worse, I realized that the cost was going up &lt;i&gt;hour after hour&lt;/i&gt;. Fifty to hundred dollars more in billing charges with &lt;b&gt;each. passing. hour&lt;/b&gt;.&amp;nbsp;I started sweating.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;hr style="background-color: #691f01; border-bottom-width: 1px; border-color: initial; border-left-width: 1px; border-right-width: 1px; border-style: initial; border-top-width: 1px; color: #691f01; display: block; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;What happened?&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Initially I was afraid that a script that I set up to back up my photos from my local network to S3 consumed that bandwidth. But then I realized that I had been running this backup-to-S3 script for a few months now, so it could not suddenly start consuming more resources. In any case, all the traffic that is incoming to S3 is free. This was a matter of &lt;i&gt;outgoing &lt;/i&gt;traffic.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Then I started suspecting that the cause of this spike may be due to the developers that are working on various projects of mine. Could they have mounted the S3 bucket into an EC2 machine that is in a different region? In that case, we may indeed have problems, as all the I/O operations that are happening within a machine would count as bandwidth costs. I checked all my EC2 machines. No, this is not the problem. All EC2 machines are in us-east, and my S3 buckets are all in US Standard. No charges for operations between EC2 machines and S3 buckets within the same region.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
What could be causing this? Unfortunately, I did not have any logging enabled for my S3 buckets. I enabled logging and waited to see what would happen next. But logging would take a few hours, and the bandwidth meter was running. No time to waste.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Thankfully, even in the absence of logging, Amazon provides access to the &lt;a href="https://aws-portal.amazon.com/gp/aws/developer/account/index.html?action=usage-report" target="_blank"&gt;usage reports&lt;/a&gt; of all the AWS resources. The report indicated the bucket that was causing the problem:&lt;/div&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-eEHSBh-6pQs/T484JUqUH9I/AAAAAAAAwJo/6lx6bYWs9iM/s1600/aws-usagereport.PNG" imageanchor="1"&gt;&lt;img border="0" height="91" src="http://3.bp.blogspot.com/-eEHSBh-6pQs/T484JUqUH9I/AAAAAAAAwJo/6lx6bYWs9iM/s640/aws-usagereport.PNG" width="600" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
My S3 bucket with the name "t_4e1cc9619d4aa8f8400c530b8b9c1c09" was generating 250GB of outgoing traffic, per hour.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;br /&gt;
&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;div style="text-align: center;"&gt;
&lt;b&gt;Two-hundred-fifty Gigabytes. Per &lt;i&gt;hour&lt;/i&gt;.&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;br /&gt;
&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
At least I knew what the source of the traffic was. It was a big bucket with images that were being used for a variety of tasks on Amazon Mechanical Turk.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
But still something was strange. The bucket was big, approximately 250GB of images. Could Mechanical Turk generate so much traffic?&amp;nbsp;Given that on average the size of each image was 500KB to 1MB,&amp;nbsp;the bucket should have been serving 250,000 images per hour.&amp;nbsp;&lt;b&gt;This is 100+ requests&amp;nbsp;per second.&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
There was no way that Mechanical Turk was responsible for this traffic.&amp;nbsp;The cost of Mechanical Turk would have trumped the cost of bandwidth.&amp;nbsp;Somehow the S3 bucket was being "Slashdotted" but without being featured on Slashdot or in any other place that I was aware of.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Strange.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Very strange.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;hr style="background-color: #691f01; border-bottom-width: 1px; border-color: initial; border-left-width: 1px; border-right-width: 1px; border-style: initial; border-top-width: 1px; color: #691f01; display: block; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;Checking the Logs&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Well, I enabled logging for the S3 bucket, so I was waiting for the logs to appear.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The first logs showed up and I was in for a surprise. Here are the IPs and the User-agents of the requests.&lt;/div&gt;
&lt;pre style="background-color: #eeeeee; border-radius: 2px; padding: 5px 5px 5px 5px;"&gt;&lt;div style="text-align: justify;"&gt;
&lt;span style="font-size: x-small;"&gt;74.125.156.82 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.64.83 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.64.84 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.81 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.86 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.92 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.64.87 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.81 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.82 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.85 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.89 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.83 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.90 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.92 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.64.85 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.82 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.88 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.86 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.89 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.83 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.94 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.83 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.88 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.83 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.64.92 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.156.80 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.64.88 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.84 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
74.125.158.87 Mozilla/5.0 (compatible) Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
&lt;/span&gt;&lt;/div&gt;
&lt;/pre&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
So, it was Google that was crawling the bucket. Aggressively. Very&amp;nbsp;aggressively.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Why would Google crawl this bucket?&amp;nbsp;Yes, the URLs were technically public but there was no obvious place to get the URLs. Google could not have gotten the URLs from Mechanical Turk. The images in the tasks posted to Mechanical Turk&amp;nbsp;are not accessible to Google to crawl.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
At least we know it is Google. I guess, somehow, I let Google learn about the URLs of the images in the bucket (how?) and Google started crawling them. But something was still puzzling.&amp;nbsp;How can an S3 bucket with 250GB of data generate 40 times that amount of traffic? Google would just download once and be done with it. It would not&amp;nbsp;re-crawl&amp;nbsp;the same object many times.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
I checked the logs again. Interestingly enough, there was a pattern: Each image was being downloaded every hour. Every single one of them. Again and again.&amp;nbsp;Something was very very strange. Google kept launching its crawlers, repeatedly, to download&amp;nbsp;&lt;b&gt;&lt;i&gt;the same&amp;nbsp;&lt;/i&gt;&lt;/b&gt;content in the S3 bucket, every hour. For a total of 250GB of traffic, every hour.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Google would have been smarter than that. Why waste all the bandwidth to&amp;nbsp;re-download&amp;nbsp;an identical image every hour?&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Why would Google&lt;i&gt; download the same images again and again&lt;/i&gt;?&lt;/div&gt;
&lt;br /&gt;
&lt;hr style="background-color: #691f01; border-color: initial; border-style: initial; color: #691f01; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;Wait, this is &lt;i&gt;not &lt;/i&gt;the &lt;i&gt;real &lt;/i&gt;Google crawler...&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Looking more carefully, there was one red flag. This is &lt;b&gt;not &lt;/b&gt;the Google crawler. &lt;a href="http://support.google.com/webmasters/bin/answer.py?hl=en&amp;amp;answer=1061943" target="_blank"&gt;The Google crawler is named &lt;b&gt;Googlebot &lt;/b&gt;for web pages and &lt;b&gt;Googlebot-Image&lt;/b&gt; for images&lt;/a&gt;. It is &lt;b&gt;not &lt;/b&gt;called Feedfetcher, as this user agent is.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
What the heck is Feedfetcher? A few interesting pieces of information&amp;nbsp;&lt;a href="http://support.google.com/webmasters/bin/answer.py?hl=en&amp;amp;answer=178852" target="_blank"&gt;from Google&lt;/a&gt;:&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;i&gt;Feedfetcher is how Google grabs RSS or Atom feeds when users choose to add them to their Google homepage or Google Reader&lt;/i&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;i&gt;Feedfetcher retrieves feeds only after users have explicitly added them to their Google homepage or Google Reader&lt;/i&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;i&gt;[Feedfetcher] is not retrieving content to be added to Google's search index&lt;/i&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;i&gt;Feedfetcher retrieves feeds only after users have explicitly added them to their Google homepage or Google Reader. Feedfetcher behaves as a direct agent of the human user, not as a robot, &lt;b&gt;so it ignores robots.txt&amp;nbsp;&lt;/b&gt;&lt;/i&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
Interesting. So these images were in some form of a &lt;i&gt;personal &lt;/i&gt;feed.&lt;/div&gt;
&lt;br /&gt;
&lt;hr style="background-color: #691f01; border-color: initial; border-style: initial; color: #691f01; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;Shooting myself in the foot, the Google Spreadsheet way&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
And this information started unraveling the full story. I remembered!&lt;br /&gt;
&lt;br /&gt;
All the URLs for these images were also stored in a Google Spreadsheet, so that I could inspect the results of the crowdsourcing process.&amp;nbsp;(The spreadsheet was &lt;b&gt;&lt;i&gt;not &lt;/i&gt;&lt;/b&gt;being used or accessed by Mechanical Turk workers; it was just for viewing the results.)&amp;nbsp;I used the &lt;span style="font-family: 'Courier New', Courier, monospace;"&gt;&lt;a href="http://support.google.com/docs/bin/answer.py?hl=en-GB&amp;amp;answer=87037" target="_blank"&gt;=image(url)&lt;/a&gt;&lt;/span&gt; command to display a thumbnail of the image in a spreadsheet cell.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
So, all this bandwidth waste was triggered by my own stupidity. I asked Google to download all the images to create the thumbnails in the Google Spreadsheet. Talk about shooting myself in the foot. I launched the Google crawler myself.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
But why did Google download the images again and again? That seemed puzzling. It seemed perfectly plausible that Google would fetch 250GB of data (i.e., the total size of the bucket), although I would have gone for a lazy evaluation approach (i.e., loading on demand, as opposed to pre-fetching). But why download the &lt;b&gt;&lt;i&gt;same &lt;/i&gt;&lt;/b&gt;content again and again?&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Well, the explanation is simple: Apparently Google is using Feedfetcher as a "url fetcher" for all sorts of "personal" URLs someone adds to its services, and not only for feeds. Since these URLs are private, Google does not want to store them anywhere permanently in the Google servers. Makes perfect sense from the point of view of respecting user privacy. The problem is that this &lt;b&gt;does not allow for any form of caching&lt;/b&gt;, as Google does not store anywhere the personal data.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
So, every hour, Google was launching the crawlers against my bucket, generating a tremendous amount of crawler traffic. Notice that even if I had a robots.txt, Feedfetcher would have ignored it in any case. (Furthermore, it is not possible to place a robots.txt file in the root directory of https://s3.amazonaws.com as this is a common server for many different accounts; but in any case Feedfetcher would have ignored it.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The final touch in the overall story? Normally, if you were to do the same thing with URLs from a random website, Google would have rate limited its crawlers, not to overload the website. However, the&amp;nbsp;s3.amazonaws.com domain is a huuuge domain, containing&amp;nbsp;terabytes&amp;nbsp;(petabytes?) of web content. Google has no reason to rate limit against such a huge domain with huge traffic. It made perfect sense to launch 100+ connections per second against a set of URLs that were hosted in that domain...&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
So, I did not just shoot myself in the foot. I took a &lt;a href="http://en.wikipedia.org/wiki/Tsar_Bomba" target="_blank"&gt;Tsar Bomba&lt;/a&gt;&amp;nbsp;and I launched it against my foot. The $1000 bandwidth bill (generated pretty much within a few hours) was the price of my stupidity.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Ooof, mystery solved. I killed the spreadsheet and made the images private. Google started getting 403 errors, and I hope that it will soon stop. Expensive mistake, but at least resolved.&lt;br /&gt;
&lt;br /&gt;
And you cannot help but laugh at the following irony: One of the main arguments for using the AWS infrastructure is that it is virtually invulnerable to any denial-of-service attack. On the other hand, the avoidance of the denial of service breeds a new type of attack: Bring the service down not by stopping the service but by making it extremely expensive to run...&lt;/div&gt;
&lt;br /&gt;
&lt;hr style="background-color: #691f01; border-color: initial; border-style: initial; color: #691f01; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;The real lesson: Google as a medium for launching an attack against others&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;br /&gt;
&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Then I realized: This is a technique that can be used to launch a denial of service attack against a website hosted on Amazon (or even elsewhere). The steps:&lt;/div&gt;
&lt;ol&gt;
&lt;li style="text-align: justify;"&gt;Gather a large number of URLs from the&amp;nbsp;targeted&amp;nbsp;website. Preferably big media files (jpg, pdf, etc)&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Put these URLs in a Google feed, or just put them in a Google Spreadsheet&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Put the feed into a Google service, or use the image(url) command in Google spreadsheet&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Sit back and enjoy seeing Google launching a Slashdot-style denial of service attack against your target.&lt;/li&gt;
&lt;/ol&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
What I find fascinating in this setting is that Google becomes such a powerful weapon due to a series of&amp;nbsp;perfectly&amp;nbsp;legitimate design decisions. First, they completely separate their index from the URLs that they fetch for private purposes. Very clean and nice design. The problem? No caching. Second, Google is not doing lazy evaluation in the feeds but tries to pre-fetch them to be ready and fresh for the user. The problem? Google is launching its Feedfetcher crawlers again and again.&amp;nbsp;Combine the two, and you have a very, very powerful tool that can generate untraceable denial-of-service attacks.&amp;nbsp;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
The law of unintended consequences. Scary and instructive at the same time: You never know how the tools that you build can be used, no matter how noble the intentions and the design decisions.&lt;br /&gt;
&lt;br /&gt;
&lt;i&gt;PS: Amazon was nice enough to refund the bandwidth charges (before the post went public), as they considered this activity accidental and not intentional. Thanks TK!&lt;/i&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=ndQaawiK0Hw:OIFqdhDlguU:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=ndQaawiK0Hw:OIFqdhDlguU:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=ndQaawiK0Hw:OIFqdhDlguU:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=ndQaawiK0Hw:OIFqdhDlguU:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=ndQaawiK0Hw:OIFqdhDlguU:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=ndQaawiK0Hw:OIFqdhDlguU:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=ndQaawiK0Hw:OIFqdhDlguU:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/ndQaawiK0Hw" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2229459526419117000?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2229459526419117000?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/ndQaawiK0Hw/google-attack-how-i-self-attacked.html" title="The Google attack: How I attacked myself using Google Spreadsheets and I ramped up a $1000 bandwidth bill" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-mYw6B-9pSlI/T48zyTtfgRI/AAAAAAAAwJY/oGoDH2BMzdk/s72-c/aws-statement.PNG" height="72" width="72" /><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/04/google-attack-how-i-self-attacked.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkMDRH8yeyp7ImA9WhVWE04.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-2421159814440420523</id><published>2012-04-25T00:08:00.002-04:00</published><updated>2012-04-25T03:21:15.193-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-25T03:21:15.193-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="reviews" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category 
scheme="http://www.blogger.com/atom/ns#" term="reputation" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Feedback, Unemployment, and Crowdsourcing: A Modest Proposal</title><content type="html">&lt;div style="text-align: justify;"&gt;
I had finished reading the paper "&lt;a href="http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2012131" target="_blank"&gt;Inefficient Hiring in Entry-Level Labor Markets&lt;/a&gt;" by Amanda Pallais, an assistant professor of Economics at Harvard University.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
This is the first paper that I have read that provides experimental evidence that labor markets are "not efficient" in the following way: If we have a new worker, or a worker with no known past history, we do not know what the worker can and cannot do. Most employers will not hire this worker due to this lack of knowledge. And since the worker is never hired, nobody is able to leave feedback about the performance of the worker.&amp;nbsp;This leads to a vicious cycle for the new entrants, who cannot break into the market because they do not have feedback, and they cannot get feedback because they cannot get into the market.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
While this&amp;nbsp;phenomenon is known, it was not obvious that lack of feedback is causing this inefficiency. The alternative explanation was that good workers will find work to do, and bad workers simply do not get jobs because they do not even know how to apply and enter the market efficiently.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
What Amanda did was pretty interesting. She created a randomized experiment. She used oDesk and&amp;nbsp;opened a position for data entry, a position that required pretty much no special skills. She received approximately 3000 job applications. Out of these, she hired randomly 1000 workers. The 2000 non-hired workers formed the "control" group. Within the 1000 workers, she created two groups. One that received a detailed public feedback and evaluation, and another that received a generic, uninformative feedback (e.g., "Good work").&amp;nbsp;Given the randomized selection, the differences in the future evolution of the workers were pretty much the result of the treatments in this controlled field experiment.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The results were revealing:&lt;/div&gt;
&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span style="text-align: justify;"&gt;Workers&amp;nbsp;randomly selected to receive jobs were more likely to be employed, requested&amp;nbsp;higher wages, and had higher earnings than control group workers.&amp;nbsp;&lt;/span&gt;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;In the two months after&amp;nbsp;the experiment, &lt;b&gt;inexperienced workers' earnings approximately &lt;i&gt;&lt;span style="color: #660000;"&gt;tripled &lt;/span&gt;&lt;/i&gt;as a result of obtaining a job&lt;/b&gt;.&amp;nbsp;&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;Providing workers with&lt;b&gt; more detailed evaluations substantially increased their&amp;nbsp;earnings&lt;/b&gt; and the wages they requested.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;The&amp;nbsp;benefits of detailed evaluations were not universal: &lt;b&gt;detailed performance evaluations helped&amp;nbsp;those who performed well and hurt those who performed poorly&lt;/b&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;
Even more notable, the benefit to the workers who received the "you get a job" treatment did not come at the expense of other workers. Employment increased and the money that was "wasted" to conduct the experiment (the tasks were not useful to anyone) generated enough return to cover the cost.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
In principle, oDesk may want to engage in such "wasteful" hiring just to get workers to bootstrap and start with some meaningful feedback in their profiles: When you create an account at oDesk, you get a random job (for which nobody cares) and then the quality of the submitted work is evaluated, to generate some meaningful feedback for the worker (e.g., "great at setting up a map reduce task on Amazon Web Services").&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Or, perhaps, they can skip the wasteful part, and use crowdsourcing as a perfectly valid mechanism for generating this valuable public feedback by letting people do actual work.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;Crowdsourcing as a solution to the cold start problem&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Note how this need for early feedback so that workers can enter the market naturally leads to crowdsourcing as a solution to the entrance problem.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
If getting a job is the blocker for starting your career, then crowdsourcing allows new entrants to pick jobs &lt;a href="http://www.behind-the-enemy-lines.com/2012/02/crowdsourcing-end-of-job-interviews.html" target="_blank"&gt;without having to worry about the interview process&lt;/a&gt;. Just pick an available task and do it.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The findings of the study also suggest that crowdsourcing by itself is not enough. Any crowdsourcing application that provides jobs should be accompanied by a detailed feedback/scoring system. For example, if the crowdsourcing platform is about, say, translation, then there should be public feedback that will list the tasks that the person completed (what language pairs, etc), and list the corresponding performance statistics (e.g., time taken to complete the task, quality of the outcome, etc.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
In a setting like this, crowdsourcing becomes not a novelty item but an integral part of any labor platform, facilitating entry of the workers. It is not a place where jobs get done on the cheap. It is the place that generates information about the quality of the workers, which in turn makes the workers more valuable to the firms.&amp;nbsp;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #660000;"&gt;Should crowdsourcing firms receive favorable treatment by the government?&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
So, if crowdsourcing tasks &lt;span style="color: #660000;"&gt;&lt;b&gt;that generate *&lt;i&gt;public*&amp;nbsp;&lt;/i&gt;feedback for the performance of the participating workers&lt;/b&gt;&lt;/span&gt; benefit the workers, the future employers, and the overall society (by decreasing unemployment), the question is why not encourage companies to make more of their work available in such a format. While a service like Mechanical Turk would not qualify (anonymity of workers, plus lack of reputation), other services that generate useful public information could be the focus of favorable legislation and/or tax treatment.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Perhaps it is time to give to crowdsourcing the attention and stature it deserves.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Bg66uhmuaoI:D8STyvDzAhk:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Bg66uhmuaoI:D8STyvDzAhk:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Bg66uhmuaoI:D8STyvDzAhk:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Bg66uhmuaoI:D8STyvDzAhk:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=Bg66uhmuaoI:D8STyvDzAhk:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Bg66uhmuaoI:D8STyvDzAhk:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=Bg66uhmuaoI:D8STyvDzAhk:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/Bg66uhmuaoI" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2421159814440420523?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/2421159814440420523?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/Bg66uhmuaoI/feedback-unemployment-and-crowdsourcing.html" title="Feedback, Unemployment, and Crowdsourcing: A Modest Proposal" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/04/feedback-unemployment-and-crowdsourcing.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D04FQHk5fCp7ImA9WhVQFk4.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-7818591148148398683</id><published>2012-04-03T11:01:00.000-04:00</published><updated>2012-04-05T10:25:11.724-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-05T10:25:11.724-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><category scheme="http://www.blogger.com/atom/ns#" term="odesk" /><title>Philippines: The country that never sleeps (or, When is the world working? 
The oDesk Edition)</title><content type="html">&lt;div style="text-align: justify;"&gt;&lt;b&gt;Why are you awake?&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
Over the last few months, I have used oDesk to hire a couple of virtual assistants, who help me with a variety of tasks. They come from the Philippines and we communicate over Skype whenever I have tasks for them to do. (Hi Maria! Hi Reineer!). One of the things that I found puzzling was the fact that they seemed to be online during the working hours in New York, despite the fact that we have a 12-hour difference with Manila. When I asked them, they told me that most of the time they work for US-based clients, and their work is much easier when they are synchronized with a US-schedule (real-time interactions with the clients, and so on). So they tend to stay awake until late at night and then sleep during their morning in the Philippines.&lt;br /&gt;
&lt;br /&gt;
I found that behavior strangely fascinating, so I decided to dig deeper and figure out if this is some quirkiness of my own virtual assistants, or whether this is a more systematic pattern.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;The oDesk Team client: All-you-can-eat data&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
One characteristic that differentiates oDesk from other online labor platforms is the focus on hourly contracts, instead of project-based or piecemeal contracts. To enable truthful billing, oDesk asks the service providers to use the oDesk client whenever they are billing time. The client records the time billed and at the same time it takes screenshots&amp;nbsp;at random intervals&amp;nbsp;(that are given to the client who pays, only) and records the level of activity on the computer. This, in turn, ensures that clients can audit what service providers were doing while they were billing hours for work.&lt;br /&gt;
&lt;br /&gt;
So, I got the data recorded by the oDesk Team client that show when a worker is active. I plotted the number of active workers at different times of the day (time is local to the location of the service provider, and not the global UTC time), for various days of the week. Here is the plot with numbers from the top-7 countries, ranked by number of workers:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-FQsuVWo2KS4/T3uqHP06W2I/AAAAAAAAviY/1di3Ke2dXag/s1600/odesk-active-workers-over-time.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="250" src="http://3.bp.blogspot.com/-FQsuVWo2KS4/T3uqHP06W2I/AAAAAAAAviY/1di3Ke2dXag/s640/odesk-active-workers-over-time.PNG" width="580" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
One thing that is immediately interesting: &lt;b&gt;&lt;span style="color: #990000;"&gt;Philippines never sleeps!&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
All other countries have very natural patterns of being awake and asleep; Philippines is an exception. We see that the minimum for Philippines rarely drops below 5,000 active workers! All other countries (combined!) in their downtime cannot beat Philippines in their low time. The supply of work is very constant over time.&lt;br /&gt;
&lt;br /&gt;
There are a couple of natural break points (see the small dip around lunch time and another one at around dinner time) but even during the (Philippines) night the work keeps going on. In fact, you can see clearly the peak of employment is at around 9pm-10pm in Philippines, which is the time that the East Coast in the US starts working as well. The low point for Philippines is at around 4am-5am their time, which is 4pm-5pm in the East Coast.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;Update&lt;/span&gt;:&lt;/b&gt; A couple of fascinating comments from the &lt;a href="http://news.ycombinator.com/item?id=3794600" target="_blank"&gt;Hacker News thread&lt;/a&gt; for this post:&lt;br /&gt;
&lt;br /&gt;
&lt;blockquote class="tr_bq"&gt;I have cousins that work at help desks in the Philippines, and their work schedules are designed to match US time zones. &lt;b&gt;After work, they hang out at bars with happy hours designed for them - I believe around ten in the morning&lt;/b&gt;. They hang out, then go home to sleep for the rest of the day.&amp;nbsp;Globalisation at work.&lt;/blockquote&gt;&lt;br /&gt;
&lt;blockquote class="tr_bq"&gt;I'm a Filipino Developer. This is actually an alternative for us developers in the Philippines, instead of going abroad working overseas which will be very far from our families. We got a lot of opportunities from foreigners who want to outsource their development projects. This earns us quite substantial income Although it's not as high as when your really working abroad, being with your family and seeing your children grow up mostly makes up for it. &lt;b&gt;Staying up late is not that hard as me myself is most productive at night when kids are asleep. I know most programmers share this work time.&lt;/b&gt;&lt;/blockquote&gt;&lt;br /&gt;
&lt;b&gt;The Data&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
For those that want to play more with the data, here is a link to a Google Spreadsheet. If you want more details or a slightly different view of the data, I would be happy to dig more in the oDesk database. &lt;br /&gt;
&lt;br /&gt;
&lt;div style="text-align: center;"&gt;&lt;iframe frameborder="0" height="300" src="https://docs.google.com/spreadsheet/pub?key=0AjX1e06EhsXSdENLRVR4UzA4Q1QzZ3Y5a2xSOC0zY0E&amp;amp;single=true&amp;amp;gid=0&amp;amp;output=html&amp;amp;widget=true" width="600"&gt;&lt;/iframe&gt;&lt;/div&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;b&gt;What is the application? Real-time human computation&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
So, why do we care that Philippines is awake all the time? The immediate benefit is that getting a team in Philippines can ensure the availability of labor for handling real-time tasks. If you have a human-powered application, you do not want to have any dead periods of time, where the application is slowing down or becomes completely unresponsive. However, by hiring people from Philippines, it is possible to have a "private crowd" available around the clock, by simply asking the Philippines contractors to "show up" at different points during the day/week.&lt;br /&gt;
&lt;br /&gt;
What is the difference with other services? If you hire a big outsourcing company, then the expectation is that they will work during (their) normal business hours, leaving the service down for many hours. On Mechanical Turk, this drop in performance comes naturally. If you restrict your tasks to US only, the speed drops when US goes to sleep. If you run the task on India, the same thing will happen. (Mixing the two crowds tends to result in many complications as the expectations for price are very different and Indians tend to overwhelm tasks that are priced for US workers.)&lt;br /&gt;
&lt;br /&gt;
Overall, Philippines seems to have a nice balance of availability throughout the day, and generally low prices. In terms of quality, things tend to be somewhere between US and India, so careful screening and quality control is important. But for many people experienced with managing crowds, it seems that Philippines is a great source of "crowds."&lt;br /&gt;
&lt;br /&gt;
Myself, I have already put my money where my mouth is, across multiple crowd applications that I have built.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=rJ6cmReJDzo:R64elo_XVuM:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=rJ6cmReJDzo:R64elo_XVuM:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=rJ6cmReJDzo:R64elo_XVuM:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=rJ6cmReJDzo:R64elo_XVuM:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=rJ6cmReJDzo:R64elo_XVuM:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=rJ6cmReJDzo:R64elo_XVuM:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=rJ6cmReJDzo:R64elo_XVuM:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/rJ6cmReJDzo" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7818591148148398683?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/7818591148148398683?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/rJ6cmReJDzo/when-is-world-working-odesk-edition-or.html" title="Philippines: The country that never sleeps (or, When is the world working? The oDesk Edition)" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-FQsuVWo2KS4/T3uqHP06W2I/AAAAAAAAviY/1di3Ke2dXag/s72-c/odesk-active-workers-over-time.PNG" height="72" width="72" /><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/04/when-is-world-working-odesk-edition-or.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0AFR3gyfyp7ImA9WhVQFEs.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-1242349411757255342</id><published>2012-03-26T13:20:00.003-04:00</published><updated>2012-04-03T11:08:36.697-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-03T11:08:36.697-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><title>Mechanical Turk: More SETI@Home and less Amazon Web 
Services</title><content type="html">&lt;div style="text-align: justify;"&gt;
A few days back, I wrote about the&lt;a href="http://www.behind-the-enemy-lines.com/2012/03/unofficial-nist-definition-of-cloud.html" target="_blank"&gt; requirements that labor markets need to satisfy in order to claim that they offer scalable "cloud labor" services&lt;/a&gt;. As a reminder, the characteristics that define cloud services are:&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span style="text-align: justify;"&gt;on-demand self-service&lt;/span&gt;&lt;/li&gt;
&lt;li&gt;&lt;span style="text-align: justify;"&gt;broad access through APIs&lt;/span&gt;&lt;/li&gt;
&lt;li&gt;&lt;span style="text-align: justify;"&gt;resource pooling&lt;/span&gt;&lt;/li&gt;
&lt;li&gt;&lt;span style="text-align: justify;"&gt;rapid elasticity&lt;/span&gt;&lt;/li&gt;
&lt;li&gt;&lt;span style="text-align: justify;"&gt;measured service&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;span style="text-align: justify;"&gt;I used Amazon Mechanical Turk for a first test of these conditions, and the results were:&lt;/span&gt;&lt;br /&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;On-demand self-service&lt;/b&gt;: Yes. We can access the labor pool whenever it is needed.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Broad access through APIs&lt;/b&gt;: Yes. Computers can handle the overall process of hiring, task handling, etc.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;span style="color: #cc0000;"&gt;&lt;b&gt;Resource pooling&lt;/b&gt;: Yes and No. &lt;/span&gt;While there is a pool of workers available, there is no assignment done from the service provider. This implies that there may be nobody willing to work on the posted task and this cannot be inferred before testing the system. It is really up to the workers to decide whether they will serve a particular labor request.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;span style="color: #cc0000;"&gt;&lt;b&gt;Rapid elasticity&lt;/b&gt;: Yes and No. &lt;/span&gt;The scaling &lt;b&gt;&lt;i&gt;out &lt;/i&gt;&lt;/b&gt;capability (increasing rapidly the labor pool) is rather limited. We simply cannot suddenly hire hundreds of workers to work in parallel in a task, for a sustained period of time (workers that do 1-2 tasks and then leave cannot be counted for the purpose of elasticity). As in the case of resource pooling, it is up to the workers to decide whether to work on a task, and it is highly unclear what level of pricing could achieve what level of elasticity.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;span style="color: #cc0000;"&gt;&lt;b&gt;Measured Service&lt;/b&gt;: No. &lt;/span&gt;Quality and productivity measurement is done by the employer side, and there is no SLA with the client that is paying for the provided services, which could guarantee a minimum level of performance.&lt;/li&gt;
&lt;/ul&gt;
&lt;hr style="background-color: #691f01; border-bottom-width: 1px; border-color: initial; border-left-width: 1px; border-right-width: 1px; border-style: initial; border-top-width: 1px; color: #691f01; display: block; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;So, why does MTurk fail these tests?&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The root cause of failure is the voluntary, market-based mechanism for allocating labor to tasks. (Yes, markets are not necessarily efficient, especially when they are not designed properly.)&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The fact that MTurk cannot "forcibly" assign a task to a worker, makes it almost impossible to ever satisfy the requirements for these conditions. If someone wants to solicit a large number of workers (rapid elasticity), it is not clear that the market will have enough participants to satisfy the needs. Even if there are, we do not know the wage that the available workers will require. If, however, there was a guaranteed pool available, with known prices, then MTurk could say what are the limits of elasticity, and how much it would cost. Similarly for pooling.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
In a sense, today's Mechanical Turk is more similar to the SETI@Home in 1999, rather than to EC2 and S3 from Amazon in 2009. Here are the similarities:&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Distributed, voluntarily participating infrastructure&lt;/b&gt;&lt;/li&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;With Amazon Web Services (AWS) such as EC2, S3, etc. there is a single provider of hardware infrastructure, who plans for availability, does capacity planning by upgrading the infrastructure when needed, etc. &lt;/li&gt;
&lt;li style="text-align: justify;"&gt;In SETI@Home, the computation was coming from volunteers that were joining the network at their own will, and could potentially donate time to other projects beyond SETI (e.g., protein folding and others). There was no single provider of hardware capabilities, as in the Amazon case, but rather a distributed, completely heterogeneous infrastructure. &lt;/li&gt;
&lt;li style="text-align: justify;"&gt;On Mechanical Turk (and crowdsourcing in general), every person comes and leaves at will. There is no single agency that hires all the workers and plans for availability, does capacity planning, etc.&lt;/li&gt;
&lt;/ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Diversity of underlying &lt;/b&gt; &lt;b&gt;infrastructure&lt;/b&gt;&lt;/li&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;With EC2 and S3, we have an SLA guarantee for the services we are buying. If we buy 3 m1.medium machines, Amazon provides the memory, cpu speed, and other characteristics of these machines. &lt;/li&gt;
&lt;li style="text-align: justify;"&gt;In SETI@Home, the computation was split into multiple pieces and distributed to a large number of computers, each with different capabilities. Through testing SETI was building profiles of the different machines to potentially allocate data units more efficiently.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;On Mechanical Turk, we observe the same setting today but with human tasks. We have no idea what are the skills of the underlying "human units", unless we probe and test beforehand.&lt;/li&gt;
&lt;/ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;No guarantee of "uptime" (task completion)&lt;/b&gt;&lt;/li&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;With EC2 and S3, we have a reasonable guarantee of uptime: When a service receives a request, we expect that the answer will come back, with probability following the SLA guarantees (which is very high). Very rarely we need to plan for cases where the system is unavailable; such planning is not seen as a common everyday need. &lt;/li&gt;
&lt;li style="text-align: justify;"&gt;In SETI@Home, there was no guarantee that a data unit was ever going to be returned by the client. The client may decide to uninstall the application, switch off the computer, or do any action that could interrupt the computation process. SETI was keeping track of the reliability of the machines and how often they returned their data units back, within a reasonable amount of time.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;On Mechanical Turk, we also need to handle the fact that a task may not be completed after the assignment, may be returned and need to be reposted etc. MTurk keeps track of such failures and keeps statistics about the tasks that were returned and abandoned by each worker.&lt;/li&gt;
&lt;/ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Malicious clients&lt;/b&gt;&lt;/li&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;With EC2 and S3, we have almost a guarantee that the CPU will not misrepresent its capabilities and will always return correct results. Similarly for storage we have a 99.99999% guarantee that the data will not be lost. We may maintain multiple servers for a service, mainly as an attempt to increase reliability and have load balancing, but we start with the understanding that even the first machine will operate on a “best effort” basis and will not behave maliciously. &lt;/li&gt;
&lt;li style="text-align: justify;"&gt;In SETI@Home, there were many attempts from people to game the system and return back non-properly processed data, just to increase their statistics and place in the standings. To avoid malicious clients, SETI was performing the computation multiple times, effectively wasting the available computing capacity for reliability purposes. &lt;/li&gt;
&lt;li style="text-align: justify;"&gt;We observe the same thing with Mechanical Turk. Instead of trusting each individual to do an honest effort, we need to resort to redundancy, gold tests, and so on, effectively wasting capacity. The introduction of "trusted" workers (Mechanical Turk masters) reduces the problem but the fundamental problem is still there.&lt;/li&gt;
&lt;/ul&gt;
&lt;/ul&gt;
&lt;hr style="background-color: #691f01; border-bottom-width: 1px; border-color: initial; border-left-width: 1px; border-right-width: 1px; border-style: initial; border-top-width: 1px; color: #691f01; display: block; height: 2px; text-align: justify;" width="50%" /&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;So, what is the future?&amp;nbsp;&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
The naive solution is to have a "traditional" outsourcing service, sending tasks to a classic &lt;a href="http://en.wikipedia.org/wiki/Business_process_outsourcing" target="_blank"&gt;BPO&lt;/a&gt; company such as &lt;a href="http://en.wikipedia.org/wiki/Tata_Consultancy_Services" target="_blank"&gt;Tata Consulting&lt;/a&gt;, and rely on their reliability and availability guarantees. (Interestingly enough, many of these BPO's use crowdsourcing-like approaches to manage internally their tens of thousands of employees that handle basic tasks.) While I see the appeal, I do not find the solution satisfactory.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Personally, I see a supply side market to emerge in which workers can advertise what they offer and clients can place requests against these services. (&lt;a href="http://fiverr.com/" target="_blank"&gt;Fiverr&lt;/a&gt; is currently offering such a "supply-side" service, which mirrors the "demand-side" service offered by Mechanical Turk.) The service that will&amp;nbsp;successfully&amp;nbsp;merge the two sides and connect efficiently supply and demand will be the winner...&lt;/div&gt;
&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=NcMta9hTt7o:PjV7__qXuL8:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=NcMta9hTt7o:PjV7__qXuL8:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=NcMta9hTt7o:PjV7__qXuL8:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=NcMta9hTt7o:PjV7__qXuL8:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=NcMta9hTt7o:PjV7__qXuL8:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=NcMta9hTt7o:PjV7__qXuL8:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=NcMta9hTt7o:PjV7__qXuL8:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/NcMta9hTt7o" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/1242349411757255342?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/1242349411757255342?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/NcMta9hTt7o/mechanical-turk-more-setihome-and-less.html" title="Mechanical Turk: More SETI@Home and less Amazon Web Services" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/03/mechanical-turk-more-setihome-and-less.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0ACR3k6fip7ImA9WhVQFEs.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-3173950864713452666</id><published>2012-03-22T14:49:00.001-04:00</published><updated>2012-04-03T11:09:26.716-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-03T11:09:26.716-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="conference" /><category scheme="http://www.blogger.com/atom/ns#" term="academia" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><title>ACM EC 2012 Workshops</title><content type="html">Thursday, June 7th, 2012:&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;Workshop on Risk-aversion and Non Expected Utility Theories in Algorithmic Game Theory &amp;amp; Mechanism Design (&lt;a href="http://faculty.cse.tamu.edu/nikolova/Risk-workshop-ACM-EC-2012/"&gt;http://faculty.cse.tamu.edu/nikolova/Risk-workshop-ACM-EC-2012/&lt;/a&gt;) Paper submission deadline: April 10&lt;/li&gt;
&lt;li&gt;Workshop on Social Computing and User Generated Content (&lt;a href="http://yiling.seas.harvard.edu/sc2012/index.html"&gt;http://yiling.seas.harvard.edu/sc2012/index.html&lt;/a&gt;) Paper submission deadline: April 9&lt;/li&gt;
&lt;/ul&gt;
Friday, June 8th, 2012:&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;Ad Auction Workshop (&lt;a href="http://sites.google.com/site/adauctions2012/"&gt;http://sites.google.com/site/adauctions2012/&lt;/a&gt;) Paper submission deadline: April 7&lt;/li&gt;
&lt;li&gt;Workshop on Incentives and Trust in E-Commerce (&lt;a href="http://trust.sce.ntu.edu.sg/wit-ec12/"&gt;http://trust.sce.ntu.edu.sg/wit-ec12/&lt;/a&gt;) Paper submission deadline: April 5&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=bcrgbz1buYY:HztLYDrLQZE:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=bcrgbz1buYY:HztLYDrLQZE:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=bcrgbz1buYY:HztLYDrLQZE:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=bcrgbz1buYY:HztLYDrLQZE:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=bcrgbz1buYY:HztLYDrLQZE:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=bcrgbz1buYY:HztLYDrLQZE:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=bcrgbz1buYY:HztLYDrLQZE:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/bcrgbz1buYY" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/3173950864713452666?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/3173950864713452666?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/bcrgbz1buYY/acm-ec-2012-workshops.html" title="ACM EC 2012 Workshops" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/03/acm-ec-2012-workshops.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D08ERno5fip7ImA9WhVQFEs.&quot;"><id>tag:blogger.com,1999:blog-7118563403027467631.post-4367743877451582754</id><published>2012-03-22T00:33:00.000-04:00</published><updated>2012-04-03T11:10:07.426-04:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-04-03T11:10:07.426-04:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="mechanical turk" /><category scheme="http://www.blogger.com/atom/ns#" term="crowdsourcing" /><category scheme="http://www.blogger.com/atom/ns#" term="wisdom of the crowds" /><category scheme="http://www.blogger.com/atom/ns#" term="research" /><title>The (Unofficial) NIST Definition of Crowdsourcing</title><content type="html">&lt;div style="text-align: justify;"&gt;
A few weeks ago, I was attending the &lt;a href="http://dmlab.cs.umn.edu/SocialMobileCloud/index.html" target="_blank"&gt;NSF Workshop on Social Networks and Mobility in the Cloud&lt;/a&gt;. There,&amp;nbsp;I ran into the &lt;a href="http://csrc.nist.gov/publications/nistpubs/800-145/SP800-145.pdf" target="_blank"&gt;NIST definition of cloud computing&lt;/a&gt;.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
After reading it, I felt that it would be a nice exercise to transform the definition into something similar for the dual area of "cloud labor" (aka crowdsourcing). I found it to be a useful exercise. While the NIST definition is focused and is &amp;nbsp;highlighting features that are commonly available in &lt;i&gt;&lt;b&gt;computing &lt;/b&gt;&lt;/i&gt;services, they do have corresponding interpretations within the framework of "cloud labor". At the same time, we can also see that there are significant differences, as there are fundamental differences between humans and computers.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Anyway, here is my attempt to take the NIST definition, and translate into a similar definition for crowdsourcing. Intentionally, I am plagiarizing the NIST definition, introducing changes only where necessary.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
In the definition, I am trying to use the term "worker" for the person doing the job, the term "client" for the person that is paying for the labor, and "service provider" for the platforms that connect clients and workers.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;span style="color: #990000;"&gt;The (Unofficial) NIST Definition of Cloud Labor / Crowdsourcing&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Cloud labor is a model for enabling convenient, on-demand network access to a (shared) pool of human workers with different skills (e.g., transcribers, translators, developers, virtual assistants, graphic designers, etc) that can be rapidly provisioned and released with minimal management effort or service provider interaction. This cloud model promotes availability and is composed of five essential characteristics, three service models, and four deployment models.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;i&gt;Essential Characteristics&lt;/i&gt;&lt;/b&gt;&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;On-demand self-service&lt;/b&gt;. A client can unilaterally provision labor capabilities, (e.g., as virtual assistants, content moderators, developers, and so on) as needed automatically without requiring human interaction with the service provider.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Broad access&lt;/b&gt;. Capabilities are available and accessed through standard mechanisms that promote use by heterogeneous thin or thick client platforms (e.g., from PhD students hiring for a small survey, to companies such as uTest and TopCoder that engage deeply their workers)&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Resource pooling&lt;/b&gt;. The labor resources are pooled by the service provider to serve multiple clients using a multi-tenant model, with different workers dynamically assigned and reassigned according to employer demand. There is a sense of location and time independence in that the client generally has no control or knowledge over the exact location of the provided labor but may be able to specify location and other desirable qualifications at a higher level of abstraction (e.g., country, language knowledge, or skill proficiency).&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Rapid elasticity&lt;/b&gt;. Labor can be rapidly and elastically provisioned, in some cases automatically, to quickly scale out and rapidly released to quickly scale in. To the client, the labor capabilities available for provisioning often appear to be unlimited and can be purchased in any quantity at any time.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Measured service&lt;/b&gt;. Labor cloud provision systems automatically control and optimize resource use by leveraging a metering capability at some level of abstraction appropriate to the type of service (e.g., content generation, translation, software development, etc). Resource usage can be monitored, controlled, and reported, providing transparency for the service provider, the client, and the worker, so that there is a better understanding of the quality of the provisioned labor services.&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;i&gt;Service Models&lt;/i&gt;&lt;/b&gt;&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Labor Applications/Software as a Service (LSaaS).&lt;/b&gt;&amp;nbsp;The capability provided to the client is to use the provider’s applications running on a cloud-labor infrastructure. The applications are accessible from various client devices through a thin client interface such as a web browser (e.g., web application for ordering content generation, or proofreading, or transcription, or software testing, or ...). The client does not manage or control the underlying cloud labor, with the possible exception of limited user-specific application configuration settings. Effectively, the client only cares about the quality of the provided &lt;i&gt;results &lt;/i&gt;of the labor and does not want to know about the underlying workflows, quality management, etc. [Companies like CastingWords and uTest fall into this category]&lt;/li&gt;
&lt;li&gt;&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Labor Platform as a Service (LPaaS).&lt;/b&gt;&amp;nbsp; The capability provided to the client is to deploy onto the labor pool consumer-created or acquired applications created using programming languages and tools supported by the provider. The client does not manage or control the underlying labor pool, but has control of the overall task execution, including workflows, quality control, etc. The platform provides the necessary infrastructure to support the generation and implementation of the task execution logic.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
[Companies like Humanoid fall into this category]&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;&lt;div style="text-align: justify;"&gt;
&lt;b&gt;Labor Infrastructure as a Service (LIaaS).&lt;/b&gt;&amp;nbsp;The capability provided to the client is to provision labor for the client, who then allocates workers to tasks. The consumer of labor services does not get involved with the recruiting process or the details of payment, but has full control over everything else. Much like the Amazon Web Services approach (use EC2, S3, RDS, etc. to build your app), the service provider just provides raw labor and guarantees that the labor force satisfies a particular SLA (e.g., response time within X minutes, has the skills that are advertised in the resume, etc).&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
[Companies like Amazon Mechanical Turk fall into this category]&amp;nbsp;&lt;/div&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
&lt;i&gt;&lt;b&gt;Deployment Models&lt;/b&gt;&lt;/i&gt;&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Private labor pool&lt;/b&gt;. The labor pool is operated solely for an organization. It may be managed by the organization or a third party and may exist on premise or off premise.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Community labor pool&lt;/b&gt;. The labor pool is shared by several organizations and supports a specific community that has shared concerns (e.g., enthusiasts of an application such as birdwatchers, or volunteers for a particular cause such as disaster management). It may be managed by the organizations or a third party and may exist on premise or off premise.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Public labor pool&lt;/b&gt;. The labor pool is made available to the general public or a large industry group and is provisioned by an organization (or coalition of organizations) selling labor services.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Hybrid labor pool&lt;/b&gt;. The labor pool is a composition of two or more pools (private, community, or public) that remain unique entities but are bound together by standardized or proprietary technology that enables data and application portability (e.g., handling activity bursts by fetching public labor to support the private labor pool of a company).&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;i&gt;Differences between a Computing and Labor Cloud&lt;/i&gt;&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;i&gt;&lt;br /&gt;
&lt;/i&gt;&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style="text-align: justify;"&gt;
The NIST definition highlights some of the key aspects of a "cloud labor" service. However, by&amp;nbsp;omission,&amp;nbsp;it also illustrates some key differences that we need to take into consideration when thinking about "cloud labor" services.&lt;/div&gt;
&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Need for training and lack of instantaneous duplication&lt;/b&gt;. In the computing cloud we can pre-configure computing units with a specific software installation (e.g. with a LAMP stack) and then replicate as necessary to meet the needs of the application. With human workers, the equivalent of the software installation part is training. The key difference is that training takes time and we cannot “store the image and replicate as needed.” So, for cases where a client wants the workers to have a task-specific training, we will observe a latency in starting the task completion equal to the time necessary for training the worker to learn the requirements specific to the given task. When training is specific to the client, this latency can be significant. When training is transferable across clients, things are expected to be better, assuming a well-functioning and designed market.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Allocation over space&lt;/b&gt;. In computing cloud we can request allocation of services in different geographical locations, but this is a &lt;i&gt;desirable &lt;/i&gt;and&lt;i&gt; not a key feature&lt;/i&gt;. With human labor though, &lt;i&gt;especially when it contains an offline component&lt;/i&gt;, we may need to explicitly request specific geographic regions.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Allocation over time&lt;/b&gt;. With computing services, time is of little importance, excluding the normal part of load fluctuations over time of day, and days of the week. Furthermore, we can easily operate a computing device 24/7. With human labor, this is not possible. Not only do we have to face the fact that humans get tired, but humans are also typically available for work during the “working hours” of their time zone. Since we cannot take a person and replicate across time zones, this becomes a crucial difference when we expect real-time on-demand labor services around the clock.&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
&lt;b&gt;&lt;i&gt;How Mature are Today's Online Labor Markets?&lt;/i&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
If we examine the existing “labor cloud,” we will see that, of the characteristics that define the computing cloud (on-demand self-service, broad access through APIs, resource pooling, rapid elasticity, and measured service), only a subset is available through&amp;nbsp;today's&amp;nbsp;labor platforms.&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
Take the case of Amazon Mechanical Turk:&lt;/div&gt;
&lt;ul&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;On-demand self-service&lt;/b&gt;: Yes.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Broad access through APIs&lt;/b&gt;: Yes&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Resource pooling&lt;/b&gt;: Yes and No. While there is a pool of workers available, there is no assignment done from the service provider. This implies that there may be nobody willing to work on the posted task and this cannot be inferred before testing the system. It is really up to the workers to decide whether they will serve a particular labor request.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Rapid elasticity&lt;/b&gt;: Yes and No. The scaling out capability is rather limited (scaling in is trivially easy). As in the case of resource pooling, it is up to the workers to decide whether to work on a task.&lt;/li&gt;
&lt;li style="text-align: justify;"&gt;&lt;b&gt;Measured Service&lt;/b&gt;: No. Quality and productivity measurement is done by the employer side.&lt;/li&gt;
&lt;/ul&gt;
&lt;div style="text-align: justify;"&gt;
2 yes, 1 no, and 2 "yes and no". Glass half-full? Glass half-empty? I will go for the half-full interpretation for now but we can see that we still have a long way to go.&lt;/div&gt;&lt;div class="feedflare"&gt;
&lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=xI86IwITNNc:W_IYzxbeJ3c:yIl2AUoC8zA"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=yIl2AUoC8zA" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=xI86IwITNNc:W_IYzxbeJ3c:BZkkm1Y4jn8"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=xI86IwITNNc:W_IYzxbeJ3c:BZkkm1Y4jn8" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=xI86IwITNNc:W_IYzxbeJ3c:F7zBnMyn0Lo"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?i=xI86IwITNNc:W_IYzxbeJ3c:F7zBnMyn0Lo" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=xI86IwITNNc:W_IYzxbeJ3c:qj6IDK7rITs"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=qj6IDK7rITs" border="0"&gt;&lt;/img&gt;&lt;/a&gt; &lt;a href="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?a=xI86IwITNNc:W_IYzxbeJ3c:I9og5sOYxJI"&gt;&lt;img src="http://feeds.feedburner.com/~ff/AComputerScientistInABusinessSchool?d=I9og5sOYxJI" border="0"&gt;&lt;/img&gt;&lt;/a&gt;
&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/AComputerScientistInABusinessSchool/~4/xI86IwITNNc" height="1" width="1"/&gt;</content><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/4367743877451582754?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/7118563403027467631/posts/default/4367743877451582754?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/AComputerScientistInABusinessSchool/~3/xI86IwITNNc/unofficial-nist-definition-of-cloud.html" title="The (Unofficial) NIST Definition of Crowdsourcing" /><author><name>Panos Ipeirotis</name><uri>https://plus.google.com/103666871486129948108</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh4.googleusercontent.com/-dIWj8iHQSKU/AAAAAAAAAAI/AAAAAAAA0Ro/MROYPWvY51A/s512-c/photo.jpg" /></author><feedburner:origLink>http://www.behind-the-enemy-lines.com/2012/03/unofficial-nist-definition-of-cloud.html</feedburner:origLink></entry></feed>
