<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/atom10full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:blogger="http://schemas.google.com/blogger/2008" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" gd:etag="W/&quot;A0ENQ384eSp7ImA9WhFTGUk.&quot;"><id>tag:blogger.com,1999:blog-36589303</id><updated>2013-06-11T13:21:32.131+02:00</updated><category term="Personal" /><category term="Phishing" /><category term="Evaluation" /><category term="Robótica" /><category term="English" /><category term="Captcha" /><category term="Information Retrieval" /><category term="Machine Learning" /><category term="WEKA" /><category term="Imagen" /><category term="Tutorial" /><category term="Security" /><category term="Programming" /><category term="Web Filtering" /><category term="Security as a Service" /><category term="Opinion" /><category term="Text Mining" /><category term="Children Protection" /><category term="Software libre" /><category term="Resources" /><category term="Opensource" /><category term="Virus" /><category term="Smartphone" /><category term="Evaluacion" /><category term="Privacy" /><category term="Eventos" /><category term="Parental Control" /><category term="Humor" /><category term="Events" /><category term="Spam" /><category term="Recuperación de Información" /><category term="NLP" /><category term="Internet" /><category term="Search Engines" /><category term="Cloud Computing" /><category term="Data Mining" /><category term="Imaging" /><category term="Culture" /><category term="Social Networks" /><category term="Online Advertising" /><category term="Privacidad" /><category term="Papers" /><category term="Protección del 
menor" /><category term="Opinion Mining" /><category term="Cultura" /><category term="Control parental" /><category term="Redes Sociales" /><category term="Recommender Systems" /><category term="Trivia" /><category term="Opinión" /><category term="Seguridad" /><category term="Recommendation" /><category term="Tecnología" /><category term="CFP" /><category term="Biomedicine" /><title>Nihil Obstat</title><subtitle type="html">&lt;b&gt;Blog de José María Gómez Hidalgo&lt;/b&gt;&lt;br&gt;&lt;br&gt;
Mis reflexiones sobre tecnología e Internet, seguridad e inteligencia artificial.&lt;br&gt;
My opinions about technology, Internet, security and Artificial Intelligence</subtitle><link rel="http://schemas.google.com/g/2005#feed" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/posts/default" /><link rel="alternate" type="text/html" href="http://jmgomezhidalgo.blogspot.com/" /><link rel="next" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default?start-index=26&amp;max-results=25&amp;redirect=false&amp;v=2" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><generator version="7.00" uri="http://www.blogger.com">Blogger</generator><openSearch:totalResults>380</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/atom+xml" href="http://feeds.feedburner.com/NihilObstat" /><feedburner:info uri="nihilobstat" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><entry gd:etag="W/&quot;A0ENQ38_fyp7ImA9WhFTGUk.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-4624770148145284411</id><published>2013-06-11T13:21:00.001+02:00</published><updated>2013-06-11T13:21:32.147+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-06-11T13:21:32.147+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Opinion Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" 
/><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Baseline Sentiment Analysis with WEKA</title><content type="html">&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/opinion.mining.headpic.png" style="DISPLAY: inline" height="143" width="447"/&gt;&lt;/p&gt;
&lt;p&gt;&lt;a href="http://en.wikipedia.org/wiki/Sentiment_analysis" target="_blank"&gt;Sentiment Analysis (and/or Opinion Mining)&lt;/a&gt; is one of the hottest topics in &lt;a href="http://en.wikipedia.org/wiki/Natural_language_processing" target="_blank"&gt;Natural Language Processing&lt;/a&gt; nowadays. The task, defined in a simplistic way, consists of determining the polarity of a text utterance according to the opinion or sentiment of the speaker or writer, as positive or negative. This task has multiple applications, including e.g. Customer Relationship Management or predicting political elections.&lt;/p&gt;
&lt;p&gt;While initial results dating back to early 2000 seem very promising, it is not such a simple task. We face challenges ranging from &lt;a href="http://deepthoughtinc.com/wp-content/uploads/2011/01/Twitter-as-a-Corpus-for-Sentiment-Analysis-and-Opinion-Mining.pdf" target="_blank"&gt;the informal Twitter language&lt;/a&gt; to the fact that &lt;a href="http://times.cs.uiuc.edu/czhai/pub/www07-sent.pdf" target="_blank"&gt;opinions can be faceted&lt;/a&gt; (for instance, I may like the software but not the hardware of a device), or &lt;a href="http://www.cs.uic.edu/~liub/FBS/fake-reviews.html" target="_blank"&gt;opinion spam and fake reviews&lt;/a&gt;, along with traditional and complex problems in Natural Language Processing such as irony, sarcasm or negation. For a good overview of the task, please check &lt;a href="http://www.cs.cornell.edu/home/llee/opinion-mining-sentiment-analysis-survey.html" target="_blank"&gt;the survey paper on opinion mining and sentiment analysis by Bo Pang and Lillian Lee&lt;/a&gt;. A more practical overview is the &lt;a href="http://alias-i.com/lingpipe/demos/tutorial/sentiment/read-me.html" target="_blank"&gt;Sentiment Tutorial with LingPipe by Alias-i&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;In general, there are two main approaches to this task:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Counting and/or weighting sentiment-related words that have been evaluated and tagged by experts, conforming a lexical collection like &lt;a href="http://sentiwordnet.isti.cnr.it/" target="_blank"&gt;SentiWordNet&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Learning a text classifier on a previously labelled text collection, like e.g. the &lt;a href="http://www.sfu.ca/~mtaboada/research/SFU_Review_Corpus.html" target="_blank"&gt;SFU Review Corpus&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The SentiWordNet home page offers &lt;a href="http://sentiwordnet.isti.cnr.it/code/SWN3.java" target="_blank"&gt;a simple Java program that follows the first approach&lt;/a&gt;. I will follow the second one in order to show how to use an essential WEKA text mining class (&lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/converters/TextDirectoryLoader.html" target="_blank"&gt;weka.core.converters.TextDirectoryLoader&lt;/a&gt;&lt;/code&gt;), and to provide another example of the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;weka.filters.unsupervised.attribute.StringToWordVector&lt;/a&gt;&lt;/code&gt; class.&lt;/p&gt;
&lt;p&gt;I will follow the process outlined in &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/05/language-identification-as-text.html" target="_blank"&gt;the previous post about Language Identification using WEKA&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Data Collection and Preprocessing&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;For this demonstration, I will make use of a relatively small but interesting dataset named &lt;a href="http://www.sfu.ca/~mtaboada/research/SFU_Review_Corpus.html" target="_blank"&gt;the SFU Review Corpus&lt;/a&gt;. This corpus consists of 400 reviews in English extracted from the &lt;em&gt;Epinions&lt;/em&gt; website in 2004 divided in 25 positive and 25 negative reviews for each of 8 product categories (Books, Cars, Computers, etc.). It also contains 400 reviews in Spanish extracted from &lt;em&gt;Ciao.es&lt;/em&gt; divided in the same categories (except for the Cookware category in English, which --more or less-- maps to Lavadoras --Washing Machines-- in Spanish).&lt;/p&gt;
&lt;p&gt;The original format of the collections is one directory per category of products, each containing 25 positive reviews with the word "yes" in the file name and 25 negative reviews with the word "no" in the file name. Unfortunately, this format cannot be used directly in WEKA, but a couple of handy scripts transform it into a new format: two directories, one including the positive reviews (directory &lt;code&gt;yes&lt;/code&gt;), and the other one including the negative reviews (directory &lt;code&gt;no&lt;/code&gt;). I have kept the category in the name of the files (with patterns like &lt;code&gt;bookyes1.txt&lt;/code&gt;) in order to allow others to make a more detailed analysis per category.&lt;/p&gt;
&lt;p&gt;Comparing the structure of the original and the new format of the text collections:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/structure.collections.sfu.opinion.mining.png" style="DISPLAY: inline" height="202" width="180"/&gt;&lt;/p&gt;
&lt;p&gt;In order to construct an &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html" target="_blank"&gt;ARFF&lt;/a&gt; file from this structure, we can use the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/converters/TextDirectoryLoader.html" target="_blank"&gt;weka.core.converters.TextDirectoryLoader&lt;/a&gt;&lt;/code&gt; class, which is an evolution of a previously existing helper class named &lt;code&gt;&lt;a href="http://weka.wikispaces.com/Text+categorization+with+WEKA" target="_blank"&gt;TextDirectoryToArff.java&lt;/a&gt;&lt;/code&gt; and available at &lt;a href="http://weka.wikispaces.com/" target="_blank"&gt;WEKA Documentation at wikispaces&lt;/a&gt;. Using this class is as simple as issuing the next command:&lt;/p&gt;
&lt;blockquote style="MARGIN-RIGHT: 0px" dir="ltr"&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.core.converters.TextDirectoryLoader -dir SFU_Review_Corpus_WEKA &amp;gt; SFU_Review_Corpus.arff&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;You have to call this command at the parent directory of &lt;code&gt;SFU_Review_Corpus_WEKA&lt;/code&gt;, and the parameter &lt;code&gt;-dir&lt;/code&gt; sets up the input directory. This class expects to have a single directory containing a directory per class value (&lt;code&gt;yes&lt;/code&gt; and &lt;code&gt;no&lt;/code&gt; in our case), which in turn should contain a number of files pertaining to the corresponding classes. As the output of this command goes to the standard output, I have to redirect it to a file.&lt;/p&gt;
&lt;p&gt;I have left the output of the execution of this command for both the English (&lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/OpinionMining/SFU_Review_Corpus.arff" target="_blank"&gt;SFU_Review_Corpus.arff&lt;/a&gt;&lt;/code&gt;) and the Spanish (&lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/OpinionMining/SFU_Spanish_Review.arff" target="_blank"&gt;SFU_Spanish_Review.arff&lt;/a&gt;&lt;/code&gt;) collections at &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/OpinionMining" target="_blank"&gt;the OpinionMining folder&lt;/a&gt; of &lt;a href="https://github.com/jmgomezh/tmweka" target="_blank"&gt;my GitHub repository&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Data Analysis&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Previous models in my blog posts have been based on a relatively simple representation of texts as sequences of words. However, a trivial analysis of the problem easily leads us to think that multi-word expressions (e.g. "very bad" vs. "bad", or "a must" vs. "I must") can lead to better predictors of user sentiment or opinion about an item. Because of this, we will compare word n-grams vs. single words (or unigrams). As a basic setup, I propose to compare word unigrams, 3-grams, and 1-to-3-grams. The latter representation will include uni- to 3-grams with the hope of getting the best of all of them.&lt;/p&gt;
&lt;p&gt;Keeping in mind that capitalization may matter in this problem ("BAD" is worse than "bad"), and that we can use standard punctuation (for each of the languages) as texts are long comments (several paragraphs each), I derive the following calls to the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;weka.filters.unsupervised.attribute.StringToWordVector&lt;/a&gt; class:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -O -tokenizer "weka.core.tokenizers.NGramTokenizer -delimiters \"&lt;a&gt;\\\\W\&lt;/a&gt;" -min 1 -max 1" -W 10000000 -i SFU_Review_Corpus.arff -o SFU_Review_Corpus.vector.uni.arff
&lt;br/&gt;
$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -O -tokenizer "weka.core.tokenizers.NGramTokenizer -delimiters \"&lt;a&gt;\\\\W\&lt;/a&gt;" -min 3 -max 3" -W 10000000 -i SFU_Review_Corpus.arff -o SFU_Review_Corpus.vector.tri.arff
&lt;br/&gt;
$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -O -tokenizer "weka.core.tokenizers.NGramTokenizer -delimiters \"&lt;a&gt;\\\\W\&lt;/a&gt;" -min 1 -max 3" -W 10000000 -i SFU_Review_Corpus.arff -o SFU_Review_Corpus.vector.unitri.arff&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;We follow the notation &lt;code&gt;vector.uni&lt;/code&gt; to denote that the dataset is vectorized and that we are using word unigrams, and so on. The calls for the Spanish collection are similar to these ones.&lt;/p&gt;
&lt;p&gt;The most important thing in these calls is that we are no longer using the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/tokenizers/WordTokenizer.html" target="_blank"&gt;weka.core.tokenizers.WordTokenizer&lt;/a&gt;&lt;/code&gt; class. Instead, we are using &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/tokenizers/NGramTokenizer.html" target="_blank"&gt;weka.core.tokenizers.NGramTokenizer&lt;/a&gt;&lt;/code&gt;, which uses the options &lt;code&gt;-min&lt;/code&gt; and &lt;code&gt;-max&lt;/code&gt; to set the minimum and maximum size of the n-grams. But the most important thing is that there is a major difference between both classes, regarding the usage of delimiters:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The &lt;code&gt;weka.core.tokenizers.WordTokenizer&lt;/code&gt; class uses the deprecated Java class &lt;code&gt;&lt;a href="http://docs.oracle.com/javase/6/docs/api/java/util/StringTokenizer.html" target="_blank"&gt;java.util.StringTokenizer&lt;/a&gt;&lt;/code&gt; , even in the latest versions of the WEKA package (as of the day of this writing). In &lt;code&gt;StringTokenizer&lt;/code&gt;, the delimiters are the characters used as "spaces" to tokenize the input string: white space, punctuation marks, etc. So you have to explicitly define which will be the "spaces" in your text.&lt;/li&gt;
&lt;li&gt;The &lt;code&gt;weka.core.tokenizers.NGramTokenizer&lt;/code&gt; class uses the recommended Java String method &lt;code&gt;&lt;a href="http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#split(java.lang.String)" target="_blank"&gt;String[] split(String regex)&lt;/a&gt;&lt;/code&gt;, in which the argument (and thus the delimiters string) is a &lt;a href="http://en.wikipedia.org/wiki/Regular_expression" target="_blank"&gt;Regular Expression&lt;/a&gt; (regex) in Java. The text is split into tokens separated by substrings that match the regex, so you can use all the power of regexes including e.g. special codes for characters. In this case I am using the code &lt;code&gt;\W&lt;/code&gt; which denotes any non-word character, in order to get only alpha-numeric character sequences.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;After splitting the text into word n-grams (or more properly, after representing the texts as term-weight vectors in our Vector Space Model), we may want to examine which n-grams are most predictive. As &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/05/language-identification-as-text.html" target="_blank"&gt;in the Language Identification post&lt;/a&gt;, we make use of the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/supervised/attribute/AttributeSelection.html" target="_blank"&gt;weka.filters.supervised.attribute.AttributeSelection&lt;/a&gt;&lt;/code&gt; class:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.filters.supervised.attribute.AttributeSelection -c 1 -E weka.attributeSelection.InfoGainAttributeEval -S "weka.attributeSelection.Ranker -T 0.0" -i SFU_Review_Corpus.vector.uni.arff -o SFU_Review_Corpus.vector.uni.ig0.arff
&lt;br/&gt;
$&amp;gt; java weka.filters.supervised.attribute.AttributeSelection -c 1 -E weka.attributeSelection.InfoGainAttributeEval -S "weka.attributeSelection.Ranker -T 0.0" -i SFU_Review_Corpus.vector.tri.arff -o SFU_Review_Corpus.vector.tri.ig0.arff
&lt;br/&gt;
$&amp;gt; java weka.filters.supervised.attribute.AttributeSelection -c 1 -E weka.attributeSelection.InfoGainAttributeEval -S "weka.attributeSelection.Ranker -T 0.0" -i SFU_Review_Corpus.vector.unitri.arff -o SFU_Review_Corpus.vector.unitri.ig0.arff&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;After the selection of the most predictive n-grams, we get the following statistics in the test collections:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/opinion.mining.term.stats.png" style="DISPLAY: inline" height="171" width="240"/&gt;&lt;/p&gt;
&lt;p&gt;The percentages in rows 3, 6 and 9 measure the aggressiveness of feature selection. Overall, both collections have comparable statistics (in the same order of magnitude). Original unigrams are quite similar, but bigrams and trigrams are fewer in Spanish (despite the fact that there are more isolated words -- unigrams). Selecting n-grams with Information Gain is a bit more aggressive in Spanish for unigrams and possibly bigrams, but less so for trigrams.&lt;/p&gt;
&lt;p&gt;Adding bigrams and trigrams to the representation substantially increases the number of predictive features (from 4 to 5 times). However, only trigrams result in a little increment of features, so bigrams will play a role here. The number of features is quite handy, and allows us to make quick experiments.&lt;/p&gt;
&lt;p&gt;According to my previous post on &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;setting up experiments with WEKA text classifiers and how to chain filters and classifiers&lt;/a&gt;, you must note that these are not the final features if we configure a cross-validation experiment -- we have to chain the filters (&lt;code&gt;StringToWordVector&lt;/code&gt; and &lt;code&gt;AttributeSelection&lt;/code&gt;) and the classifier in order to perform a valid experiment, as the features for each fold should be different.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Experiments and Results&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;In order to simplify the example, and expecting to get good results, we will use &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/05/language-identification-as-text.html" target="_blank"&gt;the same algorithms we used in the Language Identification problem&lt;/a&gt;. These are: Naive Bayes (NB, &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/bayes/NaiveBayes.html" target="_blank"&gt;weka.classifiers.bayes.NaiveBayes&lt;/a&gt;&lt;/code&gt;), PART (&lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/rules/PART.html" target="_blank"&gt;weka.classifiers.rules.PART&lt;/a&gt;&lt;/code&gt;), J48 (&lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/trees/J48.html" target="_blank"&gt;weka.classifiers.trees.J48&lt;/a&gt;&lt;/code&gt;), k-Nearest Neighbors (&lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/lazy/IBk.html" target="_blank"&gt;weka.classifiers.lazy.IBk&lt;/a&gt;&lt;/code&gt;) with k = 1,3,5, and Support Vector Machines (&lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/functions/SMO.html" target="_blank"&gt;weka.classifiers.functions.SMO&lt;/a&gt;&lt;/code&gt;); all of them with the default options, except for kNN which uses 1, 3 and 5 neighbors. I am testing the three proposed representations (based on unigrams, trigrams and 1-3grams) by 10-fold cross-validation. An example experiment command line is the following one:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.classifiers.meta.FilteredClassifier -F "weka.filters.MultiFilter -F \"weka.filters.unsupervised.attribute.StringToWordVector -O -tokenizer &lt;a&gt;\\\"weka.core.tokenizers.NGramTokenizer&lt;/a&gt; -delimiters &lt;a&gt;\\\\\\\"\\\\\\\W\\\\\\\&lt;/a&gt;" -min 1 -max 1\\\" -W 10000000\" -F \"weka.filters.supervised.attribute.AttributeSelection -E weka.attributeSelection.InfoGainAttributeEval -S &lt;a&gt;\\\"weka.attributeSelection.Ranker&lt;/a&gt; -T 0.0\\\"\"" -W weka.classifiers.bayes.NaiveBayes -v -i -t SFU_Review_Corpus.arff &amp;gt; tests/uniNB.txt&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;You can change the size of n-grams with the &lt;code&gt;-min&lt;/code&gt; and &lt;code&gt;-max&lt;/code&gt; parameters. Also, you can change the learning algorithm with the most external &lt;code&gt;-W&lt;/code&gt; option. I am storing the results in a &lt;code&gt;tests&lt;/code&gt; folder, in files with the convention &lt;code&gt;&amp;lt;rep&amp;gt;&amp;lt;alg&amp;gt;.txt&lt;/code&gt;. The results of this test for the English language collection are the following ones:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/opinion.mining.results.english.png" style="DISPLAY: inline" height="376" width="338"/&gt;&lt;/p&gt;
&lt;p&gt;Considering the class &lt;code&gt;yes&lt;/code&gt; (positive sentiment) as the positive class, in each column we show the True Positives (hits on the &lt;code&gt;yes&lt;/code&gt; class), False Positives (members of the &lt;code&gt;no&lt;/code&gt; class mistakenly classified as &lt;code&gt;yes&lt;/code&gt;), False Negatives (members of the &lt;code&gt;yes&lt;/code&gt; class mistakenly classified as &lt;code&gt;no&lt;/code&gt;) and True Negatives (hits on the &lt;code&gt;no&lt;/code&gt; class); along with the &lt;a href="http://datamin.ubbcluj.ro/wiki/index.php/Evaluation_methods_in_text_categorization" target="_blank"&gt;macro-averaged&lt;/a&gt; &lt;a href="http://en.wikipedia.org/wiki/F1_score" target="_blank"&gt;F1&lt;/a&gt; (standard average F1 over both classes) and the general &lt;a href="http://en.wikipedia.org/wiki/Accuracy_and_precision" target="_blank"&gt;accuracy&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Additionally, the results for the Spanish language collection are the following ones:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/opinion.mining.results.spanish.png" style="DISPLAY: inline" height="376" width="341"/&gt;&lt;/p&gt;
&lt;p&gt;So these are the results. Let us start the analysis...&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Results Analysis&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;We can perform an analysis regarding different aspects:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Which is the overall performance?&lt;/li&gt;
&lt;li&gt;Which is the performance when comparing different languages?&lt;/li&gt;
&lt;li&gt;Which are the best learning algorithms?&lt;/li&gt;
&lt;li&gt;What effect do different text representations have on the classifier performance?&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;All in all, and taking into account that class balance is 50% (thus the accuracy of a trivial acceptor, a trivial rejector or a random classifier would be 50%), most of the classifiers beat this baseline but not by a wide margin, and even the best one among all algorithms, languages and representations (SVMs on English 1-to-3-grams) reaches only a modest 71% -- far from a satisfying 90% or over. Let me remind you that we are facing a relatively simple problem -- long, few texts, and a binary classification. Most approaches in the literature get much better results in similar setups.&lt;/p&gt;
&lt;p&gt;Results are better for English than for Spanish, comparing one on one. I will check the representations used in Spanish, for instance listing the first 20 n-grams for each representation, in order to explain it:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/opinion.mining.top.spanish.terms.png" style="DISPLAY: inline" height="339" width="290"/&gt;&lt;/p&gt;
&lt;p&gt;Some of the n-grams (highlighted in &lt;em&gt;italics&lt;/em&gt;) are just incorrect, because of the incorrect recognition of accents due to the inappropriate pattern I have used in the tokenization step. The tokenizer makes use of the string "&lt;code&gt;\W&lt;/code&gt;" in order to recognize alphanumeric strings -- which in Java do not include vowels with accents ("á", "é", "í", "ó", "ú") and other language-specific symbols (e.g. "ñ"). Most of the n-grams are just not opinionated words or n-grams; instead, they are either intensifiers (like e.g. "muy" -- "very") or just contingent (dependent on the training collection, e.g. "en el taller" -- "in the garage"; "tarjeta de memoria" -- "storage card"). Those clearly opinionated words are highlighted in &lt;strong&gt;boldface&lt;/strong&gt;. There are very few of them. So for this issue, we can conclude that the training collection is too small.&lt;/p&gt;
&lt;p&gt;If we examine the performance of different classifiers, we can cluster them in three groups: top performers (SVMs, NB), medium performers (PART, J48) and losers for this problem (kNN). These groups are intuitive:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Both SVMs and NB have often demonstrated their high performance in sparse datasets, and in text classification problems in particular. They both build a linear classifier with weights (or probabilities) for each of the features. Linear classifiers perform well here given that the dataset is built on representations that clearly promote over-fitting the dataset, as we have seen that many of the most predictive n-grams are collection-dependent.&lt;/li&gt;
&lt;li&gt;Both PART and J48 (C4.5) are based on reducing error by progressively partitioning the dataset according to tests on the most predictive features. But the predictive features we have for such a small collection are not very good, indeed.&lt;/li&gt;
&lt;li&gt;All versions of kNN perform very badly, most likely because the dataset is sparse and relatively small.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;However, we have to keep in mind that we have used the algorithms with their default configurations. For instance, kNN allows using the cosine similarity instead of the &lt;a href="http://en.wikipedia.org/wiki/Euclidean_distance" target="_blank"&gt;Euclidean distance&lt;/a&gt; -- the &lt;a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank"&gt;cosine similarity&lt;/a&gt; being much better for text classification problems, as demonstrated many times during 50 years of research in &lt;a href="http://en.wikipedia.org/wiki/Information_retrieval" target="_blank"&gt;Information Retrieval&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;And regarding dataset representations, the behavior is not uniform -- we do not systematically get better results with one representation in comparison with the others. In general, 1-to-3-grams perform better than the other representations in English, while unigrams are best in Spanish, and trigrams is most often the worst representation for both languages. If we focus on top performing classifiers (NB and SVMs), this latter comment is always true. In consequence, trigrams have --to some extent-- demonstrated their power in English (as a complement to uni- and bigrams), but not in Spanish (but knowing that the representation is incorrect because of character encoding).&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Concluding Remarks&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;So all in all, we have a baseline learning-based method for Sentiment Analysis in English (and probably in Spanish, after correcting the representation), which is -- not surprisingly -- based on 1-to-3-grams and Support Vector Machines. And it is a baseline because its performance is relatively poor (with an accuracy of 71%), and we have not taken full advantage of the configuration, text representation and other parameters yet.&lt;/p&gt;
&lt;p&gt;After this long (again!) post, I propose the next steps -- some of them left for the reader as an exercise:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Build a Java class that classifies text files according to their sentiment, for English at least, taking my previous post on Language Identification as an example -- left for the reader.&lt;/li&gt;
&lt;li&gt;Test other algorithms, and in particular: play with SVM configuration, and add Boosting (using &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/meta/AdaBoostM1.html" target="_blank"&gt;weka.classifiers.meta.AdaBoostM1&lt;/a&gt;&lt;/code&gt;) to Naive Bayes -- left for the reader.&lt;/li&gt;
&lt;li&gt;Check differences of accuracy in terms of product type -- cars, movies, etc. -- left for the reader.&lt;/li&gt;
&lt;li&gt;Improve the Spanish language representation using the appropriate regex in the tokenizer to cover Spanish letters and accents -- I will take this one myself.&lt;/li&gt;
&lt;li&gt;Check the accuracy of the &lt;a href="http://sentiwordnet.isti.cnr.it/code/SWN3.java" target="_blank"&gt;basic keyword-based algorithm&lt;/a&gt; available in the &lt;a href="http://sentiwordnet.isti.cnr.it/" target="_blank"&gt;SentiWordNet page&lt;/a&gt; -- I will take this one as well.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;So that is all for the moment. You can expect one or more posts from me on this hot topic. Finally, thanks for reading, and please feel free to leave a comment if you think I can improve this article, or you have questions or suggestions for further articles on these topics!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/6VmUlmOr-U4" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/4624770148145284411/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=4624770148145284411&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/4624770148145284411?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/4624770148145284411?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/6VmUlmOr-U4/baseline-sentiment-analysis-with-weka.html" title="Baseline Sentiment Analysis with WEKA" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/06/baseline-sentiment-analysis-with-weka.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0UBQXo5fyp7ImA9WhBaE08.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-7669228855049331096</id><published>2013-05-23T18:07:00.001+02:00</published><updated>2013-05-23T18:07:30.427+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-23T18:07:30.427+02:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Resources" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Compilation of Resources for Text-based Age Detection</title><content type="html">&lt;p&gt;&lt;em&gt;Text-based age detection&lt;/em&gt; consists of estimating the age of a user according to the kind of texts he/she writes. This task has been attracting some attention in recent years, as for instance it promises to add &lt;em&gt;one of the most interesting demographic features required in ad targeting&lt;/em&gt;. There is even an online application, &lt;a href="http://www.tweetgenie.nl/" target="_blank"&gt;TweetGenie&lt;/a&gt;, which guesses the age of a Twitter user -- it works for Dutch and English.&lt;/p&gt;
&lt;p&gt;Text-based age detection is a text classification task which has close relation with others like genre detection or authorship attribution, as it should be based on stylistic features (e.g. usage of capitalization, average word length, frequencies of prepositions, or even the usage of emoticons) instead of on content bearing words (mostly nouns and verbs) like e.g. in topical text categorization. However, this does not mean that a pure word-based learning would not be effective.&lt;/p&gt;
&lt;p&gt;A particular feature of this task is that &lt;em&gt;it can be approached as classification if ages are divided in ranges, or as regression&lt;/em&gt; if we try to approach the exact age of the user.&lt;/p&gt;
&lt;p&gt;There is a currently ongoing scientific competition on this topic, namely the &lt;a href="http://www.uni-weimar.de/medien/webis/research/events/pan-13/pan13-web/author-profiling.html" target="_blank"&gt;Author Profiling task&lt;/a&gt; at the &lt;a href="http://pan.webis.de/" target="_blank"&gt;9th evaluation lab on uncovering plagiarism, authorship, and social software misuse (PAN 2013)&lt;/a&gt;. With this competition adding new text collections, we have the following resources for trying and testing our approaches to text-based age detection:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The &lt;a href="http://www.uni-weimar.de/medien/webis/research/events/pan-13/pan13-web/author-profiling.html" target="_blank"&gt;PAN 2013 Training Corpus for Author Profiling Task&lt;/a&gt;, consisting of a big number of posts and chats from three age ranges in Spanish and English.&lt;/li&gt;
&lt;li&gt;The &lt;a href="http://u.cs.biu.ac.il/~koppel/BlogCorpus.htm" target="_blank"&gt;Blog Authorship Corpus&lt;/a&gt;, referenced in PAN, consisting of a big number of blog posts from three age ranges in English.&lt;/li&gt;
&lt;li&gt;The &lt;a href="http://faculty.nps.edu/cmartell/NPSChat.htm" target="_blank"&gt;NPS Chat Corpus&lt;/a&gt;, consisting of a relatively small number of chats from five age ranges in English (&lt;a href="http://nltk.org/nltk_data/" target="_blank"&gt;download from the NLTK corpora page&lt;/a&gt; or pay the LDC).&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;For your comfort, I summarize some statistics about the collections:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/agedetection.corpora.statistics.png" style="WIDTH: 476px; DISPLAY: inline; HEIGHT: 204px" height="204" width="476"/&gt;&lt;/p&gt;
&lt;p&gt;And some notes on the information available in each collection:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/agedetection.corpora.description.png" style="WIDTH: 447px; DISPLAY: inline; HEIGHT: 159px" height="159" width="447"/&gt;&lt;/p&gt;
&lt;p&gt;The following papers can be of interest in order to avoid repeating others' work.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). &lt;strong&gt;&lt;a href="http://www.cs.biu.ac.il/~schlerj/schler_springsymp06.pdf" target="_blank"&gt;Effects of Age and Gender on Blogging&lt;/a&gt;&lt;/strong&gt; , Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs.&lt;/li&gt;
&lt;li&gt;S. Argamon, M. Koppel, J. Pennebaker and J. Schler (2009), &lt;strong&gt;&lt;a href="http://u.cs.biu.ac.il/~koppel/papers/AuthorshipProfiling-cacm-final.pdf" target="_blank"&gt;Automatically profiling the author of an anonymous text&lt;/a&gt;&lt;/strong&gt; , Communications of the ACM 52 (2): 119-123.&lt;/li&gt;
&lt;li&gt;M. Koppel, S. Argamon and A. Shimoni (2002), &lt;strong&gt;&lt;a href="http://u.cs.biu.ac.il/~koppel/papers/male-female-llc-final.pdf" target="_blank"&gt;Automatically categorizing written texts by author gender&lt;/a&gt;&lt;/strong&gt; , Literary and Linguistic Computing 17(4), November 2002, pp. 401-412.&lt;/li&gt;
&lt;li&gt;Jenny K. Tam (2009). &lt;strong&gt;&lt;a href="https://www.google.es/url?sa=t&amp;amp;rct=j&amp;amp;q=&amp;amp;esrc=s&amp;amp;source=web&amp;amp;cd=1&amp;amp;cad=rja&amp;amp;ved=0CDAQFjAA&amp;amp;url=http://www.dtic.mil/cgi-bin/GetTRDoc?AD=ADA508858&amp;amp;ei=-_6cUYOdFvTT7Ab_GQ&amp;amp;usg=AFQjCNEw2YM65O_lL2kux4yZvNlwhJXosA&amp;amp;sig2=G3u0NRc-5gOWd1O5FkgeTA" target="_blank"&gt;Detecting Age in Online Chat&lt;/a&gt;&lt;/strong&gt; , Master Thesis, Naval Postgraduate School.&lt;/li&gt;
&lt;li&gt;Jane Lin (2007). &lt;strong&gt;&lt;a href="http://www.dtic.mil/cgi-bin/GetTRDoc?AD=ADA467087" target="_blank"&gt;Automatic Author Profiling of Online Chat Logs&lt;/a&gt;&lt;/strong&gt; , Master Thesis, Naval Postgraduate School.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Please feel free to send me a message or comment below if you find any other resource that I should add to this post. Thanks for reading.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/0D0tnmWzVCc" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/7669228855049331096/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=7669228855049331096&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7669228855049331096?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7669228855049331096?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/0D0tnmWzVCc/compilation-of-resources-for-text-based.html" title="Compilation of Resources for Text-based Age Detection" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/05/compilation-of-resources-for-text-based.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0ABSXw6eyp7ImA9WhBaEk4.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-3859956236970326490</id><published>2013-05-22T18:22:00.001+02:00</published><updated>2013-05-22T18:22:38.213+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-22T18:22:38.213+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Protección del menor" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Control parental" /><category scheme="http://www.blogger.com/atom/ns#" term="Privacidad" /><category scheme="http://www.blogger.com/atom/ns#" term="Smartphone" /><category scheme="http://www.blogger.com/atom/ns#" term="Seguridad" /><title>Presentación: "Menores y móviles: Usos, riesgos y controles parentales"</title><content type="html">&lt;p&gt;El día 19 de abril dí una charla en la Universidad Europea de Madrid, titulada "&lt;strong&gt;Menores y móviles: Usos, riesgos y controles parentales&lt;/strong&gt;". Esta charla se corresponde con un trabajo de investigación que he realizado dentro del proyecto titulado "Protección de usuarios menores de edad de telefonía móvil inteligente", dirigido por &lt;a href="http://joaquinpe.wordpress.com/" target="_blank"&gt;Joaquin Pérez&lt;/a&gt; y financiado por la &lt;a href="http://www.uem.es/" target="_blank"&gt;Universidad Europea de Madrid&lt;/a&gt; (P2012 UEM14).&lt;/p&gt;
&lt;p&gt;El resumen de la charla &lt;a href="http://www.mavir.net/talks/159-gomezhidalgo-abr2013" target="_blank"&gt;está disponible en la página de la red MAVIR&lt;/a&gt; (&lt;a href="http://www.mavir.net/que-es-mavir" target="_blank"&gt;MA2VICMR: Mejorando el Acceso, el Análisis y la Visibilidad de la Información y los Contenidos Multilingüe y Multimedia en Red para la Comunidad de Madrid&lt;/a&gt;), y la presentación utilizada durante la charla es la siguiente:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;iframe src="http://www.slideshare.net/slideshow/embed_code/21686368" height="400" width="476" marginwidth="0" marginheight="0" scrolling="no" frameborder="0"&gt;&lt;/iframe&gt;&lt;/p&gt;
&lt;p style="TEXT-ALIGN: left"&gt;Si el tema te interesa, no dudes en hacer cualquier pregunta o sugerencia en los comentarios de este post.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/M5bWAuD6jBs" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/3859956236970326490/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=3859956236970326490&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3859956236970326490?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3859956236970326490?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/M5bWAuD6jBs/presentacion-y-moviles-usos-riesgos-y.html" title="Presentación: &amp;quot;Menores y móviles: Usos, riesgos y controles parentales&amp;quot;" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/05/presentacion-y-moviles-usos-riesgos-y.html</feedburner:origLink></entry><entry gd:etag="W/&quot;AkcNSHg6fSp7ImA9WhBaEEo.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-4590226879146293202</id><published>2013-05-20T21:28:00.001+02:00</published><updated>2013-05-20T21:28:19.615+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-20T21:28:19.615+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Opensource" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Language Identification as Text Classification with WEKA</title><content type="html">&lt;p&gt;&lt;a href="http://en.wikipedia.org/wiki/Language_identification" target="_blank"&gt;Language Identification&lt;/a&gt;, which consists of guessing the natural language in which a text is written (or an utterance is spoken), is not one of the hardest problems in &lt;a href="http://en.wikipedia.org/wiki/Natural_language_processing"&gt;Natural Language Processing&lt;/a&gt;, and in consequence, I believe &lt;em&gt;it is a good starting point for learning about the text analysis capabilities available in WEKA&lt;/em&gt;.&lt;/p&gt;
&lt;p&gt;This is in fact a problem tackled by others, as in this &lt;a href="http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html" target="_blank"&gt;tutorial on using LingPipe for Language Identification&lt;/a&gt;, or by &lt;a href="http://blog.alejandronolla.com/" target="_blank"&gt;Alejandro Nolla&lt;/a&gt; in his post on &lt;a href="http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/" target="_blank"&gt;Detecting Text Language With Python and NLTK&lt;/a&gt;. Moreover you can find a wide number of language identification programs, APIs and demos in the &lt;a href="http://en.wikipedia.org/wiki/Language_identification" target="_blank"&gt;Wikipedia article on Language Identification&lt;/a&gt;. We may even consider this function as a natural language commodity, as you can see how &lt;a href="http://translate.google.com/" target="_blank"&gt;Google Translate&lt;/a&gt; does it by default in the next figure:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/google.translate.langid.png" style="WIDTH: 400px; DISPLAY: inline; HEIGHT: 159px" height="159" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;The most typical (and rather simple) approach to Language Identification is storing a list of the &lt;em&gt;most frequent character 3-grams&lt;/em&gt; in each language and checking the target overlap with each of the lists. Alternatively, you can use stop words lists. Of course, the accuracy depends on how you compute the overlap, but even simple distances can make it rather effective.&lt;/p&gt;
&lt;p&gt;However, I will not follow this approach here. Instead, I will show how to build a standard text classifier using &lt;a href="http://weka.sourceforge.net/" target="_blank"&gt;WEKA&lt;/a&gt; in order to show the options of (and how to apply) the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;StringToWordVector&lt;/a&gt;&lt;/code&gt; filter, which is &lt;em&gt;the main tool for text analysis in WEKA&lt;/em&gt;.&lt;/p&gt;
&lt;p&gt;The steps we have to follow are the next ones:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;To collect data from different languages in order to build a basic dataset.&lt;/li&gt;
&lt;li&gt;To prepare the data for learning, which involves transforming it by using the &lt;code&gt;StringToWordVector&lt;/code&gt; filter.&lt;/li&gt;
&lt;li&gt;To analyze the resulting dataset, and hopefully, to improve it by using attribute selection.&lt;/li&gt;
&lt;li&gt;To test over an independent test collection, which will give us a robust estimation of the accuracy of the approaches on real examples.&lt;/li&gt;
&lt;li&gt;To learn the most accurate model as obtained from the previous step, and to use it for our classification program.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;So this will be a rather long post. Be prepared for it.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Collecting the data and Creating the Datasets&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Following the &lt;a href="http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html" target="_blank"&gt;LingPipe Language ID Tutorial&lt;/a&gt;, I collect the data from the &lt;a href="http://corpora.uni-leipzig.de/" target="_blank"&gt;Leipzig Corpora Home Page&lt;/a&gt;. In particular, I will address guessing among English (EN), French (FR) and Spanish (SP), so I have gone to &lt;a href="http://corpora.uni-leipzig.de/download.html" target="_blank"&gt;the download page&lt;/a&gt;, completed the CAPTCHA to get the list of available corpora, and downloaded:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The &lt;a href="http://corpora.uni-leipzig.de/downloads/eng_news_2005_10K-text.tar.gz" target="_blank"&gt;2005 English 10k corpus of news in text format&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;The &lt;a href="http://corpora.uni-leipzig.de/downloads/fra_news_2009_10K-text.tar.gz" target="_blank"&gt;2009 French 10k corpus of news in text format&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;The 2001-2002 Spanish 10k corpus of news in text format -- which is no longer there as far as I can see.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;For your comfort, I have put these corpora &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;in my LangID GITHub demo page&lt;/a&gt;. The files have the following format:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;1 I didn't know it was police housing," officers quoted Tsuchida as saying.
&lt;br/&gt;
2 You would be a great client for Southern Indiana Homeownership's credit counseling but you are saying to yourself "Oh, we can pay that off."
&lt;br/&gt;
3 He believes the 21st century will be the "century of biology" just as the 20th century was the century of IT.&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;So I have loaded them into an OpenOffice spreadsheet, and replaced the number columns by the corresponding tags for the different languages: &lt;code&gt;EN&lt;/code&gt;, &lt;code&gt;FR&lt;/code&gt;, and &lt;code&gt;SP&lt;/code&gt;. Then I have escaped the &lt;code&gt;"&lt;/code&gt; and &lt;code&gt;'&lt;/code&gt; characters, because they are string delimiters in WEKA &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html" target="_blank"&gt;Attribute-Relation File Format&lt;/a&gt; (ARFF). In order to build the datasets, I have split the data keeping the first 9K sentences of each language for training, and the remaining 1K for testing. As some learning algorithms may be sensitive to the instance order, I have mixed the instances in batches of 1K texts, so the first 1K sentences are in English, the next 1K sentences are in French, and so on. The training data has the following header:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;@relation langid_train
&lt;br/&gt;
&lt;br/&gt;
@attribute language_class {EN,FR,SP}
&lt;br/&gt;
@attribute text String
&lt;br/&gt;
&lt;br/&gt;
@data
&lt;br/&gt;
EN,'I didn\'t know it was police housing,\" officers quoted Tsuchida as saying.'
&lt;br/&gt;
EN,'You would be a great client for Southern Indiana Homeownership\'s credit counseling but you are saying to yourself \"Oh, we can pay that off.\"'
&lt;br/&gt;
EN,'He believes the 21st century will be the \"century of biology\" just as the 20th century was the century of IT.'
&lt;br/&gt;
../..&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The ARFF files for training and testing are available at the &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;GITHub repository for the demo&lt;/a&gt; as well. You can open the training file (&lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/langid.collection.train.arff" target="_blank"&gt;langid.collection.train.arff&lt;/a&gt;&lt;/code&gt;) in the WEKA Explorer, and setting the class to be the first attribute, you should be getting something like the following figure:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/explorer.training.langid.png" style="WIDTH: 450px; DISPLAY: inline; HEIGHT: 336px" height="336" width="450"/&gt;&lt;/p&gt;
&lt;p&gt;So we have a training collection with 9K instances per class (language), and a test collection with 1K instances per class.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Data Transformation&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;As &lt;a href="http://jmgomezhidalgo.blogspot.com/search/label/WEKA" target="_blank"&gt;in previous posts about text classification with WEKA&lt;/a&gt;, we need to transform the text strings into term vectors to enable learning. This is done by applying the &lt;code&gt;StringToWordVector&lt;/code&gt; filter, which is the most remarkable text mining function in WEKA. In previous posts, I have applied this filter with default options, but it offers a wide range of possibilities that can be seen when opening it in the WEKA Explorer. If you click on the &lt;em&gt;Filter&lt;/em&gt; button and browse the tree to "&lt;em&gt;weka &amp;gt; filters &amp;gt; unsupervised &amp;gt; attribute &amp;gt; StringToWordVector&lt;/em&gt;", and then click on the filter name, you get the next window:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/explorer.stringtowordvector.png" style="WIDTH: 440px; DISPLAY: inline; HEIGHT: 623px" height="623" width="440"/&gt;&lt;/p&gt;
&lt;p&gt;Those are a lot of options, aren't they? So let us focus on the minimum set of options in order to be productive with this example of Language Identification. Those are:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;code&gt;doNotOperateOnPerClassBasis&lt;/code&gt; - we set this option to &lt;code&gt;True&lt;/code&gt; in order to make the filter collect word tokens over the classes as a whole. This should be the standard setting in nearly all text classification problems.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;lowerCaseTokens&lt;/code&gt; - we set this option to &lt;code&gt;True&lt;/code&gt; because we are interested in the words independently of using upper or lower case. In other problems, like e.g. when processing Social Networks text, keeping the capitalization may be critical for getting a good accuracy.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;tokenizer&lt;/code&gt; - WEKA provides several tokenizers, intended to break the original texts into tokens according to a number of rules. The simplest tokenizer is the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/tokenizers/WordTokenizer.html" target="_blank"&gt;weka.core.tokenizers.WordTokenizer&lt;/a&gt;&lt;/code&gt;, which splits the string into tokens by using a list of separators that can be set by clicking on the tokenizer name. It is a nice idea to take a look at the texts we have before setting up the list of separating characters. In our case, we have several languages and the default punctuation symbols may not fit our problem -- we need to add opening question and exclamation marks, apart from other symbols from HTML format like &amp;amp;, and other symbols. So our delimiters string will be " \r\n\t.,;:\"\'()?!-¿¡+*&amp;amp;#$%\\/=&amp;lt;&amp;gt;[]_`@" (backslash is escaped).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;wordsToKeep&lt;/code&gt; - we set this option to keep as many words as we can, to include the full vocabulary of the dataset. An appropriate value may be one million.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;So we leave the rest of the options at their defaults. Most notably, we are not using &lt;a href="http://en.wikipedia.org/wiki/Tf–idf" target="_blank"&gt;sophisticated weighting schemas (like TF or TF.IDF)&lt;/a&gt;, nor &lt;a href="http://en.wikipedia.org/wiki/Stop_words" target="_blank"&gt;stop words&lt;/a&gt; or &lt;a href="http://en.wikipedia.org/wiki/Stemming" target="_blank"&gt;stemming&lt;/a&gt;. These options are very frequent in &lt;a href="http://en.wikipedia.org/wiki/Information_retrieval" target="_blank"&gt;Information Retrieval&lt;/a&gt; systems like &lt;a href="http://lucene.apache.org/solr/" target="_blank"&gt;Apache Lucene/SOLR&lt;/a&gt;, and they often lead to nice accuracy improvements in search systems.&lt;/p&gt;
&lt;p&gt;We need to have the same vocabulary both in the training and the testing datasets, so we can apply this filter in the command line by using the batch (&lt;code&gt;-b&lt;/code&gt;) option:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -O -L -tokenizer "weka.core.tokenizers.WordTokenizer -delimiters \" \\r\\n\\t.,;:\\\"\\'()?!-¿¡+*&amp;amp;#$%\\\\/=&amp;lt;&amp;gt;[]_`@\"" -W 10000000 -b -i langid.collection.train.arff -o langid.collection.train.vector.arff -r langid.collection.test.arff -s langid.collection.test.vector.arff&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The options -O, -L, -tokenizer and -W correspond to the options above. The delimiter string is escaped because it is included in the specification of the tokenizer. The resulting files are also &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;in the GITHub repository for the LangID example&lt;/a&gt;, along with the script &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/stwv.sh" target="_blank"&gt;stwv.sh&lt;/a&gt;&lt;/code&gt; (String To Word Vector) which includes this command.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Data Analysis and Improvement&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;If we take a quick look at the terms or tokens we have got, e.g.:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;@attribute archival numeric
&lt;br/&gt;
@attribute archivarlos numeric
&lt;br/&gt;
@attribute archivas numeric
&lt;br/&gt;
@attribute archives numeric
&lt;br/&gt;
@attribute archiving numeric
&lt;br/&gt;
@attribute archivo numeric
&lt;br/&gt;
@attribute archivos numeric&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;We can imagine that most of them will be useless for Language Identification. This motivates making a more precise analysis of the tokens by using some kind of quality metric, like &lt;a href="http://en.wikipedia.org/wiki/Information_gain_in_decision_trees" target="_blank"&gt;Information Gain&lt;/a&gt;. In fact, I am applying the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/supervised/attribute/AttributeSelection.html" target="_blank"&gt;weka.filters.supervised.attribute.AttributeSelection&lt;/a&gt;&lt;/code&gt; filter as I did in my posts on &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/02/text-mining-in-weka-revisited-selecting.html" target="_blank"&gt;selecting attributes by chaining filters&lt;/a&gt; and on &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/04/command-line-functions-for-text-mining.html" target="_blank"&gt;command line functions for text mining&lt;/a&gt;. So I issue the following command:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.filters.supervised.attribute.AttributeSelection -c 1 -E weka.attributeSelection.InfoGainAttributeEval -S "weka.attributeSelection.Ranker -T 0.0" -b -i langid.collection.train.vector.arff -o langid.collection.train.vector.ig0.arff -r langid.collection.test.vector.arff -s langid.collection.test.vector.ig0.arff&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;We apply the filter in batch mode as well, in order to get the same attributes both in the training and in the test collections. We also set up the first attribute as the class (with the option &lt;code&gt;-c&lt;/code&gt;), and set the threshold for keeping attributes as &lt;code&gt;0.0&lt;/code&gt; in the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/attributeSelection/Ranker.html" target="_blank"&gt;weka.attributeSelection.Ranker&lt;/a&gt;&lt;/code&gt; search method. This means that we will keep only those attributes with Information Gain score over 0, and they will be sorted according to their score as well. This command is included in the &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/asig.sh" target="_blank"&gt;asig.sh&lt;/a&gt;&lt;/code&gt; (Attribute Selection by Information Gain) script of &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;the GITHub repository for the LangID example&lt;/a&gt;, along with the data files.&lt;/p&gt;
&lt;p&gt;From the original 65,429 word attributes we got in the previous step, we have kept only 16,840 (25.73% of the original ones). We can be more aggressive by setting the threshold to a bigger value (e.g. 0.2).&lt;/p&gt;
&lt;p&gt;The first twenty attributes are the next ones:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/forty.top.ig.terms.langid.png" style="WIDTH: 300px; DISPLAY: inline; HEIGHT: 163px" height="163" width="300"/&gt;&lt;/p&gt;
&lt;p&gt;As we can see, all of them are very frequent words (in each language) that would be present in the stop lists for them. In consequence, our "pure" data mining approach is quite close to the traditional one based on stop words.&lt;/p&gt;
&lt;p&gt;It makes sense to learn a J48 tree to get an idea of the complexity of the term relations. The &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/trees/J48.html" target="_blank"&gt;weka.classifiers.trees.J48&lt;/a&gt;&lt;/code&gt; algorithm implements &lt;a href="http://en.wikipedia.org/wiki/C4.5_algorithm" target="_blank"&gt;Quinlan's popular C4.5 learner&lt;/a&gt;, and as it outputs a decision tree, it can give us valuable insights of the term relations, like e.g. which co-occurring terms are more predictive. Let us train that classifier on our new training dataset with the following command:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.classifiers.trees.J48 -t langid.collection.train.vector.ig0.arff -no-cv&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;However, we get a quite complex decision tree populated with 273 nodes and 137 leaves. All the tests in the tree have the following look: "&lt;code&gt;word &amp;gt; 0&lt;/code&gt;" or "&lt;code&gt;word &amp;lt;= 0&lt;/code&gt;". This means that the algorithm induces that only the occurrence of words is important, but not their weight. The root of the tree is obviously a test on "&lt;code&gt;the&lt;/code&gt;", and the smallest side of the tree (its right hand side, with "&lt;code&gt;the &amp;gt; 0&lt;/code&gt;") is the following one:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;the &amp;gt; 0
&lt;br/&gt;
| de &amp;lt;= 0: EN (5945.0/8.0)
&lt;br/&gt;
| de &amp;gt; 0
&lt;br/&gt;
| | el &amp;lt;= 0
&lt;br/&gt;
| | | and &amp;lt;= 0
&lt;br/&gt;
| | | | for &amp;lt;= 0
&lt;br/&gt;
| | | | | to &amp;lt;= 0: FR (24.0/3.0)
&lt;br/&gt;
| | | | | to &amp;gt; 0: EN (2.0)
&lt;br/&gt;
| | | | for &amp;gt; 0: EN (3.0)
&lt;br/&gt;
| | | and &amp;gt; 0: EN (7.0)
&lt;br/&gt;
| | el &amp;gt; 0: SP (3.0)&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;This means, for instance, that the word "&lt;code&gt;the&lt;/code&gt;" is an excellent predictive feature, and if it occurs in a text and the word "&lt;code&gt;de&lt;/code&gt;" (from French or Spanish) does not occur in the text, that text is most likely written in English (with an estimated likelihood of 99.86% on the training collection). The overall accuracy of J48 over the training collection is 98.3963%.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Training and then Evaluating on the Test Collection&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Before starting to train and evaluate, we have to decide which algorithms are most appropriate for the problem. In my experience with text learning, it is wise to test at least the following ones:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The &lt;em&gt;Naive Bayes&lt;/em&gt; probabilistic approach, quick and with good results in text learning on average problems. In WEKA, it is implemented in the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/bayes/NaiveBayes.html" target="_blank"&gt;weka.classifiers.bayes.NaiveBayes&lt;/a&gt;&lt;/code&gt; class.&lt;/li&gt;
&lt;li&gt;The &lt;em&gt;rule learner PART&lt;/em&gt;, which induces a list of rules by learning partial decision trees. It is a symbolic algorithm that produces rules which can be very valuable as they are easy to understand. This algorithm is implemented by the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/rules/PART.html" target="_blank"&gt;weka.classifiers.rules.PART&lt;/a&gt;&lt;/code&gt; class.&lt;/li&gt;
&lt;li&gt;Of course, the J48 algorithm because of its visualization capabilities.&lt;/li&gt;
&lt;li&gt;The lazy learner &lt;em&gt;k-Nearest Neighbors (kNN)&lt;/em&gt;, which occasionally gives excellent results in text classification problems. The WEKA class that implements this algorithm is &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/lazy/IBk.html" target="_blank"&gt;weka.classifiers.lazy.IBk&lt;/a&gt;&lt;/code&gt;.&lt;/li&gt;
&lt;li&gt;The &lt;em&gt;Support Vector Machines&lt;/em&gt; algorithm, which is probably the most effective on text classification problems because of its ability to focus on the most relevant examples in order to separate the classes. It is a very good learning algorithm for sparse datasets, and it is implemented in WEKA via the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/functions/SMO.html" target="_blank"&gt;weka.classifiers.functions.SMO&lt;/a&gt;&lt;/code&gt; class or by the library &lt;a href="http://weka.wikispaces.com/LibSVM" target="_blank"&gt;LibSVM&lt;/a&gt;. I choose the Sequential Minimal Optimization implementation (SMO) embedded in WEKA.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Also, when Naive Bayes or J48 are effective, I usually get from small to even big accuracy improvements by using &lt;a href="http://en.wikipedia.org/wiki/Boosting_(machine_learning)" target="_blank"&gt;boosting&lt;/a&gt;, implemented by the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/meta/AdaBoostM1.html" target="_blank"&gt;weka.classifiers.meta.AdaBoostM1&lt;/a&gt;&lt;/code&gt; class in WEKA. Boosting takes as input a weak classifier, and builds a classifier committee by iteratively training that weak learner on those dataset subsets on which the previous learners are not effective. In this case, I will not apply boosting because the weak learners get rather high levels of accuracy, and it is most likely that boosting will only achieve a marginal improvement (if any) at the cost of a much longer training time.&lt;/p&gt;
&lt;p&gt;I have written a script named &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/test.sh" target="_blank"&gt;test.sh&lt;/a&gt;&lt;/code&gt; to execute all these algorithms with default options at the &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;GITHub repository for the LangID demo&lt;/a&gt;. The results obtained by the algorithms are included in the repository as well, and summarized in the next table:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/results.test.langid.png" style="WIDTH: 230px; DISPLAY: inline; HEIGHT: 136px" height="136" width="230"/&gt;&lt;/p&gt;
&lt;p&gt;The different versions of the lazy algorithm kNN tested here appear to be very weak. It is likely we can improve its performance by changing the way the distance among examples is computed (from the Euclidean distance to a more appropriate one for text, that would be the cosine similarity), but their performance is so low that they will not score better than the rest of the algorithms.&lt;/p&gt;
&lt;p&gt;The top algorithms in this test are &lt;em&gt;Naive Bayes&lt;/em&gt; and &lt;em&gt;Support Vector Machines&lt;/em&gt;. There is a trade off between both algorithms: SVMs are more effective (in fact, they are very effective) but they employ quite a lot of time to be trained, while Naive Bayes is less effective but quicker to be trained. In terms of classification time, both algorithms are linear on the number of attributes.&lt;/p&gt;
&lt;p&gt;Even though we have used a large number of attributes, there are some examples with rather weak representations. For instance, let us check the following instances or texts:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;{58 1,94 1,313 1,1663 1}
&lt;br/&gt;
{119 1,361 1,2644 1,16840 FR}
&lt;br/&gt;
{2 1,16840 SP}&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The first and second examples have only 3 occurring words (the class value for the first text is &lt;code&gt;EN&lt;/code&gt; in the sparse format used by WEKA in this example), and the third example has only one word ("&lt;code&gt;el&lt;/code&gt;"). In the first two examples, the attribute numbers (58 or over) mean that the attributes are not the most informative ones, while in the third example we find a very informative word. If we apply a more aggressive selection using Information Gain, we will be missing a lot of examples (with null representations) in this example, thus making them fall to the most likely class. As the classes have a balanced distribution, the language chosen in that case will be &lt;code&gt;EN&lt;/code&gt;, which is the default value for the class attribute.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Learning the Best Classifier and Using it Programmatically&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;So after our experiments, we know the best classifier in our tests is SVMs. So it is time to learn it and store the classifier into a file for further programmatic use. For this purpose, I have written a script that trains the classifier and stores the model into a file, using the following command-line call:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.classifiers.meta.FilteredClassifier -t langid.collection.train.arff -c first -no-cv -d smo.model.dat -v -F "weka.filters.MultiFilter -F \"weka.filters.unsupervised.attribute.StringToWordVector -O -L -tokenizer &lt;a&gt;\\\"weka.core.tokenizers.WordTokenizer&lt;/a&gt; -delimiters &lt;a&gt;\\\\\\\&lt;/a&gt;" &lt;a&gt;\\\\\\\r\\\\\\\n\\\\\\\t.,;:\\\\\\\\\\\\\\\"'()?!-¿¡+*&amp;amp;#$%/=&amp;lt;&amp;gt;[]_`@\\\\\\\"\\\&lt;/a&gt;" -W 10000000\" -F \"weka.filters.supervised.attribute.AttributeSelection -E weka.attributeSelection.InfoGainAttributeEval -S &lt;a&gt;\\\"weka.attributeSelection.Ranker&lt;/a&gt; -T 0.0\\\"\"" -W weka.classifiers.functions.SMO&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;This call is rather painful because of the nested, and nested, and nested, and nested quotes. So I have pretty-printed it in the &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/learn.sh" target="_blank"&gt;learn.sh&lt;/a&gt;&lt;/code&gt; script at the &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;GitHub repository for the LangID example&lt;/a&gt;. For dealing with nested quotes, follow the advice in &lt;a href="http://en.wikipedia.org/wiki/Nested_quotation" target="_blank"&gt;the Wikipedia article about nested quotation&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;With this call, we have stored a model in the file &lt;code&gt;smo.model.dat&lt;/code&gt;, which chains the &lt;code&gt;StringToWordVector&lt;/code&gt; filter, the &lt;code&gt;AttributeSelection&lt;/code&gt; filter, and an &lt;code&gt;SMO&lt;/code&gt; classifier by using the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/meta/FilteredClassifier.html" target="_blank"&gt;weka.classifiers.meta.FilteredClassifier&lt;/a&gt;&lt;/code&gt; and the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/MultiFilter.html" target="_blank"&gt;weka.filters.MultiFilter&lt;/a&gt;&lt;/code&gt; classes, as I have explained in the post on &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/04/command-line-functions-for-text-mining.html" target="_blank"&gt;Command Line Functions for Text Mining in WEKA&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;One good point of WEKA is that we can learn a model in the command line and use it in a program. I have modified the &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/FilteredClassifier/MyFilteredClassifier.java" target="_blank"&gt;MyFilteredClassifier.java&lt;/a&gt;&lt;/code&gt; program I used in my post describing &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/04/a-simple-text-classifier-in-java-with.html" target="_blank"&gt;A Simple Text Classifier in Java with WEKA&lt;/a&gt;, and I have committed it at the &lt;a href="https://github.com/jmgomezh/tmweka/tree/master/LangID" target="_blank"&gt;GITHub repository&lt;/a&gt; with the name &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/LanguageIdentifier.java" target="_blank"&gt;LanguageIdentifier.java&lt;/a&gt;&lt;/code&gt;. I have created three sample test files as well, &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/test_en.txt" target="_blank"&gt;test_en.txt&lt;/a&gt;&lt;/code&gt;, &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/test_fr.txt" target="_blank"&gt;test_fr.txt&lt;/a&gt;&lt;/code&gt; and &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/LangID/test_sp.txt" target="_blank"&gt;test_sp.txt&lt;/a&gt;&lt;/code&gt;. The operation of the program is the following one:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; javac LanguageIdentifier.java
&lt;br/&gt;
&lt;br/&gt;
$&amp;gt; java LanguageIdentifier
&lt;br/&gt;
Usage: java LanguageIdentifier &amp;lt;fileData&amp;gt; &amp;lt;fileModel&amp;gt;
&lt;br/&gt;
$&amp;gt; java LanguageIdentifier test_en.txt smo.model.dat
&lt;br/&gt;
===== Loaded text data: test_en.txt =====
&lt;br/&gt;
This is a sample test for the language identifier demo.
&lt;br/&gt;
===== Loaded model: smo.model.dat =====
&lt;br/&gt;
===== Instance created with reference dataset =====
&lt;br/&gt;
@relation 'Test relation'
&lt;br/&gt;
@attribute language_class {EN,FR,SP}
&lt;br/&gt;
@attribute text string
&lt;br/&gt;
@data
&lt;br/&gt;
?,' This is a sample test for the language identifier demo.'
&lt;br/&gt;
===== Classified instance =====
&lt;br/&gt;
Class predicted: EN
&lt;br/&gt;
&lt;br/&gt;
$&amp;gt; java LanguageIdentifier test_fr.txt smo.model.dat
&lt;br/&gt;
===== Loaded text data: test_fr.txt =====
&lt;br/&gt;
Ceci est un test de l'échantillon pour la démonstration de l'identificateur de langue.
&lt;br/&gt;
===== Loaded model: smo.model.dat =====
&lt;br/&gt;
===== Instance created with reference dataset =====
&lt;br/&gt;
@relation 'Test relation'
&lt;br/&gt;
@attribute language_class {EN,FR,SP}
&lt;br/&gt;
@attribute text string
&lt;br/&gt;
@data
&lt;br/&gt;
?,' Ceci est un test de l'échantillon pour la démonstration de l'identificateur de langue.'
&lt;br/&gt;
===== Classified instance =====
&lt;br/&gt;
Class predicted: FR
&lt;br/&gt;
&lt;br/&gt;
$&amp;gt; java LanguageIdentifier test_sp.txt smo.model.dat
&lt;br/&gt;
===== Loaded text data: test_sp.txt =====
&lt;br/&gt;
Esto es un texto de prueba para la demostración del identificador de idioma.
&lt;br/&gt;
===== Loaded model: smo.model.dat =====
&lt;br/&gt;
===== Instance created with reference dataset =====
&lt;br/&gt;
@relation 'Test relation'
&lt;br/&gt;
@attribute language_class {EN,FR,SP}
&lt;br/&gt;
@attribute text string
&lt;br/&gt;
@data
&lt;br/&gt;
?,' Esto es un texto de prueba para la demostración del identificador de idioma.'
&lt;br/&gt;
===== Classified instance =====
&lt;br/&gt;
Class predicted: SP&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;So the program is correct on the three examples. Remember that you have to learn the model before using the program. As a side note, as the program only uses a &lt;code&gt;FilteredClassifier&lt;/code&gt; object, you can change the script to accommodate a different algorithm. For instance, you can just change the text "&lt;code&gt;weka.classifiers.functions.SMO&lt;/code&gt;" by "&lt;code&gt;weka.classifiers.bayes.NaiveBayes&lt;/code&gt;" in the &lt;code&gt;learn.sh&lt;/code&gt; script, and the program will be working the same way -- but with a different model.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Concluding Remarks&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;While being relatively simple, the Language Identification problem helps to identify the essential tasks we have to perform when building text classifiers with WEKA. It is a complete example in the sense that we have not only collected the dataset and learnt on it, but we have also dug a bit into the most suitable representation by playing with attribute selection and a tentative classifier to visualize the data. It also demonstrates some basic configurations of the &lt;code&gt;StringToWordVector&lt;/code&gt; filter, which is the most remarkable tool in WEKA for text mining.&lt;/p&gt;
&lt;p&gt;If you have had the time to read all this post, and even tried the program: thank you! I hope it has been a valuable time investment. I am tempted to suggest that you modify the dataset to include more languages, as the problem I have addressed is relatively simple -- only three and quite different languages.&lt;/p&gt;
&lt;p&gt;Finally, thanks for reading, and please feel free to leave a comment if you think I can improve this article, or you have questions or suggestions for further articles on this topics!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/muECx_puuGk" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/4590226879146293202/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=4590226879146293202&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/4590226879146293202?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/4590226879146293202?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/muECx_puuGk/language-identification-as-text.html" title="Language Identification as Text Classification with WEKA" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/05/language-identification-as-text.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkIHSXo_eCp7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-1659637270885805661</id><published>2013-05-02T01:41:00.001+02:00</published><updated>2013-05-02T09:42:18.440+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-02T09:42:18.440+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category 
scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Mapping Vocabulary from Train to Test Datasets in WEKA Text Classifiers</title><content type="html">&lt;p&gt;There are several ways of evaluating a (text) classifier: &lt;a href="http://en.wikipedia.org/wiki/Cross-validation_(statistics)" target="_blank"&gt;cross validation&lt;/a&gt;, splitting your dataset into train and test subsets, or even evaluating the classifier on the training set itself (not recommended). I will not discuss the merits of each method, instead I will focus on a train/test split evaluation.&lt;/p&gt;
&lt;p&gt;When you start to work with your train and test text datasets, you have got two labelled text collections like e.g. those I make available at &lt;a href="https://github.com/jmgomezh/tmweka" target="_blank"&gt;my GITHub project&lt;/a&gt;: &lt;a href="https://github.com/jmgomezh/tmweka/blob/master/InputMappedClassifier/smsspam.small.train.arff" target="_blank"&gt;&lt;code&gt;smsspam.small.train.arff&lt;/code&gt;&lt;/a&gt; and &lt;a href="https://github.com/jmgomezh/tmweka/blob/master/InputMappedClassifier/smsspam.small.test.arff" target="_blank"&gt;&lt;code&gt;smsspam.small.test.arff&lt;/code&gt;&lt;/a&gt; . In this case, we have two collections that are a 50% split of my original simple collection &lt;a href="https://github.com/jmgomezh/tmweka/blob/master/FilteredClassifier/smsspam.small.arff" target="_blank"&gt;&lt;code&gt;smsspam.small.arff&lt;/code&gt;&lt;/a&gt; , which in turn is a subset of the original &lt;a href="http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/" target="_blank"&gt;SMS Spam Collection&lt;/a&gt;. The files are formatted according to the &lt;a href="http://weka.sourceforge.net/" target="_blank"&gt;WEKA&lt;/a&gt; &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html" target="_blank"&gt;ARFF&lt;/a&gt;:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;@relation sms_test
&lt;br/&gt;
&lt;br/&gt;
@attribute spamclass {spam,ham}
&lt;br/&gt;
@attribute text String
&lt;br/&gt;
&lt;br/&gt;
@data
&lt;br/&gt;
ham,'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
&lt;br/&gt;
spam,'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&amp;amp;C\'s apply 08452810075over18\'s'
&lt;br/&gt;
...&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;That is, one text instance per line, the first attribute being the nominal class spam/ham, and the second attribute being the text itself.&lt;/p&gt;
&lt;p&gt;In text classification, you have to transform this original representation into a vector of terms/words/stems/etc. in order to allow the classifier to learn expressions like: "if the word "win" occurs in a text, then classify it as spam". In other words, you have to represent your texts as feature vectors, where the features are words and the values are e.g. binary weights, &lt;a href="http://en.wikipedia.org/wiki/Tf–idf" target="_blank"&gt;TF weights, or TF.IDF weights&lt;/a&gt;. In fact, WEKA provides the handy &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;&lt;code&gt;StringToWordVector&lt;/code&gt;&lt;/a&gt; filter for this purpose (Thanks, WEKA!).&lt;/p&gt;
&lt;p&gt;However, it is most likely that the vocabulary used in your training set and in your test set is not identical. For instance, if you directly apply the &lt;code&gt;StringToWordVector&lt;/code&gt; filter to the previous files, you get slightly different results, summarized in the following table:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/table.train.test.attributes.png" style="DISPLAY: inline" height="185" width="273"/&gt;&lt;/p&gt;
&lt;p&gt;Obviously, to enable learning you have to ensure that the representation of both datasets is the same. For instance, imagine that the root of the decision tree you have learnt on your training collection poses a test on an attribute that does not exist on your test collection, then what happens?&lt;/p&gt;
&lt;p&gt;Fortunately, WEKA provides at least three ways of getting the same vocabulary in your train and test subcollections. Here they are:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Using a &lt;strong&gt;batch filter&lt;/strong&gt; that takes both training and test collections at the same time, using the first for getting the attributes and representing the last using those attributes.&lt;/li&gt;
&lt;li&gt;Using a &lt;strong&gt;&lt;code&gt;&lt;strong&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/meta/FilteredClassifier.html" target="_blank"&gt;&lt;strong&gt;&lt;code&gt;&lt;strong&gt;FilteredClassifier&lt;/strong&gt;&lt;/code&gt;&lt;/strong&gt;&lt;/a&gt;&lt;/strong&gt;&lt;/code&gt;&lt;/strong&gt; (that I have discussed &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;in previous posts&lt;/a&gt;), which feeds both the filter and the classifier into a single classifier that takes the original representation class/text as input for both the training and the test sets.&lt;/li&gt;
&lt;li&gt;A more recent method, that is separately getting the representations and using an &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/misc/InputMappedClassifier.html" target="_blank"&gt;&lt;strong&gt;InputMappedClassifier&lt;/strong&gt;&lt;/a&gt;&lt;/code&gt; that acts as a wrapper of an underlying classifier, and tries to match attributes from the training collection into the corresponding ones of the test subset.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;The first method is quite simple, and it just makes use of the &lt;code&gt;-b&lt;/code&gt; option of the WEKA filters. The corresponding command line calls are the next ones:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -b -i smsspam.small.train.arff -o smsspam.small.train.vector.arff -r smsspam.small.test.arff -s smsspam.small.test.vector.arff
&lt;br/&gt;
$&amp;gt; java weka.classifiers.lazy.IBk -t smsspam.small.train.vector.arff -T smsspam.small.test.vector.arff -i -c first
&lt;br/&gt;
...
&lt;br/&gt;
=== Confusion Matrix ===
&lt;br/&gt;
a b &amp;lt;-- classified as
&lt;br/&gt;
1 15 | a = spam
&lt;br/&gt;
0 84 | b = ham&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The second method, conveniently discussed &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;in my previous post&lt;/a&gt;, can be applied with the following call:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.classifiers.meta.FilteredClassifier -t smsspam.small.train.arff -T smsspam.small.test.arff -F weka.filters.unsupervised.attribute.StringToWordVector -W weka.classifiers.lazy.IBk -i -c first
&lt;br/&gt;
...
&lt;br/&gt;
=== Confusion Matrix ===
&lt;br/&gt;
a b &amp;lt;-- classified as
&lt;br/&gt;
1 15 | a = spam
&lt;br/&gt;
0 84 | b = ham&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;As shown in the previous results, both methods achieve the same results. In this case, I have opted for using &lt;code&gt;StringToWordVector&lt;/code&gt; without parameters (default tokenization, term weights, no stemming, etc.) with the relatively weak classifier &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/lazy/IBk.html" target="_blank"&gt;IBk&lt;/a&gt;&lt;/code&gt; , which implements a k-Nearest-Neighbor learner that, instead of building a model from the training collection, searches for the closest training instance to the test instance (&lt;code&gt;k&lt;/code&gt; is 1 by default) and assigns its class to the test instance.&lt;/p&gt;
&lt;p&gt;However, the third method achieves different results, as the mapping involves some attributes from the training collection disappearing, and ignoring new attributes in the test collection. It is called the following way:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -i smsspam.small.train.arff -o smsspam.small.train.vector.arff
&lt;br/&gt;
$&amp;gt; java weka.filters.unsupervised.attribute.StringToWordVector -i smsspam.small.test.arff -o smsspam.small.test.vector.arff
&lt;br/&gt;
$&amp;gt; java weka.classifiers.misc.InputMappedClassifier -W weka.classifiers.lazy.IBk -t smsspam.small.train.vector.arff -T smsspam.small.test.vector.arff -i -c first
&lt;br/&gt;
Attribute mappings:
&lt;br/&gt;
Model attributes Incoming attributes
&lt;br/&gt;
------------------------------ ----------------
&lt;br/&gt;
(nominal) spamclass --&amp;gt; 1 (nominal) spamclass
&lt;br/&gt;
(numeric) #&amp;amp;gt --&amp;gt; 2 (numeric) #&amp;amp;gt
&lt;br/&gt;
(numeric) $1 --&amp;gt; - missing (no match)
&lt;br/&gt;
(numeric) &amp;amp;amp --&amp;gt; - missing (no match)
&lt;br/&gt;
(numeric) &amp;amp;lt --&amp;gt; 6 (numeric) &amp;amp;lt
&lt;br/&gt;
(numeric) *9 --&amp;gt; 7 (numeric) *9
&lt;br/&gt;
(numeric) + --&amp;gt; - missing (no match)
&lt;br/&gt;
(numeric) - --&amp;gt; 8 (numeric) -
&lt;br/&gt;
...
&lt;br/&gt;
=== Confusion Matrix ===
&lt;br/&gt;
a b &amp;lt;-- classified as
&lt;br/&gt;
2 14 | a = spam
&lt;br/&gt;
1 83 | b = ham&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p style="MARGIN-RIGHT: 0px"&gt;In fact, this time we get a bit more spam (2 over 14) with a false positive, although the general accuracy is exactly the same: 85%. You can see how some of the attributes are missing (they do not occur in the test dataset), like: "&lt;code&gt;$1&lt;/code&gt;", "&lt;code&gt;+&lt;/code&gt;", etc. This for sure affects the performance of the classifier, so beware.&lt;/p&gt;
&lt;p&gt;With these options, my recommendation is using the first method, as it allows you to fully examine the representation of the datasets (term weight vectors) and it decouples filtering from training, which may be convenient in terms of efficiency.&lt;/p&gt;
&lt;p&gt;Before ending this post, I have to thank Tiago Pasqualini Silva, &lt;a href="http://www.dt.fee.unicamp.br/~tiago/index.html" target="_blank"&gt;Tiago Almeida&lt;/a&gt; and &lt;a href="http://paginaspersonales.deusto.es/isantos/en/about.shtml" target="_blank"&gt;Igor Santos&lt;/a&gt; for our experiments with the SMS Spam Collection, and to Tiago Pasqualini in particular because he showed me the &lt;code&gt;InputMappedClassifier&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;And last but not least, thanks for reading, and please feel free to leave a comment if you think I can improve this article, or you have questions or suggestions for further articles on this topics!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/QZr69m6tzfs" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/1659637270885805661/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=1659637270885805661&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/1659637270885805661?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/1659637270885805661?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/QZr69m6tzfs/mapping-vocabulary-from-train-to-test.html" title="Mapping Vocabulary from Train to Test Datasets in WEKA Text Classifiers" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/05/mapping-vocabulary-from-train-to-test.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEYAQ3k_eip7ImA9WhBaE0Q.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-7228929015387305231</id><published>2013-04-26T00:46:00.001+02:00</published><updated>2013-05-24T12:42:22.742+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-24T12:42:22.742+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" 
term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Internet" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Filtering" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>URL Text Classification with WEKA, Part 1: Data Analysis</title><content type="html">&lt;p&gt;I have recently come across a website named &lt;a href="http://squidblacklist.org/" target="_blank"&gt;SquidBlackList.org&lt;/a&gt;, which features a number of URL lists for safe web browsing using the open source proxy &lt;a href="http://www.squid-cache.org/" target="_blank"&gt;Squid&lt;/a&gt;. In particular, it features a &lt;a href="http://squidblacklist.org/downloads.html" target="_blank"&gt;quite big porn domains list&lt;/a&gt;, so I wondered: &lt;strong&gt;Is it possible to make a text classification system with &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/" target="_blank"&gt;WEKA&lt;/a&gt; to detect porn domains using the text in the URLs?&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Just to note that the SquidBlackList list on porn (and most of the rest of the lists they provide) is licensed under the Creative Commons Attribution 3.0 Unported License: &lt;span&gt;Blacklists&lt;/span&gt; (&lt;a href="http://www.squidblacklist.org" rel="cc:attributionURL"&gt;Squidblacklist.org&lt;/a&gt;) / &lt;a href="http://creativecommons.org/licenses/by/3.0/" rel="license"&gt;CC BY 3.0&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;big&gt;&lt;strong&gt;The Filtering Problem&lt;/strong&gt;&lt;/big&gt;&lt;/p&gt;
&lt;p&gt;Most &lt;a href="http://en.wikipedia.org/wiki/Content-control_software" target="_blank"&gt;web filtering systems&lt;/a&gt; work by using a manually classified list of URLs into a list of categories that are used to define filtering profiles (e.g. block &lt;em&gt;porn&lt;/em&gt; but allow &lt;em&gt;press&lt;/em&gt;). The URL lists or database must be manually maintained, and it has to be quite comprehensive regarding user browsing behaviour. As (aggregated) web browsing follows a &lt;a href="http://en.wikipedia.org/wiki/Zipf's_law" target="_blank"&gt;Zipfian distribution&lt;/a&gt; (that is, relatively few URLs accumulate most of the traffic), you can provide a rather effective service by ensuring that your URL database covers the most popular URLs. URL-based filtering is rather efficient (if your database is well implemented), and it can easily cover around 95% of the web traffic (in terms of #requests, not in terms of #URLs).&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/zipf.distribution.url.png" style="WIDTH: 450px; DISPLAY: inline; HEIGHT: 342px" height="342" width="450"/&gt;&lt;/p&gt;
&lt;p&gt;However, covering the remaining 5% requires performing some kind of analysis. My target here is dynamically classifying that 5% of web requests (which may account for millions of URLs or even just domains) into two classes: &lt;em&gt;notporn&lt;/em&gt; and &lt;em&gt;porn&lt;/em&gt;. This way, we can cover the 100% of the traffic, and it is likely that we concentrate our classification mistakes (that may be possible at the URL database as well) only into that small 5% - so our filter can be 98% effective or more.&lt;/p&gt;
&lt;p&gt;Why analyzing the URL text? For a matter of &lt;strong&gt;efficiency&lt;/strong&gt; - you do not have to go to the Internet and get the actual &lt;em&gt;Web&lt;/em&gt; content in order to analyze it, so all the processing is local to the proxy and you eventually avoid performing unnecessary Web requests at the proxy itself.&lt;/p&gt;
&lt;p&gt;&lt;big&gt;&lt;strong&gt;Collecting the Dataset&lt;/strong&gt;&lt;/big&gt;&lt;/p&gt;
&lt;p&gt;So we start with an 880k porn domains list, but although it is possible to learn only from positive examples, we may expect better effectiveness if we collect negative examples (not porn domains). A handy resource is the &lt;a href="http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" target="_blank"&gt;Top 1M Sites&lt;/a&gt; list by &lt;a href="http://www.alexa.com/" target="_blank"&gt;Alexa&lt;/a&gt;, a Web research company that provides this ranked list on a daily basis. Having 1M negative examples and 880k positive examples makes a good class balance and quite populated dataset -- nice for learning, especially when its instances are relatively short text sequences (e.g. &lt;code&gt;google.com&lt;/code&gt; vs. &lt;code&gt;porn.com&lt;/code&gt;).&lt;/p&gt;
&lt;p&gt;First we have to make both lists comparable. The format of the Alexa list is &lt;code&gt;&amp;lt;rank&amp;gt;,&amp;lt;domain&amp;gt;&lt;/code&gt;, while the format of the Squid black list is &lt;code&gt;&amp;lt;dot&amp;gt;&amp;lt;domain&amp;gt;&lt;/code&gt; (in order to match the Squid URL list format). A couple of &lt;code&gt;cut&lt;/code&gt; and &lt;code&gt;sed&lt;/code&gt; commands will do the trick.&lt;/p&gt;
&lt;p&gt;Then we can just add the class and mix the lists.&lt;/p&gt;
&lt;p&gt;&lt;big&gt;&lt;strong&gt;Cleaning the Dataset, first step&lt;/strong&gt;&lt;/big&gt;&lt;/p&gt;
&lt;p&gt;But... &lt;em&gt;Hey, Internet is for porn!&lt;/em&gt; -- we should expect that some of the URLs in the Alexa ranking are pornographic. In fact, a simple search demonstrates it:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$ grep porn alexa.csv | more
&lt;br/&gt;
pornhub.com
&lt;br/&gt;
youporn.com
&lt;br/&gt;
...
&lt;br/&gt;
$ grep porn alexa.csv | wc -l
&lt;br/&gt;
5719&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;We can just subtract the porn list from the Alexa list with a handy grep:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;grep -f porn.csv -v alexa.csv &amp;gt; alexaclean.csv&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;But it takes a loooooong time, so I prefer to sort Alexa list, transforming it to Linux format (as the original one has DOS format), and use &lt;code&gt;comm&lt;/code&gt;:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$ sort alexa.csv &amp;gt; alexasorted.csv
&lt;br/&gt;
$ fromdos alexasorted.csv
&lt;br/&gt;
$ comm -23 alexasorted.csv porn.csv &amp;gt; alexaclean.csv
&lt;br/&gt;
$ wc -l alexaclean.csv
&lt;br/&gt;
975088 alexaclean.csv&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Good point, only 25k URLs are pornographic... Well, let's check:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$ grep porn alexaclean.csv | head
&lt;br/&gt;
001porno.com
&lt;br/&gt;
0dayporn.org
&lt;br/&gt;
1000porno.net
&lt;br/&gt;
...&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;So we still have some porn in there.&lt;/p&gt;
&lt;p&gt;&lt;big&gt;&lt;strong&gt;Cleaning the Dataset, second step&lt;/strong&gt;&lt;/big&gt;&lt;/p&gt;
&lt;p&gt;Cleaning the Alexa list of porn is a bit more complex. How to find those popular porn sites, if they are not even in such a comprehensive list as the Squidblacklist one? Another resource comes to help, and it is the &lt;a href="http://www.pornmd.com/" target="_blank"&gt;sex-related search engine PornMD&lt;/a&gt;. This engine has recently published a list of popular porn searches in the form of a dynamic infographic named &lt;a href="http://www.pornmd.com/sex-search" target="_blank"&gt;Global Internet Porn Habits&lt;/a&gt;:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/infography.pornmd.png" style="WIDTH: 450px; DISPLAY: inline; HEIGHT: 319px" height="319" width="450"/&gt;&lt;/p&gt;
&lt;p&gt;So, if you collect a list of the top searches in five of the biggest speaking countries, you get:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/top.searches.porn.png" style="DISPLAY: inline" height="178" width="404"/&gt;&lt;/p&gt;
&lt;p&gt;Cleaning the list from duplicated words, adding "porn", "sex" and "xxx" (rule of thumb), and computing the number of domains they occur in the Alexa (cleaned) and the Squidblacklist lists, we get:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/top.searches.distribution.png" style="DISPLAY: inline" height="338" width="227"/&gt;&lt;/p&gt;
&lt;p&gt;Looking at the list, a relatively safe proportion between the number of occurrences in Squid's versus Alexa's (clean) list is 9 -- this way, we keep most obvious words and remove the most ambiguous ones (although there are some borderline examples, as "asian"). We can see the effects:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$ grep "amateur\|anal\|asian\|creampie\|hentai\|lesbian\|mature\|milf\|squirt\|teen\|porn\|sex\|xxx" alexaclean.csv | wc -l
&lt;br/&gt;
17389
&lt;br/&gt;
$ grep "porn\|sex\|xxx" alexaclean.csv | wc -l
&lt;br/&gt;
12342
&lt;br/&gt;
$ grep -v "amateur\|anal\|asian\|creampie\|hentai\|lesbian\|mature\|milf\|squirt\|teen\|porn\|sex\|xxx" alexaclean.csv &amp;gt; alexacleanfinal.csv
&lt;br/&gt;
$ wc -l alexacleanfinal.csv
&lt;br/&gt;
964735 alexacleanfinal.csv&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;You can see that just "porn", "sex" and "xxx" account for 70.98% of the matched domains (12342 out of 17389), so there is some &lt;strong&gt;domain knowledge&lt;/strong&gt; in the process. I must note I may use another, much more extensive list of porn-related searches like the one featured on the &lt;a href="http://www.pornmd.com/most-popular" target="_blank"&gt;PornMD Most Popular page&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;big&gt;&lt;strong&gt;Additional Analysis&lt;/strong&gt;&lt;/big&gt;&lt;/p&gt;
&lt;p&gt;To get a feeling of how the previous porn-related keywords are distributed across the original Alexa ranking, I have computed the number of lines (domains) they occur in 100k intervals, to get the following chart:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://www.esp.uem.es/jmgomez/blogimg/distribution.keywords.intervals.png" style="WIDTH: 450px; DISPLAY: inline; HEIGHT: 287px" height="287" width="450"/&gt;&lt;/p&gt;
&lt;p&gt;Where &lt;code&gt;#query1&lt;/code&gt; represents the number of occurrences of "porn\|sex\|xxx" and &lt;code&gt;#query2&lt;/code&gt; represents the full list of keywords. The growth is nearly linear with an average of 1234.2 URLs per interval in &lt;code&gt;#query1&lt;/code&gt;, and 1738.9 URLs per interval in &lt;code&gt;#query2&lt;/code&gt;. The curves are smooth, and there are more domains in the first intervals (e.g. 1482 hits in the first 100k Alexa URLs for &lt;code&gt;#query1&lt;/code&gt;) than in the last ones (e.g. 1077 hits in the last 100k Alexa URLs for &lt;code&gt;#query1&lt;/code&gt;).&lt;/p&gt;
&lt;p&gt;There are other dataset statistics that may provide better insights regarding the classification problem, or in other words, that may be more informative or predictive in terms of classification accuracy. For instance:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;What is the length of an average domain name in each category?&lt;/li&gt;
&lt;li&gt;How many dots and/or dashes do domains have on average per category?&lt;/li&gt;
&lt;li&gt;Which is the distribution of different TLDs (&lt;a href="http://en.wikipedia.org/wiki/Top-level_domain" target="_blank"&gt;Top Level Domains&lt;/a&gt;) across both categories?&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Can you imagine any other interesting statistics?&lt;/p&gt;
&lt;p&gt;&lt;big&gt;&lt;strong&gt;The Dataset&lt;/strong&gt;&lt;/big&gt;&lt;/p&gt;
&lt;p&gt;Once we have got the original Squidblacklist and the Alexa cleaned one (after subtraction and removing the lines that hit the keywords), we add some format to get a &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html" target="_blank"&gt;WEKA ARFF&lt;/a&gt; file. For instance, &lt;code&gt;0000free.com&lt;/code&gt; must be transformed into &lt;code&gt;'0000free.com',safe&lt;/code&gt;. A bit of &lt;code&gt;sed&lt;/code&gt; trickery does the job, and then we mix the lists with the following command:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$ paste -d '\n' alexacleanfinal.csv porn.csv &amp;gt; urllist.csv&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The rationale behind mixing the lists is that some learning algorithms are dependent on the order of examples, and for those algorithms it is wise not to present all the examples of one class first and then all those of the other class. As the paste command adds empty lines when one of the lists finishes, we have to remove double newlines (&lt;code&gt;\n\n&lt;/code&gt;) with another &lt;code&gt;sed&lt;/code&gt; call, and we finally add the ARFF header to get a file starting the following way:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;@relation URLs
&lt;br/&gt;
&lt;br/&gt;
@attribute urltext String
&lt;br/&gt;
@attribute class {safe,porn}
&lt;br/&gt;
&lt;br/&gt;
@data
&lt;br/&gt;
'0000free.com',safe
&lt;br/&gt;
'0000000000000000000sex.com',porn
&lt;br/&gt;
'0000.jp',safe
&lt;br/&gt;
'000000000gratisporno.ontheweb.nl',porn
&lt;br/&gt;
...&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;I have left that file named &lt;code&gt;&lt;a href="https://github.com/jmgomezh/tmweka/blob/master/URLAnalysis/urllist.arff" target="_blank"&gt;urllist.arff&lt;/a&gt;&lt;/code&gt; in &lt;a href="https://github.com/jmgomezh/tmweka" target="_blank"&gt;my GitHub folder&lt;/a&gt; for your convenience, so you can start playing with it. Beware, it is over 40Mb.&lt;/p&gt;
&lt;p&gt;So that is all for the moment. Stay tuned for my next steps if you liked this post.&lt;/p&gt;
&lt;p&gt;Thanks for reading, and please feel free to leave a comment if you think I can improve this article, or you have questions or suggestions for further articles on this topic!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/_4RoihFP-vQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/7228929015387305231/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=7228929015387305231&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7228929015387305231?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7228929015387305231?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/_4RoihFP-vQ/url-text-classification-with-weka-part.html" title="URL Text Classification with WEKA, Part 1: Data Analysis" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/04/url-text-classification-with-weka-part.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkEFSXwycSp7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-5066094176813687387</id><published>2013-04-08T09:31:00.001+02:00</published><updated>2013-05-02T09:43:38.299+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-02T09:43:38.299+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category 
scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Programming" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>A Simple Text Classifier in Java with WEKA</title><content type="html">&lt;p&gt;In previous posts [&lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;1&lt;/a&gt;, &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/02/text-mining-in-weka-revisited-selecting.html" target="_blank"&gt;2&lt;/a&gt;, &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/04/command-line-functions-for-text-mining.html" target="_blank"&gt;3&lt;/a&gt;], I have shown how to make use of the &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/" target="_blank"&gt;WEKA&lt;/a&gt; classes &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/meta/FilteredClassifier.html" target="_blank"&gt;FilteredClassifier&lt;/a&gt;&lt;/code&gt; and &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/MultiFilter.html" target="_blank"&gt;MultiFilter&lt;/a&gt;&lt;/code&gt; in order to properly build and evaluate a text classifier using WEKA. For this purpose, I have made use of the &lt;a href="http://www.cse.yorku.ca/course_archive/2008-09/W/4412/ExplorerGuide.pdf" target="_blank"&gt;Explorer&lt;/a&gt; GUI provided by WEKA, and its &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/04/command-line-functions-for-text-mining.html" target="_blank"&gt;command-line interface&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;In my opinion, it is a good idea to get familiar with both the Explorer and the command-line interface if you want to get a feeling of the amazing power of this data mining library. However, where you can take full advantage of its power is in your own Java programs. Now it is time to deal with it.&lt;/p&gt;
&lt;p&gt;Following &lt;a href="http://dl.acm.org/citation.cfm?id=1095427" target="_blank"&gt;Salton&lt;/a&gt;, and &lt;a href="http://dl.acm.org/citation.cfm?id=138861" target="_blank"&gt;Belkin and Croft&lt;/a&gt;, the process of text classification involves two main steps:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Representing your text database in order to enable learning, and to train a classifier on it.&lt;/li&gt;
&lt;li&gt;Using the classifier to predict text labels of new, unseen documents.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The first step is a batch process, in the sense that you can do it periodically (as long as your labelled data set gets improved with time -- bigger sizes, new labels or categories, corrected predictions via user feedback). The second step is actually the moment in which you take advantage of the knowledge distilled by the learning process, and it is online in the sense that it is done on demand (when new documents arrive). This distinction is conceptual, as modern text classifiers retrain on the added documents as soon as they get them, in order to keep or improve accuracy with time.&lt;/p&gt;
&lt;p&gt;In consequence, what we need to demonstrate the text classification process is &lt;strong&gt;two programs&lt;/strong&gt;: one to &lt;strong&gt;learn&lt;/strong&gt; from the text dataset, and another to use the learnt model to &lt;strong&gt;classify&lt;/strong&gt; new documents. Let us start showing a very simple text learner in Java, using WEKA. The class is named &lt;code&gt;&lt;a href="http://www.esp.uem.es/jmgomez/tmweka/MyFilteredLearner.java" target="_blank"&gt;MyFilteredLearner.java&lt;/a&gt;&lt;/code&gt; , and its &lt;code&gt;main()&lt;/code&gt; method demonstrates its usage, which involves:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Loading the text dataset.&lt;/li&gt;
&lt;li&gt;Evaluating the classifier.&lt;/li&gt;
&lt;li&gt;Training the classifier.&lt;/li&gt;
&lt;li&gt;Storing the classifier.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;The most interesting parts of the process are:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;We read the dataset by simply using the method &lt;code&gt;getData()&lt;/code&gt; of an &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/converters/ArffLoader.ArffReader.html" target="_blank"&gt;ArffReader&lt;/a&gt;&lt;/code&gt; object that wraps a &lt;code&gt;BufferedReader&lt;/code&gt;.&lt;/li&gt;
&lt;li&gt;We programmatically create the classifier by combining a &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;StringToWordVector&lt;/a&gt;&lt;/code&gt; filter (in order to represent the texts as feature vectors) and a &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/bayes/NaiveBayes.html" target="_blank"&gt;NaiveBayes&lt;/a&gt;&lt;/code&gt; classifier (for learning), using the &lt;code&gt;FilteredClassifier&lt;/code&gt; class discussed in previous posts.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The process of creating the classifier is demonstrated in the next code snippet:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;trainData.setClassIndex(0);
&lt;br/&gt;
filter = new StringToWordVector();
&lt;br/&gt;
filter.setAttributeIndices("last");
&lt;br/&gt;
classifier = new FilteredClassifier();
&lt;br/&gt;
classifier.setFilter(filter);
&lt;br/&gt;
classifier.setClassifier(new NaiveBayes());&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;So we set the class of the dataset as being the first attribute, then we create the filter and set the attribute to be transformed from text into a feature vector (the last one), and then we create the &lt;code&gt;FilteredClassifier&lt;/code&gt; object and add the previous filter and a new &lt;code&gt;NaiveBayes&lt;/code&gt; classifier to it. Given the attributes above, the dataset has to have the class as the first attribute, and the text as the second (and last) one, as in my usual SMS spam subset example.&lt;/p&gt;
&lt;p&gt;You can execute this class with the following commands to get the following output:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$&amp;gt;javac MyFilteredLearner.java
&lt;br/&gt;
$&amp;gt;java MyFilteredLearner smsspam.small.arff myClassifier.dat
&lt;br/&gt;
===== Loaded dataset: smsspam.small.arff =====
&lt;br/&gt;
&lt;br/&gt;
Correctly Classified Instances 187 93.5 %
&lt;br/&gt;
Incorrectly Classified Instances 13 6.5 %
&lt;br/&gt;
Kappa statistic 0.7277
&lt;br/&gt;
Mean absolute error 0.0721
&lt;br/&gt;
Root mean squared error 0.2568
&lt;br/&gt;
Relative absolute error 25.8792 %
&lt;br/&gt;
Root relative squared error 69.1763 %
&lt;br/&gt;
Coverage of cases (0.95 level) 94 %
&lt;br/&gt;
Mean rel. region size (0.95 level) 51.75 %
&lt;br/&gt;
Total Number of Instances 200
&lt;br/&gt;
&lt;br/&gt;
=== Detailed Accuracy By Class ===
&lt;br/&gt;
&lt;br/&gt;
TP Rate FP Rate Precision Recall F-Measure MCC ROC Area PRC Area Class
&lt;br/&gt;
0,636 0,006 0,955 0,636 0,764 0,748 0,943 0,858 spam
&lt;br/&gt;
0,994 0,364 0,933 0,994 0,962 0,748 0,943 0,986 ham
&lt;br/&gt;
Weighted Avg. 0,935 0,305 0,936 0,935 0,930 0,748 0,943 0,965
&lt;br/&gt;
===== Evaluating on filtered (training) dataset done =====
&lt;br/&gt;
===== Training on filtered (training) dataset done =====
&lt;br/&gt;
===== Saved model: myClassifier.dat =====&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The evaluation has been performed with default values except for the number of folds, that has been set to 4 as shown in the next code snippet:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;Evaluation eval = new Evaluation(trainData);
&lt;br/&gt;
eval.crossValidateModel(classifier, trainData, 4, new Random(1));
&lt;br/&gt;
System.out.println(eval.toSummaryString());&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;In case you do not want to evaluate the classifier on the training data, you can omit the call to the &lt;code&gt;evaluate()&lt;/code&gt; method.&lt;/p&gt;
&lt;p&gt;Now let us deal with the classification program, which is far more complex but only for the process of creating an instance. The class is named &lt;code&gt;&lt;a href="http://www.esp.uem.es/jmgomez/tmweka/MyFilteredClassifier.java" target="_blank"&gt;MyFilteredClassifier.java&lt;/a&gt;&lt;/code&gt; , and its &lt;code&gt;main()&lt;/code&gt; method demonstrates its usage, which involves:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Reading the text to be classified from a file.&lt;/li&gt;
&lt;li&gt;Reading the model or classifier from a file.&lt;/li&gt;
&lt;li&gt;Creating the instance.&lt;/li&gt;
&lt;li&gt;Classifying it.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;Creating the instance is performed in the &lt;code&gt;makeInstance()&lt;/code&gt; method, and its code is the following one:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;// Create the attributes, class and text
&lt;br/&gt;
FastVector fvNominalVal = new FastVector(2);
&lt;br/&gt;
fvNominalVal.addElement("spam");
&lt;br/&gt;
fvNominalVal.addElement("ham");
&lt;br/&gt;
Attribute attribute1 = new Attribute("class", fvNominalVal);
&lt;br/&gt;
Attribute attribute2 = new Attribute("text",(FastVector) null);
&lt;br/&gt;
// Create list of instances with one element
&lt;br/&gt;
FastVector fvWekaAttributes = new FastVector(2);
&lt;br/&gt;
fvWekaAttributes.addElement(attribute1);
&lt;br/&gt;
fvWekaAttributes.addElement(attribute2);
&lt;br/&gt;
instances = new Instances("Test relation", fvWekaAttributes, 1);
&lt;br/&gt;
// Set class index
&lt;br/&gt;
instances.setClassIndex(0);
&lt;br/&gt;
// Create and add the instance
&lt;br/&gt;
DenseInstance instance = new DenseInstance(2);
&lt;br/&gt;
instance.setValue(attribute2, text);
&lt;br/&gt;
// instance.setValue((Attribute)fvWekaAttributes.elementAt(1), text);
&lt;br/&gt;
instances.add(instance);&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The classifier learnt with &lt;code&gt;MyFilteredLearner.java&lt;/code&gt; expects that an instance has two attributes: the first one is the class, it is a nominal one with values &lt;code&gt;"spam"&lt;/code&gt; or &lt;code&gt;"ham"&lt;/code&gt;; the second one is a &lt;code&gt;String&lt;/code&gt;, which is the text to be classified. Instead of creating one instance, we create a whole new dataset whose first instance is the one that we want to classify. This is required in order to let the classifier know the schema of the dataset, which is stored in the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/Instances.html" target="_blank"&gt;Instances&lt;/a&gt;&lt;/code&gt; object (and not in each instance).&lt;/p&gt;
&lt;p&gt;So first we create the attributes by using the &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/FastVector.html" target="_blank"&gt;FastVector&lt;/a&gt;&lt;/code&gt; class provided by WEKA. The case of the nominal attribute (&lt;code&gt;"class"&lt;/code&gt;) is relatively simple, but the case of the &lt;code&gt;String&lt;/code&gt; one is a bit more complex because it requires the second argument of the constructor to be &lt;code&gt;null&lt;/code&gt;, but cast to &lt;code&gt;FastVector&lt;/code&gt;. Then we create an &lt;code&gt;Instances&lt;/code&gt; object by using a &lt;code&gt;FastVector&lt;/code&gt; to store the two previous attributes, and set the class index to 0 (which means that the first attribute will be the class). As a note, the &lt;code&gt;FastVector&lt;/code&gt; class is deprecated in the WEKA development version.&lt;/p&gt;
&lt;p&gt;The last step is to create an actual instance. I am using the WEKA development version in this code (as of the date of this post), so we have to use a &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/core/DenseInstance.html" target="_blank"&gt;DenseInstance&lt;/a&gt;&lt;/code&gt; object. However, if you make use of the stable version, then you can use &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.stable/weka/core/Instance.html" target="_blank"&gt;Instance&lt;/a&gt;&lt;/code&gt; (link to the stable version doc), and must change this code to:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;Instance instance = new Instance(2);&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;As a note, I have commented in the code a different way of setting the value of the second attribute. I must note that we do not set the value of the first attribute, as it is unknown.&lt;/p&gt;
&lt;p&gt;The rest of the methods are (more or less) straightforward if you follow the documentation (&lt;a href="http://weka.wikispaces.com/Programmatic+Use" target="_blank"&gt;weka - Programmatic Use&lt;/a&gt;, and &lt;a href="http://weka.wikispaces.com/Use+WEKA+in+your+Java+code" target="_blank"&gt;weka - Use WEKA in your Java code&lt;/a&gt;). You get the class prediction on your text with the following lines:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;double pred = classifier.classifyInstance(instances.instance(0));
&lt;br/&gt;
System.out.println("Class predicted: " + instances.classAttribute().value((int) pred));&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;And if you feed this classifier with a file ( &lt;code&gt;&lt;a href="http://www.esp.uem.es/jmgomez/tmweka/smstest.txt" target="_blank"&gt;smstest.txt&lt;/a&gt;&lt;/code&gt; ) that stores the text &lt;code&gt;"this is spam or not, who knows?"&lt;/code&gt;, and the model learnt with &lt;code&gt;MyFilteredLearner.java&lt;/code&gt; (that is stored in &lt;code&gt;myClassifier.dat&lt;/code&gt;), then you get the following result:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$&amp;gt;javac MyFilteredClassifier.java
&lt;br/&gt;
$&amp;gt;java MyFilteredClassifier smstest.txt myClassifier.dat
&lt;br/&gt;
===== Loaded text data: smstest.txt =====
&lt;br/&gt;
this is spam or not, who knows?
&lt;br/&gt;
===== Loaded model: myClassifier.dat =====
&lt;br/&gt;
===== Instance created with reference dataset =====
&lt;br/&gt;
@relation 'Test relation'
&lt;br/&gt;
&lt;br/&gt;
@attribute class {spam,ham}
&lt;br/&gt;
@attribute text string
&lt;br/&gt;
&lt;br/&gt;
@data
&lt;br/&gt;
?,' this is spam or not, who knows?'
&lt;br/&gt;
===== Classified instance =====
&lt;br/&gt;
Class predicted: ham&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;It is interesting to see that the class assigned to the instance before classifying it is &lt;code&gt;"?"&lt;/code&gt;, which means &lt;em&gt;undefined&lt;/em&gt; or &lt;em&gt;unknown&lt;/em&gt;.&lt;/p&gt;
&lt;p&gt;For those interested in using the classifiers discussed in my previous posts (I mean including &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/supervised/attribute/AttributeSelection.html" target="_blank"&gt;AttributeSelection&lt;/a&gt;&lt;/code&gt; , and using &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/rules/PART.html" target="_blank"&gt;PART&lt;/a&gt;&lt;/code&gt; and &lt;code&gt;&lt;a href="http://weka.sourceforge.net/doc.dev/weka/classifiers/functions/SMO.html" target="_blank"&gt;SMO&lt;/a&gt;&lt;/code&gt; as classifiers), the only parts of this code that you have to change are the &lt;code&gt;learn()&lt;/code&gt; and &lt;code&gt;evaluate()&lt;/code&gt; methods in &lt;code&gt;MyFilteredLearner.java&lt;/code&gt;. Just play with it, and have fun.&lt;/p&gt;
&lt;p&gt;Thanks for reading, and please feel free to leave a comment if you think I can improve this article, or you have questions or suggestions for futher articles on this topic!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/iVZclG19ki0" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/5066094176813687387/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=5066094176813687387&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/5066094176813687387?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/5066094176813687387?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/iVZclG19ki0/a-simple-text-classifier-in-java-with.html" title="A Simple Text Classifier in Java with WEKA" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/04/a-simple-text-classifier-in-java-with.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkAHQXY8cCp7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-2381881892090903032</id><published>2013-04-01T18:21:00.000+02:00</published><updated>2013-05-02T09:45:30.878+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-02T09:45:30.878+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Opensource" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Command Line Functions for Text Mining in WEKA</title><content type="html">&lt;p&gt;In previous posts I have explained &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;how to chain filters and classifiers in WEKA&lt;/a&gt;, in order to avoid incorrect results when evaluating text classifiers by using cross-fold validation, and &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/02/text-mining-in-weka-revisited-selecting.html" target="_blank"&gt;how to integrate feature selection in the text classification process&lt;/a&gt;. For this purpose, I have used the &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/meta/FilteredClassifier.html" target="_blank"&gt;FilteredClassifier&lt;/a&gt; and the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/MultiFilter.html"&gt;MultiFilter&lt;/a&gt; in the &lt;a href="http://www.cse.yorku.ca/course_archive/2008-09/W/4412/ExplorerGuide.pdf" target="_blank"&gt;Explorer&lt;/a&gt; GUI provided by &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/" target="_blank"&gt;WEKA&lt;/a&gt;. Now it is time to do so in the command line.&lt;/p&gt;
&lt;p&gt;WEKA essentially provides three usage modes:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Using the Explorer, and other GUIs like the &lt;a href="http://www.cse.yorku.ca/course_archive/2006-07/W/4412/doc/weka/ExperimenterTutorial-3.5.5.pdf" target="_blank"&gt;Experimenter&lt;/a&gt;, which allow to setup experiments and to examine the results graphically.&lt;/li&gt;
&lt;li&gt;Using the command line functions, which allow to setup filters, classifiers and clusterers with plenty of configuration options.&lt;/li&gt;
&lt;li&gt;Using the classes programmatically, that is, in your own programs in Java.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;One major difference between modes 1 and 2 is that in the first mode, you spend some of the memory in the GUI, while in the second one, you do not. That can be a significant difference when you load big datasets. In both cases you can control the memory assigned to WEKA using Java command line options like &lt;code&gt;-Xms&lt;/code&gt;, &lt;code&gt;-Xmx&lt;/code&gt; and so on, but it may be interesting to save the memory used in the graphic elements in order to be able to deal with bigger datasets.&lt;/p&gt;
&lt;p&gt;I will deal with the usage of WEKA in your programs in the future, in this post I focus on the command line. Before trying the following examples, please ensure &lt;code&gt;weka.jar&lt;/code&gt; is added to your &lt;code&gt;CLASSPATH&lt;/code&gt;. The first thing we must know is that WEKA filters and classifiers can be called in the command line, and that the call without arguments will show their configuration options. For instance, when you call a rule learner like &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/rules/PART.html" target="_blank"&gt;PART&lt;/a&gt; (which I used in my previous posts), you get the following options:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt;java weka.classifiers.rules.PART
&lt;br/&gt;
Weka exception: No training file and no object input file given.
&lt;br/&gt;
General options:
&lt;br/&gt;
-h or -help
&lt;br/&gt;
Output help information.
&lt;br/&gt;
-synopsis or -info
&lt;br/&gt;
Output synopsis for classifier (use in conjunction with -h)
&lt;br/&gt;
-t &amp;lt;name of training file&amp;gt;
&lt;br/&gt;
Sets training file.
&lt;br/&gt;
-T &amp;lt;name of test file&amp;gt;
&lt;br/&gt;
Sets test file. If missing, a cross-validation will be performed
&lt;br/&gt;
on the training data.
&lt;br/&gt;
...
&lt;br/&gt;
Options specific to weka.classifiers.rules.PART:
&lt;br/&gt;
-C &amp;lt;pruning confidence&amp;gt;
&lt;br/&gt;
Set confidence threshold for pruning.
&lt;br/&gt;
(default 0.25)
&lt;br/&gt;
...&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;I omit the full list of options. Options are divided into two groups, those that are accepted by any classifier and those specific to the PART classifier. General options include three usage modes:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Evaluating the classifier on the training collection itself, possibly using cross validation, or on a test collection.&lt;/li&gt;
&lt;li&gt;Training a classifier and storing the model in a file for further use.&lt;/li&gt;
&lt;li&gt;Training a classifier and getting its output (classification of instances) on a test collection.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;However, when calling a filter in the command line, the input file (the dataset) is read from the standard input, so you have to redirect the input from your file by using the appropriate operator (&lt;code&gt;&amp;lt;&lt;/code&gt;), or to use the option &lt;code&gt;-h&lt;/code&gt; to get the options of the filter.&lt;/p&gt;
&lt;p&gt;In my previous post on chaining filters and classifiers, I performed an experiment running a PART classifier on an ARFF-formatted subset of the &lt;a href="http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/" target="_blank"&gt;SMS Spam Collection&lt;/a&gt;, namely the &lt;code&gt;smsspam.small.arff&lt;/code&gt; file. As every instance is of the form &lt;code&gt;[spam|ham],"message text"&lt;/code&gt;, we have to transform the text of the message into a term weight vector by using the StringToWordVector filter. You can combine the filter and the classifier evaluation into one command by using the FilteredClassifier class as in the following command:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;$&amp;gt;java weka.classifiers.meta.FilteredClassifier -t smsspam.small.arff -c 1 -x 3 -v -o -F weka.filters.unsupervised.attribute.StringToWordVector -W weka.classifiers.rules.PART&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;To get the following output:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;=== Stratified cross-validation ===
&lt;br/&gt;
Correctly Classified Instances 173 86.5 %
&lt;br/&gt;
Incorrectly Classified Instances 27 13.5 %
&lt;br/&gt;
Kappa statistic 0.4181
&lt;br/&gt;
Mean absolute error 0.1625
&lt;br/&gt;
Root mean squared error 0.3523
&lt;br/&gt;
Relative absolute error 58.2872 %
&lt;br/&gt;
Root relative squared error 94.9031 %
&lt;br/&gt;
Total Number of Instances 200
&lt;br/&gt;
&lt;br/&gt;
=== Confusion Matrix ===
&lt;br/&gt;
&lt;br/&gt;
a b &amp;lt;-- classified as
&lt;br/&gt;
13 20 | a = spam
&lt;br/&gt;
7 160 | b = ham&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Which is exactly the one I showed &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;in my previous post&lt;/a&gt;. I have used the following general options:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;code&gt;-t smsspam.small.arff&lt;/code&gt; to specify the dataset to train (and by default, to evaluate on by using cross-validation).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;-c 1&lt;/code&gt; to specify the first attribute as the class.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;-x 3&lt;/code&gt; to specify that the number of folds to be used in the cross-validation evaluation is 3.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;-v&lt;/code&gt; and &lt;code&gt;-o&lt;/code&gt; to avoid outputting the statistics on the training collection and the classifier itself, respectively.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Plus the specific options of the FilteredClassifier &lt;code&gt;-F&lt;/code&gt; to define the filter, and &lt;code&gt;-W&lt;/code&gt; to define the classifier.&lt;/p&gt;
&lt;p&gt;In my subsequent post on chaining filters, I proposed to make use of attribute selection to improve the representation of our learning problem. This can be done by issuing the following command:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;$&amp;gt;java weka.classifiers.meta.FilteredClassifier -t smsspam.small.arff -c 1 -x 3 -v -o -F "weka.filters.MultiFilter -F weka.filters.unsupervised.attribute.StringToWordVector -F \"weka.filters.supervised.attribute.AttributeSelection -E weka.attributeSelection.InfoGainAttributeEval -S \\\"weka.attributeSelection.Ranker -T 0.0\\\"\"" -W weka.classifiers.rules.PART&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;To get the following output:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;code&gt;&lt;code&gt;=== Stratified cross-validation ===
&lt;br/&gt;
Correctly Classified Instances 167 83.5 %
&lt;br/&gt;
Incorrectly Classified Instances 33 16.5 %
&lt;br/&gt;
Kappa statistic 0.1959
&lt;br/&gt;
Mean absolute error 0.1967
&lt;br/&gt;
Root mean squared error 0.38
&lt;br/&gt;
Relative absolute error 70.53 %
&lt;br/&gt;
Root relative squared error 102.3794 %
&lt;br/&gt;
Total Number of Instances 200
&lt;br/&gt;
&lt;br/&gt;
=== Confusion Matrix ===
&lt;br/&gt;
&lt;br/&gt;
a b &amp;lt;-- classified as
&lt;br/&gt;
6 27 | a = spam
&lt;br/&gt;
6 161 | b = ham&lt;/code&gt;&lt;/code&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Which, in turn, is the same I got &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/02/text-mining-in-weka-revisited-selecting.html" target="_blank"&gt;in that post&lt;/a&gt;. If we replace PART by the &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/functions/SMO.html" target="_blank"&gt;SMO&lt;/a&gt; implementation of Support Vector Machines included in WEKA (by changing &lt;code&gt;weka.classifiers.rules.PART&lt;/code&gt; to &lt;code&gt;weka.classifiers.functions.SMO&lt;/code&gt;), we get the accuracy figure of 91%, as described in the post.&lt;/p&gt;
&lt;p&gt;While most of the options are the same as in the previous command, two things deserve special attention in this one:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;We chain the &lt;a href="http://weka.sourceforge.net/doc/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;StringToWordVector&lt;/a&gt; and the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/supervised/attribute/AttributeSelection.html" target="_blank"&gt;AttributeSelection&lt;/a&gt; filters by using the MultiFilter described in the previous post. The order of calls is obviously relevant, as we first need to tokenize the messages into words, and then select the most informative words. Moreover, while we apply StringToWordVector with the default options, the AttributeSelection filter makes use of the &lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/InfoGainAttributeEval.html" target="_blank"&gt;InfoGainAttributeEval&lt;/a&gt; function as quality metric, and the &lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/Ranker.html" target="_blank"&gt;Ranker&lt;/a&gt; class as the search method. The Ranker class is applied with the option &lt;code&gt;-T 0.0&lt;/code&gt; in order to specify that the filter has to rank the attributes (words or tokens) according to the quality metric, but to keep only those whose score is over the threshold defined by T, that is 0.0.&lt;/li&gt;
&lt;li&gt;As the order of options is not relevant, it is required to link the options to the appropriate class by using the quotation mark symbol ("). Unfortunately, we have three nested expressions:&lt;/li&gt;
&lt;li style="LIST-STYLE-TYPE: none"&gt;
&lt;ul class="noindent"&gt;
&lt;li&gt;The whole MultiFilter filter, enclosed by the isolated quotation marks (").&lt;/li&gt;
&lt;li&gt;The AttributeSelection filter, enclosed by the escaped quotation mark (\").&lt;/li&gt;
&lt;li&gt;The Ranker search method, enclosed by the double escaped quotation mark (\\\"). Here we escape the escape symbol itself (\) along with the quotation mark.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li style="LIST-STYLE-TYPE: none"&gt;So many escaping symbols make it a bit &lt;em&gt;dirty&lt;/em&gt;, but still functional.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;So I have shown how we can chain filters and classifiers, and apply several chained filters as well, in the command line. In next posts I will explain how to train, store and then evaluate a classifier by using the command line, and how to make use of WEKA filters and classifiers in your Java programs.&lt;/p&gt;
&lt;p&gt;Thanks for reading, and please feel free to leave a comment if you think I can improve this article!&lt;/p&gt;
&lt;p&gt;NOTE: You can find the collection I used in this post, along with other stuff related to WEKA and text mining in my &lt;a href="http://www.esp.uem.es/jmgomez/tmweka/" target="_blank"&gt;Text Mining in WEKA&lt;/a&gt; page.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/2VzrjQ5GuCQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/2381881892090903032/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=2381881892090903032&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/2381881892090903032?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/2381881892090903032?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/2VzrjQ5GuCQ/command-line-functions-for-text-mining.html" title="Command Line Functions for Text Mining in WEKA" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/04/command-line-functions-for-text-mining.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ck8FQn0zfip7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-6652486140098698861</id><published>2013-02-11T10:50:00.001+01:00</published><updated>2013-05-02T09:46:53.386+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-02T09:46:53.386+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" 
term="Opensource" /><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Text Mining in WEKA Revisited: Selecting Attributes by Chaining Filters</title><content type="html">&lt;p&gt;Two weeks ago, I wrote &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;a post on how to chain filters and classifiers in WEKA&lt;/a&gt;, in order to avoid misleading results when performing experiments with text collections. The issue was that, when using &lt;a href="http://en.wikipedia.org/wiki/Cross-validation" target="_blank"&gt;N Fold Cross Validation&lt;/a&gt; (CV) in your data, you should not apply the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;StringToWordVector&lt;/a&gt; (STWV) filter on the full data collection and then perform the CV evaluation on your data, because you would be using words that are present in your test subset (but not in your training subset) for each run. Moreover, the STWV filter can extract and use simple statistics to filter out the terms (e.g. minimum number of occurrences), but those statistics over the full collection are not valid because in each CV run you use only a subset of it.&lt;/p&gt;
&lt;p&gt;Now I would like to deal with a more general setting in which you want to apply &lt;strong&gt;&lt;a href="http://en.wikipedia.org/wiki/Dimension_reduction" target="_blank"&gt;dimensionality reduction&lt;/a&gt;&lt;/strong&gt; because, in general text classification tasks, the documents or examples are represented by hundreds (if not thousands) of tokens, what makes the classification problem very hard for many learners. In &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/" target="_blank"&gt;WEKA&lt;/a&gt;, this involves using the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/supervised/attribute/AttributeSelection.html" target="_blank"&gt;AttributeSelection&lt;/a&gt; filter along with the STWV one. Before thinking about dimensionality reduction, we must reflect a bit about it.&lt;/p&gt;
&lt;p&gt;Dimensionality reduction is a typical step in many data mining problems, which involves transforming our data representation (the schema of our table, the list of current attributes) into a shorter, more compact, and hopefully, more predictive one. Basically, this can be done in two ways:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;With &lt;strong&gt;feature reduction&lt;/strong&gt;, which maps the original representation (list of attributes) onto a new and more compact one. The new attributes are synthetic, that is, they somehow combine the information from subsets of the original ones which share statistical properties. Typical feature reduction techniques include algebraic analysis methods like &lt;a href="http://en.wikipedia.org/wiki/Principal_component_analysis" target="_blank"&gt;Principal Component Analysis&lt;/a&gt; (PCA) and &lt;a href="http://en.wikipedia.org/wiki/Singular_value_decomposition" target="_blank"&gt;Singular Value Decomposition&lt;/a&gt; (SVD). In text analysis, the most popular method is, by far, &lt;a href="http://en.wikipedia.org/wiki/Latent_semantic_indexing" target="_blank"&gt;Latent Semantic Analysis&lt;/a&gt;, which involves obtaining the principal components or buckets into the term-to-document sparse matrix.&lt;/li&gt;
&lt;li&gt;With &lt;strong&gt;feature selection&lt;/strong&gt;, which just selects a subset of the original representation attributes, according to some Information Theory quality metric like &lt;a href="http://en.wikipedia.org/wiki/Information_gain_in_decision_trees" target="_blank"&gt;Information Gain&lt;/a&gt; or &lt;a href="http://en.wikipedia.org/wiki/Chi-squared_distribution" target="_blank"&gt;X^2 (Chi-Square)&lt;/a&gt;. This method can be far more simple and less time consuming than the previous one, as you only have to compute the value of the metric for each attribute, and rank the attributes. Then you simply decide a threshold in the metric (e.g. 0 for Information Gain) and keep the attributes with a value over it. Alternatively, you can choose a percentage of the number of original attributes (e.g. 1% and 10% are typical numbers in text classification), and just keep those top ranking ones. However, there are other more time consuming alternatives, like exploring the predictive power of subsets of attributes using search algorithms.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;A major difference between both methods is that feature reduction leads to &lt;em&gt;synthetic&lt;/em&gt; attributes, but feature selection just keeps some of the original ones. This may affect the ability of the data scientist to understand the results, as synthetic attributes can be statistically relevant but meaningless. Another difference is that feature reduction does not make use of the &lt;em&gt;class information&lt;/em&gt;, while feature selection does. In consequence, the second method is very likely to lead to a more predictive subset of attributes than the original one. But beware, more theoretical predictive power does not always mean more effectiveness. I recommend reading the old (?) but always helpful &lt;a href="http://dl.acm.org/citation.cfm?id=657137" target="_blank"&gt;paper by Yiming Yang &amp;amp; Jan Pedersen&lt;/a&gt; on the topic.&lt;/p&gt;
&lt;p&gt;The WEKA package supports both methods, mainly with the &lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/PrincipalComponents.html" target="_blank"&gt;weka.attributeSelection.PrincipalComponents&lt;/a&gt; (feature reduction) and &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/supervised/attribute/AttributeSelection.html" target="_blank"&gt;weka.filters.supervised.attribute.AttributeSelection&lt;/a&gt; (feature selection) filters. But an important question is: Do you really need to make dimensionality reduction in text analysis? There are two clear arguments against it:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Some algorithms are not hurt by using all the features, even if they are really many and very sparse. For instance, Support Vector Machines excel in text classification problems exactly for that: they are able to deal with thousands of attributes, and they get better results when no reduction is performed. A typical text classification problem in which dimensionality reduction can be a big mistake is spam filtering.&lt;/li&gt;
&lt;li&gt;If it is a matter of computing time, like e.g. in symbolic learners like decision trees (C4.5) or rules (Ripper), then there is no worry. Big Data techniques come to help, as you can configure cheap and big clusters over e.g. Hadoop to perform your computations!&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;But having the algorithms in my favourite data analysis package, and knowing that sometimes they lead to effectiveness improvements, why not use them?&lt;/p&gt;
&lt;p&gt;Because of the reasons above, I will focus on feature selection. In consequence, I will deal with the AttributeSelection filter, leaving the PrincipalComponents one for another post. Let us start with the same text collection that I used in my previous post about chaining filters and classifiers in WEKA. It is a small subset of the &lt;a href="http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/" target="_blank"&gt;SMS Spam Collection&lt;/a&gt;, made with the first 200 messages for brevity and simplicity.&lt;/p&gt;
&lt;p&gt;Our goal is to perform a 3-fold CV experiment with any algorithm in WEKA. But, in order to do it correctly, we know we must chain the STWV filter with the classifier by using the FilteredClassifier learner in WEKA. However, we want to perform feature selection as well, and the FilteredClassifier allows us to chain a single filter and a single classifier. So, how to combine both the STWV and the AttributeSelection filters into a single one?&lt;/p&gt;
&lt;p&gt;Let us start doing it manually. After loading the dataset into the WEKA Explorer, applying the STWV filter with the default settings, and setting the class attribute to the "spamclass" one, we get something like this:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-aVqAh2gsXS0/URirr3HJB5I/AAAAAAAABlw/Chtd-kGNXGs/s800/weka01.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;Now we can either go to the "Select attributes" tab, or just stay in the "Preprocess" tab and choose the AttributeSelection filter. I opt for the second way, so you can browse the filters folder by clicking on the "Choose" button at the "Filters" area. After selecting the "weka &amp;gt; filters &amp;gt; supervised &amp;gt; attribute &amp;gt; AttributeSelection", you can see the selected filter in the "Filters" area, as shown in the next picture:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh6.googleusercontent.com/-Ru7jWvVqFc8/URirsCXFAvI/AAAAAAAABl8/zZdeU8KMQgI/s800/weka02.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;In order to set up the filter, we can click on the name of the filter. The "weka.gui.GenericObjectEditor" window we get is a generic window that allows to configure filters, classifiers, etc. according to a number of object-defined properties. In this case, it allows us to set up the AttributeSelection filter configuration options, which are:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The &lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/AttributeEvaluator.html" target="_blank"&gt;evaluator&lt;/a&gt;, which is the quality metric we use to evaluate the predictive properties of an attribute or a set of them. There you can choose among a wide number of them (which depends on your WEKA version), including specially Chi Square (&lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/ChiSquaredAttributeEval.html" target="_blank"&gt;ChiSquaredAttributeEval&lt;/a&gt;), Information Gain (&lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/InfoGainAttributeEval.html" target="_blank"&gt;InfoGainAttributeEval&lt;/a&gt;), and Gain Ratio (&lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/GainRatioAttributeEval.html" target="_blank"&gt;GainRatioAttributeEval&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;The &lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/ASSearch.html" target="_blank"&gt;search algorithm&lt;/a&gt;, which is the way we will select the remaining group of attributes, and includes very clever but time consuming group search algorithms, and my favourite one, the Ranker (&lt;a href="http://weka.sourceforge.net/doc/weka/attributeSelection/Ranker.html" target="_blank"&gt;weka.attributeSelection.Ranker&lt;/a&gt;). This one just ranks the attributes according to the chosen quality metric, and keeps those meeting some criterion (like e.g. having a value over a predefined threshold).&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;In the next picture, you can see the AttributeSelection configuration window with the evaluator set up to Information Gain, and the search set up as Ranker, with the default options.&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-T1b2VbGK7j8/URirsViyp5I/AAAAAAAABl0/kw5Up1j3vi4/s465/weka03.PNG" style="TEXT-ALIGN: center; WIDTH: 350px; DISPLAY: block; HEIGHT: 185px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="185" width="350"/&gt;&lt;/p&gt;
&lt;p&gt;The Ranker search method has two main properties:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The &lt;em&gt;numToSelect&lt;/em&gt; property, which defines the number of attributes to keep, an Integer number that is -1 (all) by default.&lt;/li&gt;
&lt;li&gt;The &lt;em&gt;threshold&lt;/em&gt; property, which defines the minimum value that an attribute has to get in the evaluator in order to be kept. The default value for this property is the lowest (most negative) double value in Java, so that no attribute is discarded by default.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;In consequence, if we want to keep those attributes scoring over 0, we have just to write that number in the threshold area of the window we get when we click on the Ranker at the previous window:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-i_NBv6nmAmI/URirs2mLekI/AAAAAAAABmE/rJ1zMMRhEz8/s435/weka04.PNG" style="TEXT-ALIGN: center; WIDTH: 350px; DISPLAY: block; HEIGHT: 247px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="247" width="350"/&gt;&lt;/p&gt;
&lt;p&gt;By clicking OK on all the previous windows, we get a configuration of the AttributeSelection filter which involves keeping those attributes with Information Gain score over 0. If we apply that filter to our current collection, we get the following result:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh6.googleusercontent.com/-beT4BfclLfM/URirtSCg3VI/AAAAAAAABmI/DiGsDjn0l7U/s800/weka05.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;As you can see, we get a ranked list of 82 attributes (plus the class one), in which the top scoring attribute is the token "to". This attribute occurs in 69 messages (value 1), but many of them are spam ones, so it is quite predictive for this particular class. We can see as well that we only keep 5.93% of the original attributes (82 over 1382).&lt;/p&gt;
&lt;p&gt;Now we can go to the "Classify" tab and select the rule learner PART ("weka &amp;gt; classifiers &amp;gt; rules &amp;gt; PART") to be evaluated on the training collection itself ("Test options" area, "Use training set option"), getting the next result:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh3.googleusercontent.com/--5_RJkc4KcU/URirtrncGAI/AAAAAAAABmY/qKJN90Unj58/s800/weka06.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 300px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="300" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;We get an accuracy of 95.5%, much better than &lt;a href="http://jmgomezhidalgo.blogspot.com.es/2013/01/text-mining-in-weka-chaining-filters.html" target="_blank"&gt;the results I reported in my previous post&lt;/a&gt;. Of course, these results cannot be compared because this quick experiment is a test on the training collection, not done with 3-fold CV and the FilteredClassifier. But if we want to run a CV experiment, how can we do it now that our setup has two filters instead of one?&lt;/p&gt;
&lt;p&gt;What we need now is to start with the original text collection in &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html" target="_blank"&gt;ARFF format&lt;/a&gt; (no STWV yet), and to use the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/MultiFilter.html" target="_blank"&gt;MultiFilter&lt;/a&gt; that WEKA provides for these situations. We start then with the original collection, and go to the "Classify" tab. If we try to choose any classic learner (&lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/trees/J48.html" target="_blank"&gt;J48 for the C4.5 decision tree learner&lt;/a&gt;, &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/functions/SMO.html" target="_blank"&gt;SMO for Support Vector Machines&lt;/a&gt;, etc.), it will be impossible because we have just one attribute (the text of the SMS messages) along with the class, but we can use the &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/meta/FilteredClassifier.html" target="_blank"&gt;weka.classifiers.meta.FilteredClassifier&lt;/a&gt;. After selecting it, we will see something similar to the next picture:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh3.googleusercontent.com/-4afnYiVvy2I/URiruOiStAI/AAAAAAAABmU/n8yKF72A1H4/s800/weka07.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 300px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="300" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;If we click on the name of the classifier at the "Classifier" area and we select &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/rules/PART.html" target="_blank"&gt;weka.classifiers.rules.PART&lt;/a&gt; as the classifier (with default options), we get the next set up in the FilteredClassifier editor window:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-4XZFdmv8zvs/URiruLNDKGI/AAAAAAAABmQ/cI6tfCaGkaY/s465/weka08.PNG" style="TEXT-ALIGN: center; WIDTH: 350px; DISPLAY: block; HEIGHT: 208px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="208" width="350"/&gt;&lt;/p&gt;
&lt;p&gt;Then we can choose the weka.filters.MultiFilter in the filter area, which starts with a dummy AllFilter. Time to set up our filter combining STWV and AttributeSelection. We click on the filter name area and we get a new filter edition window with an area to define the filters to be applied. If we click on it, we get a new window that allows to add, configure and delete filters. The selected filters will be applied in the order we add them, so we start deleting the AllFilter and adding a STWV filter with the default options, getting something similar to the next picture:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh3.googleusercontent.com/-iFaiyK_72F0/URiruwTjqAI/AAAAAAAABms/4qwN0tY_rEU/s260/weka09.PNG" style="TEXT-ALIGN: center; WIDTH: 260px; DISPLAY: block; HEIGHT: 194px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="194" width="260"/&gt;&lt;/p&gt;
&lt;p&gt;Filters are added by clicking on the "Choose" button to select them, and clicking on the "Add" button to add them to the list. We can now add the AttributeSelection filter with the Information Gain evaluator and the Ranker with threshold 0 search, by editing the filter when clicking on the "Edit" button with the AttributeSelection filter selected in the list. If you manually re-dimension the window, you can see a set up similar to this one:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-1vKcKDynrhE/URirvPPpFpI/AAAAAAAABmk/mtPvr2JHBO0/s629/weka10.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 146px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="146" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;The set up is nearly finished. We close this window by clicking on the "X" button, and click on the "OK" button at the MultiFilter and FilteredClassifier windows. In the "Classify" tab at the explorer, we select "Cross-validation" in the "Test options" area, entering 3 as the number of folds, and we select the class attribute as "spamclass". Having done this, we can just click on the "Start" button to get the next result:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh6.googleusercontent.com/-fpzIgIh2h04/URirvv9Db8I/AAAAAAAABmo/oHaVNTGT_RI/s800/weka11.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 300px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="300" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;So we get an accuracy of 83.5%, which is worse than the one we got without using feature selection (which was 86.5%). Oh oh, all this clever (?) set up to get a drop of 3 points in accuracy! :-(&lt;/p&gt;
&lt;p&gt;But what happens if, instead of using a relatively weak learner on text problems like PART, we turn to Support Vector Machines? WEKA includes the &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/functions/SMO.html" target="_blank"&gt;weka.classifiers.functions.SMO&lt;/a&gt; classifier, which implements &lt;a href="http://dl.acm.org/citation.cfm?id=299105" target="_blank"&gt;John Platt's sequential minimal optimization algorithm&lt;/a&gt; for training a support vector classifier. If we choose this classifier with default options, we get quite different results:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Using only the STWV filter, we get an accuracy of 90.5% with 18 spam messages classified as legitimate ("ham"), and 1 false positive.&lt;/li&gt;
&lt;li&gt;Using the MultiFilter with AttributeSelection in the same setup, we get an accuracy of 91% with 16 spam messages classified as ham, and 2 false positives.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;So we get an improvement of accuracy on a more accurate learner, which is nice. However, the difference is just 0.5% (1 message in our 200-instance collection), so it is moderate. Moreover, we get one more false positive, which is bad for this particular problem. In spam filtering, it is much worse to make a false positive (sending a legitimate message to the spam folder) than the opposite, because the user risks missing an important message. Check &lt;a href="http://dl.acm.org/citation.cfm?id=508911" target="_blank"&gt;my paper on cost sensitive evaluation of spam filtering at ACM SAC 2002&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;But all in all, I expect this post shows the merits of feature selection in text classification problems, and how to do it with my favourite library, WEKA. Thanks for reading, and please feel free to leave a comment if you think I can improve this article!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/3j3dDF2bAZA" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/6652486140098698861/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=6652486140098698861&amp;isPopup=true" title="4 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/6652486140098698861?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/6652486140098698861?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/3j3dDF2bAZA/text-mining-in-weka-revisited-selecting.html" title="Text Mining in WEKA Revisited: Selecting Attributes by Chaining Filters" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://lh5.googleusercontent.com/-aVqAh2gsXS0/URirr3HJB5I/AAAAAAAABlw/Chtd-kGNXGs/s72-c/weka01.PNG" height="72" width="72" /><thr:total>4</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/02/text-mining-in-weka-revisited-selecting.html</feedburner:origLink></entry><entry 
gd:etag="W/&quot;Ck8MQHo8eip7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-3503879997074045923</id><published>2013-01-29T13:21:00.001+01:00</published><updated>2013-05-02T09:48:01.472+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-02T09:48:01.472+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Opensource" /><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Text Mining in WEKA: Chaining Filters and Classifiers</title><content type="html">&lt;p&gt;One of the most interesting features of &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/" target="_blank"&gt;WEKA&lt;/a&gt; is its flexibility for text classification. Over the years, I have had the chance to make a lot of experiments on text collections with WEKA, most of them in &lt;a href="http://en.wikipedia.org/wiki/Supervised_learning" target="_blank"&gt;supervised tasks&lt;/a&gt; that are commonly mentioned as &lt;a href="http://en.wikipedia.org/wiki/Document_classification" target="_blank"&gt;Text Categorization&lt;/a&gt;, that is, classifying text segments (documents, paragraphs, collocations) into a set of predefined classes. Examples of Text Categorization tasks include assigning topics labels to news items, classifying email messages into folders, or, more close to my research, classifying messages as spam or not (&lt;a href="http://en.wikipedia.org/wiki/Bayesian_spam_filtering" target="_blank"&gt;Bayesian spam filters&lt;/a&gt;) and web pages as inappropriate or not (e.g. pornographic content vs. educational resources).&lt;/p&gt;
&lt;p&gt;WEKA support for Text Categorization is &lt;em&gt;impressive&lt;/em&gt;. A prominent feature is that this package supports breaking text utterances into indexing terms (word stems, collocations) and assigning them a weight in term vectors, a required step in nearly every text classification task. This tokenization and indexing process is achieved by using a super-flexible filter named &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;StringToWordVector&lt;/a&gt;. Let me show an example of how it works.&lt;/p&gt;
&lt;p&gt;I will start with a simple text collection, which is a small sample of the publicly available &lt;a href="http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/" target="_blank"&gt;SMS Spam Collection&lt;/a&gt;. Some colleagues and I built this collection for experimenting with Bayesian SMS spam filters, and it contains 4,827 legitimate messages and 747 mobile spam messages, for a total of 5,574 short messages collected from several sources. I will make use of a small subset in order to better show my points in this post. The subset is made with the first 200 messages, and it is shown below, formatted in the WEKA ARFF format:&lt;/p&gt;
&lt;blockquote style="MARGIN-RIGHT: 0px" dir="ltr"&gt;
&lt;p&gt;@relation sms_test&lt;/p&gt;
&lt;p&gt;@attribute spamclass {spam,ham}
&lt;br/&gt;
@attribute text String&lt;/p&gt;
&lt;p&gt;@data
&lt;br/&gt;
ham,'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
&lt;br/&gt;
ham,'Ok lar... Joking wif u oni...'
&lt;br/&gt;
spam,'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&amp;amp;C\'s apply 08452810075over18\'s'
&lt;br/&gt;
ham,'U dun say so early hor... U c already then say...'
&lt;br/&gt;
ham,'Nah I don\'t think he goes to usf, he lives around here though'
&lt;br/&gt;
spam,'FreeMsg Hey there darling it\'s been 3 week\'s now and no word back! I\'d like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv'
&lt;br/&gt;
...
&lt;br/&gt;
ham,'Hi its Kate how is your evening? I hope i can see you tomorrow for a bit but i have to bloody babyjontet! Txt back if u can. :) xxx'&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;In the first 200 messages of the collection, 33 of them are spam and 167 are legitimate ("ham"). This collection can be loaded in the &lt;a href="https://www.google.es/url?sa=t&amp;amp;rct=j&amp;amp;q=&amp;amp;esrc=s&amp;amp;source=web&amp;amp;cd=1&amp;amp;cad=rja&amp;amp;ved=0CDAQFjAA&amp;amp;url=http://www.cse.yorku.ca/course_archive/2008-09/W/4412/ExplorerGuide.pdf&amp;amp;ei=NLwHUY2FBMSYhQed7oG4Cg&amp;amp;usg=AFQjCNGMB6VSKlDT54vaURKZUzpE84JzSA&amp;amp;sig2=XqaJy2aFRWyNEb8skoVbcw&amp;amp;bvm=bv.41524429,d.ZG4" target="_blank"&gt;WEKA Explorer&lt;/a&gt;, showing something similar to the following window:&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="https://lh6.googleusercontent.com/-X1T58FONe78/UQey9KzvS_I/AAAAAAAABkM/nn6PVpXg9J4/s735/wekaexample01.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;The point is that messages are featured as string attributes, so you have to break them into words in order to allow learning algorithms to induce classifiers with rules like:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;if&lt;/strong&gt; ("urgent" &lt;strong&gt;in&lt;/strong&gt; message) &lt;strong&gt;then&lt;/strong&gt; class(message) == spam&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Here is where the StringToWordVector filter comes to help. You can just select it by clicking the "Choose" button in the "Filter" area, and browsing the folders to "weka &amp;gt; filters &amp;gt; unsupervised &amp;gt; attribute" one. Once selected, you should be able to see something like this:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh6.googleusercontent.com/-gzV8Vf_venI/UQey9O3vPFI/AAAAAAAABkM/q3KG3PpAF6s/s735/wekaexample02.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;If you click on the name of the filter, you will get a lot of options, which I leave for another post. For my goals in this one, you can just apply this filter with the default options to get an indexed collection of 200 messages and 1,382 indexing tokens (plus the class attribute), shown in the next picture:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-t09zkp9O55c/UQey9LSa0pI/AAAAAAAABkM/CwWsNKVkvI0/s735/wekaexample03.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;If you want to see colors showing the distribution of attributes (tokens) according to the class, you can just select the "class" attribute as the class for the collection in the bottom-left area of the WEKA Explorer. So, you can see that the attribute "Available" occurs just in one message, which happens to be a legitimate (ham) one:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh3.googleusercontent.com/-35XJu0ccyLs/UQey955xiLI/AAAAAAAABkM/QAikmabxlU0/s735/wekaexample04.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;Now, we can make our experiments in the Classify tab. We can just select cross-validation using 3 folds (1), point to the appropriate attribute to be used as a class (which is the "spamclass" one) (2), and select a rule learner like &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/rules/PART.html" target="_blank"&gt;PART&lt;/a&gt; in the classifier area (3). You can find that classifier at the "weka &amp;gt; classifiers &amp;gt; rules" folder when clicking on the "Choose" button at the "Classifier" area. This setup is shown in the next figure:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh4.googleusercontent.com/-7EPITpS_vNo/UQey-cGsD-I/AAAAAAAABkM/jOlj3LUM2OU/s735/wekaexample05.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 299px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="299" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;The selected evaluation method, &lt;a href="http://en.wikipedia.org/wiki/Cross-validation" target="_blank"&gt;cross-validation&lt;/a&gt;, instructs WEKA to divide the training collection into 3 sub-collections (folds), and perform three experiments. Each experiment is done by using two of the folds for training, and the remaining one for testing the learnt classifier. The sub-collections are sampled randomly, in such a way that each instance belongs to only one of them, and the class distribution (16.5% spam in our example) is kept inside each fold.&lt;/p&gt;
&lt;p&gt;So, if we click on the "Start" button, we will get the output of our experiment, featuring the classifier learnt over the full collection, and the values for the typical accuracy metrics averaged over the three experiments, along with the confusion matrix. The classifier learnt over the full collection is the following one:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;PART decision list
&lt;br/&gt;
------------------&lt;/p&gt;
&lt;p&gt;or &amp;lt;= 0 AND
&lt;br/&gt;
to &amp;lt;= 0 AND
&lt;br/&gt;
2 &amp;lt;= 0: ham (119.0/3.0)&lt;/p&gt;
&lt;p&gt;£1000 &amp;lt;= 0 AND
&lt;br/&gt;
FREE &amp;lt;= 0 AND
&lt;br/&gt;
call &amp;lt;= 0 AND
&lt;br/&gt;
Reply &amp;lt;= 0 AND
&lt;br/&gt;
i &amp;lt;= 0 AND
&lt;br/&gt;
all &amp;lt;= 0 AND
&lt;br/&gt;
final &amp;lt;= 0 AND
&lt;br/&gt;
50 &amp;lt;= 0 AND
&lt;br/&gt;
mobile &amp;lt;= 0 AND
&lt;br/&gt;
ur &amp;lt;= 0 AND
&lt;br/&gt;
text &amp;lt;= 0: ham (26.0/2.0)&lt;/p&gt;
&lt;p&gt;i &amp;lt;= 0 AND
&lt;br/&gt;
all &amp;lt;= 0: spam (30.0/3.0)&lt;/p&gt;
&lt;p&gt;: ham (25.0/1.0)&lt;/p&gt;
&lt;p&gt;Number of Rules : 4&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;This notation can be read as:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;if&lt;/strong&gt; (("or" &lt;strong&gt;not in&lt;/strong&gt; message) &lt;strong&gt;and&lt;/strong&gt; ("to" &lt;strong&gt;not in&lt;/strong&gt; message) &lt;strong&gt;and&lt;/strong&gt; ("2" &lt;strong&gt;not in&lt;/strong&gt; message)) &lt;strong&gt;then&lt;/strong&gt; class(message) == ham
&lt;br/&gt;
...
&lt;br/&gt;
&lt;strong&gt;otherwise&lt;/strong&gt; class(message) == ham&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;And the confusion matrix is the next one:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;=== Confusion Matrix ===&lt;/p&gt;
&lt;p&gt;a b &amp;lt;-- classified as
&lt;br/&gt;
17 16 | a = spam
&lt;br/&gt;
12 155 | b = ham&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Which means that the PART learner is able to get 17+155 correct classifications, and it makes 12+16 mistakes. It leads to an accuracy of 86%.&lt;/p&gt;
&lt;p style="TEXT-ALIGN: center"&gt;&lt;strong&gt;&lt;em&gt;But we have done it wrong!&lt;/em&gt;&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Do you remember the "Available" token, which occurs only in one of the messages? In which fold is it? When it is in a training fold, we are using it for training (making the learner try to generalize from a token that does not occur in the test collection). And when it is in the test collection, the learner should not even know about it! Moreover, what happens with attributes that are highly predictive for the full collection (according to their statistics when computing e.g. the &lt;a href="http://en.wikipedia.org/wiki/Information_gain_in_decision_trees" target="_blank"&gt;Information Gain&lt;/a&gt; metric)? They may have worse (or better) statistics when a subset of their occurrences is not seen, as they can be in the test collection!&lt;/p&gt;
&lt;p&gt;The right way to perform a correct text classification experiment with cross validation in WEKA is feeding the indexing process into the classifier itself, that is, chaining the indexing filter (StringToWordVector) and the learner, the way that we index and train for every sub-set in the cross-validation run. Thus, you have to use the &lt;a href="http://weka.sourceforge.net/doc/weka/classifiers/meta/FilteredClassifier.html" target="_blank"&gt;FilteredClassifier&lt;/a&gt; class provided by WEKA.&lt;/p&gt;
&lt;p&gt;In fact, this is not that difficult. Let us go back to the original test collection, which features two attributes: the message (as a string) and the class. Then you can go to the Classify tab, and choose the FilteredClassifier learner, which is available at the "weka &amp;gt; classifiers &amp;gt; meta", and shown in the next picture:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh6.googleusercontent.com/-5IfFFabokhY/UQey-YCXzSI/AAAAAAAABkM/HtGGQzUMED4/s738/wekaexample06.PNG" style="TEXT-ALIGN: center; WIDTH: 400px; DISPLAY: block; HEIGHT: 298px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="298" width="400"/&gt;&lt;/p&gt;
&lt;p&gt;Then you must choose the filter and the classifier you are going to apply to the collection, by clicking on the classifier name at the "Classifier" area. I choose StringToWordVector and PART with their default options:&lt;/p&gt;
&lt;p&gt;&lt;img src="https://lh5.googleusercontent.com/-QtG8vqffTiA/UQey-5KnaXI/AAAAAAAABkM/4XSXu9Q_GJs/s465/wekaexample07.PNG" style="TEXT-ALIGN: center; WIDTH: 300px; DISPLAY: block; HEIGHT: 178px; MARGIN-LEFT: auto; MARGIN-RIGHT: auto" height="178" width="300"/&gt;&lt;/p&gt;
&lt;p&gt;If we now run our experiment with 3-fold cross-validation and the filtered classifier we have just configured, we get different results:&lt;/p&gt;
&lt;p&gt;=== Confusion Matrix ===&lt;/p&gt;
&lt;p&gt;a b &amp;lt;-- classified as
&lt;br/&gt;
13 20 | a = spam
&lt;br/&gt;
7 160 | b = ham&lt;/p&gt;
&lt;p&gt;For an accuracy of 86.5%, a bit better than the one obtained with the wrong setup. However, we catch 4 fewer spam messages, and the True Positive ratio goes down from 0.515 to 0.394. This setup is more realistic and it better mimics what will happen in the real world, in which we will find highly relevant but unseen events, and our statistics may change dramatically over time.&lt;/p&gt;
&lt;p&gt;So now we can run our experiment safely, as no unseen events will be used in the classification. Moreover, if we apply any kind of Information Theory based filter like e.g. ranking the attributes according to their Information Gain value, the statistics will be correct, as they will be based on the training set for each cross-validation run.&lt;/p&gt;
&lt;p&gt;Thanks for reading, and please feel free to leave a comment if you think I can improve this article!&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/V02FnAq627I" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/3503879997074045923/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=3503879997074045923&amp;isPopup=true" title="2 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3503879997074045923?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3503879997074045923?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/V02FnAq627I/text-mining-in-weka-chaining-filters.html" title="Text Mining in WEKA: Chaining Filters and Classifiers" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://lh6.googleusercontent.com/-X1T58FONe78/UQey9KzvS_I/AAAAAAAABkM/nn6PVpXg9J4/s72-c/wekaexample01.PNG" height="72" width="72" /><thr:total>2</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/01/text-mining-in-weka-chaining-filters.html</feedburner:origLink></entry><entry gd:etag="W/&quot;Ck4BQHk4eSp7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-2995929005382442432</id><published>2013-01-16T19:13:00.001+01:00</published><updated>2013-05-02T09:49:11.731+02:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2013-05-02T09:49:11.731+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Opensource" /><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="WEKA" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Resources" /><category scheme="http://www.blogger.com/atom/ns#" term="Personal" /><category scheme="http://www.blogger.com/atom/ns#" term="Data Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>A note on WEKA limitations and big data</title><content type="html">&lt;p style="TEXT-ALIGN: center"&gt;&lt;img src="http://users.dsic.upv.es/~cferri/weka/weka.jpg" style="WIDTH: 283px; DISPLAY: inline; HEIGHT: 156px" height="30" width="28"/&gt;&lt;/p&gt;
&lt;p&gt;I love &lt;a href="http://en.wikipedia.org/wiki/Weka_(machine_learning)" target="_blank"&gt;WEKA&lt;/a&gt; since it was first introduced to me by my friend &lt;a href="http://orion.esp.uem.es/gsi/index.php/Enrique-Puertas.html" target="_blank"&gt;Enrique Puertas&lt;/a&gt; back in 1999, when he used it for programming a Usenet News client with spam filtering capabilities based on Machine Learning (what we usually call a &lt;a href="http://en.wikipedia.org/wiki/Bayesian_spam_filtering" target="_blank"&gt;bayesian spam filter&lt;/a&gt; now). I got impressed by its flexibility and functionality, and the ease of experimenting with WEKA and using it in my Java programs. I quickly got familiar with it and I used it for making &lt;a href="https://www.aclweb.org/anthology-new/W/W00/W00-0719.pdf"&gt;my very first experiments on spam filtering&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Over the years, WEKA has been updated, getting more algorithms and making some tasks easier for text miners. For instance, the &lt;a href="http://weka.sourceforge.net/doc.dev/weka/filters/unsupervised/attribute/StringToWordVector.html" target="_blank"&gt;StringToWordVector filter&lt;/a&gt; allows you to get a &lt;a href="http://en.wikipedia.org/wiki/Vector_space_model" target="_blank"&gt;Vector Space Model&lt;/a&gt; (or bag of words) representation of your problem texts, a task that I had to do manually (with my own programs or scripts) at the beginning. Another example: the &lt;a href="http://www.cs.waikato.ac.nz/ml/weka/arff.html"&gt;Sparse ARFF&lt;/a&gt; format allows you to get a compact representation of your word vectors, instead of getting thousands of attribute values per instance, most of them being "0" or "no". Moreover, WEKA has attracted so much attention that other platforms have integrated it (e.g. &lt;a href="http://gate.ac.uk/" target="_blank"&gt;GATE&lt;/a&gt;) or provided covering environments that augment its functionality (e.g. &lt;a href="http://www.rapidminer.com/" target="_blank"&gt;RapidMiner&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;However, our needs as researchers have evolved as well. One of the most important issues now is data size. While working with average computers in my early experiments was enough, given the size of standard collections (&lt;a href="http://qwone.com/~jason/20Newsgroups/" target="_blank"&gt;20 Newsgroups&lt;/a&gt;, &lt;a href="http://www.daviddlewis.com/resources/testcollections/reuters21578/" target="_blank"&gt;Reuters-21578&lt;/a&gt;, &lt;a href="http://csmining.org/index.php/ling-spam-datasets.html" target="_blank"&gt;LingSpam&lt;/a&gt;, etc. - all of the order of tens of thousands of instances), now that is nearly impossible. Most of my experiments involve from hundreds of thousands to millions of instances. In those cases, WEKA can spend days for a single learn-and-test cycle, or it can simply run out of memory; and not with an average machine, even with a really big server!&lt;/p&gt;
&lt;p&gt;So now, what?&lt;/p&gt;
&lt;p&gt;Before dealing with this question, I must say that I have been a heavy user of the WEKA &lt;em&gt;command line&lt;/em&gt; and the &lt;em&gt;&lt;a href="http://www.cse.yorku.ca/course_archive/2008-09/W/4412/ExplorerGuide.pdf" target="_blank"&gt;Explorer GUI&lt;/a&gt;&lt;/em&gt; . However, I have never considered or used the WEKA &lt;em&gt;&lt;a href="http://www.cse.yorku.ca/course_archive/2006-07/W/4412/doc/weka/ExperimenterTutorial-3.5.5.pdf" target="_blank"&gt;Experimenter GUI&lt;/a&gt;&lt;/em&gt; . I know from friends and diagonal readings that the Experimenter allows to distribute experiments over a number of machines. However, if I am going to distribute my experiments, why not using newer technologies (less ad-hoc, WEKA-dependent), just 100% compatible/standard/implemented with-in cloud providers? Why not getting advantage of elastic cloud capabilities (grow and pay as you need)?&lt;/p&gt;
&lt;p&gt;Having said this, and keeping up with the latest news and trends in data and text mining, I see two options:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Going for &lt;a href="http://www.r-project.org/" target="_blank"&gt;R&lt;/a&gt;&lt;/strong&gt; . This language/platform has grown incredibly in recent years, and it has quickly become a standard language for data mining, present in many curricula, and very often considered an absolute requirement in data science job offers. There are nice books about it as well, like "&lt;a href="http://shop.oreilly.com/product/0636920022008.do" target="_blank"&gt;R in a Nutshell&lt;/a&gt;", and other strategical books recommend/use it (like "&lt;a href="http://www-stat.stanford.edu/~tibs/ElemStatLearn/" target="_blank"&gt;The Elements of Statistical Learning&lt;/a&gt;"). R supports map reduce algorithms over &lt;a href="http://hadoop.apache.org/" target="_blank"&gt;Hadoop&lt;/a&gt; for distributed experiments with tons of data. And R interfaces with Java as well.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Choosing &lt;a href="http://mahout.apache.org/" target="_blank"&gt;Mahout&lt;/a&gt;&lt;/strong&gt; (plus &lt;strong&gt;&lt;a href="http://lucene.apache.org/solr/" target="_blank"&gt;Lucene/SOLR&lt;/a&gt;&lt;/strong&gt; ). This platform is Java-based, tightly integrated with Hadoop, and it makes use of Lucene for text representation tasks -- Lucene could be considered a standard for deploying search engines now. There are good books on Mahout and Lucene/SOLR as well ("&lt;a href="http://manning.com/owen/" target="_blank"&gt;Mahout in Action&lt;/a&gt;", "&lt;a href="http://www.manning.com/hatcher3/" target="_blank"&gt;Lucene in Action&lt;/a&gt;", "&lt;a href="http://www.packtpub.com/solr-3-1-enterprise-search-server-cookbook/book" target="_blank"&gt;Apache SOLR Cookbook&lt;/a&gt;").&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;But still I do not feel any option is better than the other one. Both are challenging and appealing, and I have not taken a decision yet. And I am willing to hear your opinion, of course.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/NkQgRCzzjCI" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/2995929005382442432/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=2995929005382442432&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/2995929005382442432?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/2995929005382442432?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/NkQgRCzzjCI/a-note-on-weka-limitations-and-big-data.html" title="A note on WEKA limitations and big data" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/01/a-note-on-weka-limitations-and-big-data.html</feedburner:origLink></entry><entry gd:etag="W/&quot;C0EBR385cSp7ImA9WhNUGEk.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-6600113380803451200</id><published>2013-01-10T19:14:00.000+01:00</published><updated>2013-01-10T19:14:16.129+01:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-10T19:14:16.129+01:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Social Networks" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Opinion Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Resources" /><category scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>A list of datasets for opinion mining in Twitter</title><content type="html">&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;
&lt;div style="text-align: left;"&gt;
In a recent thread at the &lt;a href="http://tech.groups.yahoo.com/group/SentimentAI/" target="_blank"&gt;SentimentAI group (list)&lt;/a&gt;, a number of links to datasets for training / testing opinion mining / sentiment classifiers over Twitter have been contributed. I list them here in case somebody finds this information useful:&lt;/div&gt;
&lt;div style="text-align: left;"&gt;
&lt;/div&gt;
&lt;ul style="text-align: left;"&gt;
&lt;li&gt;&lt;a href="http://www.tweenator.com/index.php?page_id=8" target="_blank"&gt;Three datasets&lt;/a&gt; provided by&amp;nbsp;Hassan Saif, including an annotated subset of the&amp;nbsp;&lt;b&gt;Stanford Twitter Sentiment Corpus&lt;/b&gt;, and two for the specific topics of the&amp;nbsp;&lt;b&gt;Health Care Reform&lt;/b&gt; and the&amp;nbsp;&lt;b&gt;Obama-McCain Debate&lt;/b&gt;.&lt;/li&gt;
&lt;li&gt;The &lt;a href="http://help.sentiment140.com/for-students" target="_blank"&gt;&lt;b&gt;Stanford Twitter Corpus&lt;/b&gt;&lt;/a&gt; itself, provided by Alec Go and others at &lt;a href="http://www.sentiment140.com/" target="_blank"&gt;Sentiment140&lt;/a&gt;. You can download the &lt;a href="http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip" target="_blank"&gt;ST Corpus directly&lt;/a&gt;&amp;nbsp;(70Mb).&lt;/li&gt;
&lt;li&gt;The &lt;b&gt;&lt;a href="http://www.sananalytics.com/lab/twitter-sentiment/" target="_blank"&gt;Sanders Analytics&amp;nbsp;Twitter Sentiment Corpus&lt;/a&gt;&lt;/b&gt;, provided by&amp;nbsp;Niek Sanders.&lt;/li&gt;
&lt;li&gt;The &lt;b&gt;&lt;a href="http://nibir.me/projects/mejaj/datasets.html" target="_blank"&gt;mejaj datasets&lt;/a&gt;&lt;/b&gt;, provided by&amp;nbsp;&lt;a href="http://nibir.me/" target="_blank"&gt;Nibir Bora&lt;/a&gt; and others.&lt;/li&gt;
&lt;li&gt;The&amp;nbsp;&lt;b&gt;&lt;a href="http://www.cs.york.ac.uk/semeval-2013/task2/" target="_blank"&gt;SemEval-2013: Sentiment Analysis in Twitter&lt;/a&gt;&lt;/b&gt; evaluation campaign (or competition) dataset. &lt;i&gt;Note the competition is still active&lt;/i&gt;, you can join it! Check the dates at the &lt;a href="http://www.cs.york.ac.uk/semeval-2013/index.php?id=call-for-participation" target="_blank"&gt;SemEval-2013 website&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;The&amp;nbsp;&lt;a href="http://www.limosine-project.eu/events/replab2012#Profiling_task" style="font-weight: bold;" target="_blank"&gt;RepLab 2012 Profiling task dataset&lt;/a&gt;. The profiling task is a bit different from the standard sentiment classification task. For instance, factual tweets can imply bad reputation ("Lehman Brothers goes bankrupt") and negative sentiment tweets can imply good reputation ("R.I.P. Michael Jackson. We'll miss you").&lt;/li&gt;
&lt;/ul&gt;
You can find the &lt;a href="http://tech.groups.yahoo.com/group/SentimentAI/message/589" target="_blank"&gt;SentimentAI thread on Twitter datasets here&lt;/a&gt;.&lt;div style="text-align: left;"&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/AZ7ew6cEmOk" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/6600113380803451200/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=6600113380803451200&amp;isPopup=true" title="1 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/6600113380803451200?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/6600113380803451200?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/AZ7ew6cEmOk/a-list-of-datasets-for-opinion-mining.html" title="A list of datasets for opinion mining in Twitter" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>1</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/01/a-list-of-datasets-for-opinion-mining.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkAMQH05cCp7ImA9WhBUFEU.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-8962168005477565815</id><published>2013-01-08T16:55:00.000+01:00</published><updated>2013-05-02T10:53:01.328+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-02T10:53:01.328+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="Redes Sociales" /><category scheme="http://www.blogger.com/atom/ns#" term="Imagen" /><category scheme="http://www.blogger.com/atom/ns#" term="Seguridad" /><title>Spam en 
LinkedIn al estilo "Robin Sage"</title><content type="html">&lt;div style="TEXT-ALIGN: left" dir="ltr"&gt;&lt;a href="http://es.linkedin.com/in/jmgomezh/"&gt;Yo mismo&lt;/a&gt;, y algunos de mis contactos en &lt;a href="http://www.linkedin.com/"&gt;LinkedIn&lt;/a&gt;, han recibido recientemente una solicitud de contacto por parte de una tal "Elena Domínguez" (&lt;a href="http://www.linkedin.com/pub/elena-domínguez/62/196/45"&gt;enlace&lt;/a&gt;*). Se trata de un perfil un poco extraño, por cuanto está bastante poco detallado (experiencia profesional, formación, etc.), pero pertenece a varios grupos de ingenieros (se auto-califica como ingeniera), pero tiene cientos de contactos sumamente heterogéneos de temas TIC. Ésta es la imagen del perfil:
&lt;br/&gt;
&lt;br/&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;a style="MARGIN-LEFT: 1em; MARGIN-RIGHT: 1em" href="http://2.bp.blogspot.com/-qjm7fL1Xjik/UOw5gHLfkcI/AAAAAAAABfk/l74SI_V1uiE/s1600/Dibujo.bmp"&gt;&lt;img src="http://2.bp.blogspot.com/-qjm7fL1Xjik/UOw5gHLfkcI/AAAAAAAABfk/l74SI_V1uiE/s1600/Dibujo.bmp" height="266" border="0" width="320"/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;br/&gt;
Si se acepta a esta "persona", en pocos días (u horas), se recibirá un correo invitando a unirse al grupo de LinkedIn "&lt;strong&gt;International Master's in Theoretical &amp;amp; Practical Application of Finite Element Method&lt;/strong&gt;" (&lt;a href="http://www.linkedin.com/groups?home=&amp;amp;gid=3808981&amp;amp;trk=anet_ug_hm&amp;amp;goback=.con.npv_221408693_*1_*1_name_DGj4_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1_*1"&gt;enlace&lt;/a&gt;*). Aunque el master promocionado mediante este grupo de LinkedIn parece razonablemente legítimo, tanto el perfil como el grupo parecen ser spam.
&lt;br/&gt;
&lt;br/&gt;
Una cosa que llama especialmente la atención es que &lt;strong&gt;su foto de perfil&lt;/strong&gt; es bastante rara, como "demasiado aséptica", casi artificial. Una evidencia adicional de spam la obtenemos cuando realizamos una búsqueda por imágenes en Google, usando esta imagen como consulta. Primero obtenemos la URL de la imagen:
&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;a style="MARGIN-LEFT: 1em; MARGIN-RIGHT: 1em" href="http://1.bp.blogspot.com/-KOOz6DMzjJ8/UOw80IP2MQI/AAAAAAAABf0/3ZH876GQGzM/s1600/Dibujo2.bmp"&gt;&lt;img src="http://1.bp.blogspot.com/-KOOz6DMzjJ8/UOw80IP2MQI/AAAAAAAABf0/3ZH876GQGzM/s1600/Dibujo2.bmp" height="225" border="0" width="320"/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;br/&gt;&lt;/div&gt;
&lt;div style="TEXT-ALIGN: left; CLEAR: both" class="separator"&gt;A continuación, buscamos la foto en Google Images, pulsando sobre el botón de la cámara e introduciendo la URL que hemos obtenido antes:&lt;/div&gt;
&lt;div style="TEXT-ALIGN: left; CLEAR: both" class="separator"&gt;&lt;br/&gt;&lt;/div&gt;

&lt;br/&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;a style="MARGIN-LEFT: 1em; MARGIN-RIGHT: 1em" href="http://4.bp.blogspot.com/-PBWpSku6d1w/UOw80N-kmmI/AAAAAAAABf4/JMqkYbXOUpk/s1600/Dibujo3.bmp"&gt;&lt;img src="http://4.bp.blogspot.com/-PBWpSku6d1w/UOw80N-kmmI/AAAAAAAABf4/JMqkYbXOUpk/s1600/Dibujo3.bmp" height="141" border="0" width="320"/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;br/&gt;&lt;/div&gt;
&lt;div style="TEXT-ALIGN: left; CLEAR: both" class="separator"&gt;Y los resultados son los siguientes:&lt;/div&gt;
&lt;div style="TEXT-ALIGN: left; CLEAR: both" class="separator"&gt;&lt;br/&gt;&lt;/div&gt;

&lt;br/&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;a style="MARGIN-LEFT: 1em; MARGIN-RIGHT: 1em" href="http://1.bp.blogspot.com/-gu3D7uEhuTk/UOw80TB9cBI/AAAAAAAABf8/rm1m3Hq9p8Q/s1600/Dibujo4.bmp"&gt;&lt;img src="http://1.bp.blogspot.com/-gu3D7uEhuTk/UOw80TB9cBI/AAAAAAAABf8/rm1m3Hq9p8Q/s1600/Dibujo4.bmp" height="320" border="0" width="297"/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="TEXT-ALIGN: center; CLEAR: both" class="separator"&gt;&lt;br/&gt;&lt;/div&gt;
A partir de estos resultados, se puede deducir con bastante certeza que la foto es de "stock", es decir, de catálogo, y que aparece en varios catálogos como imagen de archivo de mujer de negocios con expresión neutra, realizada en estudio. Usar una foto como esta para nuestro perfil en una red como LinkedIn es posible, pero bastante poco probable.
&lt;br/&gt;
&lt;br/&gt;
Por tanto, considero esta fotografía como una evidencia fuerte que, unida tanto al comportamiento del "usuario" (enviando el correo de invitación a un grupo tan focalizado en un producto educativo) como al número tan alto de contactos para un perfil tan poco detallado, me lleva a pensar que se trata de un perfil de spam, pero real en el sentido de que no es un experimento de ingeniería social como el realizado por &lt;a href="http://www.thomasryan.net/"&gt;&lt;strong&gt;Thomas Ryan&lt;/strong&gt;&lt;/a&gt; con el perfil " &lt;a href="http://www.networkworld.com/news/2010/070810-the-robin-sage-experiment-fake.html"&gt;&lt;strong&gt;Robin Sage&lt;/strong&gt;&lt;/a&gt; ".
&lt;br/&gt;
&lt;br/&gt;
Como conclusión, pienso que hasta LinkedIn, que es una de las redes menos explotadas para el spam, se irá viendo invadida crecientemente por este fenómeno, cada vez con mayor nivel de personalización y sofisticación.
&lt;br/&gt;
&lt;br/&gt;
(*) No asocio el enlace al nombre del perfil o del grupo para no generar spam web.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/IzNZctRBCSM" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/8962168005477565815/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=8962168005477565815&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/8962168005477565815?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/8962168005477565815?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/IzNZctRBCSM/spam-en-linkedin-al-estilo-robin-sage.html" title="Spam en LinkedIn al estilo &amp;quot;Robin Sage&amp;quot;" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-qjm7fL1Xjik/UOw5gHLfkcI/AAAAAAAABfk/l74SI_V1uiE/s72-c/Dibujo.bmp" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2013/01/spam-en-linkedin-al-estilo-robin-sage.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkADQn4zeip7ImA9WhNXFk4.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-3083779076099087003</id><published>2012-12-04T15:16:00.004+01:00</published><updated>2012-12-04T15:19:33.082+01:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2012-12-04T15:19:33.082+01:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Security" /><category scheme="http://www.blogger.com/atom/ns#" term="Events" /><category scheme="http://www.blogger.com/atom/ns#" term="Parental Control" /><category scheme="http://www.blogger.com/atom/ns#" term="Children Protection" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Filtering" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Report on ERA Course: Fighting Child Pornography on the Internet</title><content type="html">&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-jZD6Qn-DhZQ/UL3zdc73yeI/AAAAAAAABbM/W0ZHb3awEXw/s1600/424898_4622121118371_313664852_n.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-jZD6Qn-DhZQ/UL3zdc73yeI/AAAAAAAABbM/W0ZHb3awEXw/s1600/424898_4622121118371_313664852_n.jpg" height="320" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: center;"&gt;
&lt;br /&gt;&lt;/div&gt;
I have had the pleasure of attending, as a student, the &lt;a href="http://www.era.int/"&gt;European Academy of Law&lt;/a&gt; course on "&lt;a href="https://www.era.int/cgi-bin/cms?_SID=6520b7451e95482bd8da749563e3306207b9af0900219030656915&amp;amp;_sprache=en&amp;amp;_bereich=artikel&amp;amp;_aktion=detail&amp;amp;idartikel=123272" target="_blank"&gt;Fighting Child Pornography on the Internet&lt;/a&gt;", in Madrid, 29-30  November 2012. I was supported by the Spanish child protection NGO &lt;a href="http://www.protegeles.com/" target="_blank"&gt;Protégeles&lt;/a&gt;, as I work with  them whenever I can in order to push their mission.&lt;br /&gt;
&lt;br /&gt;
It was a nice course, with a good coverage of topics, including legal  aspects, and technical issues both from the view of prosecuting sex offenders  and from Web filtering. Speakers were excellent and provided a lot of useful  hints and links. I also crafted a backlog hashtag for the event in Twitter (&lt;a href="https://twitter.com/search?q=%23ERAChildPornCourse&amp;amp;src=hash" target="_blank"&gt;#ERAChildPornCourse&lt;/a&gt;), but I am afraid that neither attendees  nor speakers are very happy with Twitter (with scarce exceptions). I collected  some comments during the event, organized in terms of the topic:&lt;br /&gt;
&lt;br /&gt;
&lt;strong&gt;Legal issues&lt;/strong&gt;&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;Media types that do not involve real children are child porn?  &lt;/li&gt;
&lt;li&gt;Internet and digital cameras have led to an explosion of child porn, now a  home industry  &lt;/li&gt;
&lt;li&gt;There is a thousand years history on child porn (e.g. paintings) but cameras  imply children are really abused to get it recorded  &lt;/li&gt;
&lt;li&gt;What does child porn possession mean? What about cloud drives? And streaming?   &lt;/li&gt;
&lt;li&gt;Internet is world-wide, so who has the jurisdiction? Should anybody have it?   &lt;/li&gt;
&lt;li&gt;Eurojust helps coordination of child porn prosecution, examples of  operations: "lost boy", "nanny", "dreamboard"  &lt;/li&gt;
&lt;li&gt;Lanzarote Convention says accessing a child porn site, if knowing it hosts  that stuff, is illegal  &lt;/li&gt;
&lt;li&gt;Providing lists of links of web sites hosting child porn is illegal under  Lanzarote Convention&lt;/li&gt;
&lt;/ul&gt;
&lt;strong&gt;Protection, prosecution, technical issues&lt;/strong&gt;&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;For preparing cases against child porn, prosecutors check nature of  material, offender involvement and number of images  &lt;/li&gt;
&lt;li&gt;10% of all photographs ever taken were taken during the last year. Note:  all kinds of pics  &lt;/li&gt;
&lt;li&gt;Groomers and child sex offenders play "the jailbait game" in video chat  sites  &lt;/li&gt;
&lt;li&gt;Youngsters are extremely vulnerable to grooming: they accept nearly all  friendship requests, have 3-4k+ contacts  &lt;/li&gt;
&lt;li&gt;Hebephilia is the sexual preference for individuals in early years of  puberty (generally 11-14)  &lt;/li&gt;
&lt;li&gt;LEAs make use of a plethora of image analysis tools to process suspect pics;  Microsoft PhotoDNA just one in the box  &lt;/li&gt;
&lt;li&gt;About 20% of child porn stuff is delivered through commercial platforms  &lt;/li&gt;
&lt;li&gt;Project HAVEN aims at stopping child abuse by EU citizens in foreign countries  (Asia, South America...)  &lt;/li&gt;
&lt;li&gt;Law Enforcement Agencies cooperate and share a Child Abuse International  database  &lt;/li&gt;
&lt;li&gt;Law Enforcement Agencies (e.g. Europol) are getting more and more focused on  victim identification  &lt;/li&gt;
&lt;li&gt;INHOPE has no authority to release block lists of child porn sites  &lt;/li&gt;
&lt;/ul&gt;
An additional fact is that after hearing Interpol and Europol, one gets proud  of having such great professionals working against child porn.&lt;br /&gt;
&lt;br /&gt;
All in all, it has been a great course and I am very happy to have been able to  attend it.&lt;/div&gt;
&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/MCC94-6BE5o" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/3083779076099087003/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=3083779076099087003&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3083779076099087003?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3083779076099087003?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/MCC94-6BE5o/i-have-had-pleasure-of-attending-as.html" title="Report on ERA Course: Fighting Child Pornography on the Internet" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-jZD6Qn-DhZQ/UL3zdc73yeI/AAAAAAAABbM/W0ZHb3awEXw/s72-c/424898_4622121118371_313664852_n.jpg" height="72" width="72" /><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2012/12/i-have-had-pleasure-of-attending-as.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkECQX86eip7ImA9WhVUEkg.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-7050047208549386874</id><published>2012-05-17T12:31:00.000+02:00</published><updated>2012-05-17T12:31:00.112+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-17T12:31:00.112+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Tecnología" 
/><category scheme="http://www.blogger.com/atom/ns#" term="Evaluacion" /><category scheme="http://www.blogger.com/atom/ns#" term="Captcha" /><category scheme="http://www.blogger.com/atom/ns#" term="Papers" /><category scheme="http://www.blogger.com/atom/ns#" term="Imagen" /><category scheme="http://www.blogger.com/atom/ns#" term="Personal" /><category scheme="http://www.blogger.com/atom/ns#" term="Internet" /><category scheme="http://www.blogger.com/atom/ns#" term="Seguridad" /><title>Artículo en Novática: comprometiendo la seguridad de reCAPTCHA</title><content type="html">&lt;p&gt;En el número 215 de &lt;a href="http://www.ati.es/novatica/" target="_blank"&gt;Novática&lt;/a&gt; hemos publicado un artículo que versa sobre la utilización de diversas técnicas de normalización de imagen y el OCR Tesseract de Google para realizar ataques de reconocimiento de texto sobre dos versiones de reCAPTCHA. La referencia del artículo es:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Noemí Carranza, Ricardo Palma Durán, Gonzalo Álvarez Marañón, &lt;em&gt;José María Gómez Hidalgo&lt;/em&gt;, 2012. &lt;strong&gt;&lt;a href="http://www.ati.es/novatica/2012/215/nv215sum.html#art43" target="_blank"&gt;Análisis de la seguridad del sistema reCAPTCHA&lt;/a&gt;&lt;/strong&gt;. &lt;a href="http://www.ati.es/novatica/" target="_blank"&gt;Revista Novática&lt;/a&gt; 215, enero-febrero 2012, pág. 43-48.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;El resumen del artículo es el siguiente:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;En los últimos tiempos se han popularizado extraordinariamente los sistemas CAPTCHA, que protegen servicios Web planteando al usuario una prueba destinada a verificar que se trata de un ser humano y no de un robot, o sistema automático para el envío de correo basura o difusión de malware. Estos sistemas están siempre expuestos a que spammers y hackers sean capaces de comprometer su seguridad, y abusar de los recursos subyacentes (cuentas de correo, blogs, etc.) para realizar sus actividades ilícitas. Por ello, es necesario comprobar periódicamente su seguridad usando herramientas como sistemas de reconocimiento óptico de caracteres (OCR), sistemas de análisis de imagen, y otras. En este artículo realizamos un análisis de la seguridad del sistema reCAPTCHA, que probablemente es el más usado en Internet actualmente. Para ello, utilizamos diversas técnicas de análisis de imagen orientadas
&lt;br/&gt;
a corregir las deformaciones y distorsiones realizadas por el sistema en las imágenes que muestra al usuario, así como el eficaz sistema de OCR Tesseract. Se han analizado dos versiones del sistema reCAPTCHA y se ha comprobado que la seguridad del sistema probablemente ha aumentado en la segunda versión, más reciente, aunque es posible comprometer la seguridad del sistema si se cuenta con recursos suficientes en forma de una botnet de tamaño medio (unos 10.000 ordenadores).&lt;/p&gt;
&lt;/blockquote&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/gHaPhyNGv_M" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/7050047208549386874/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=7050047208549386874&amp;isPopup=true" title="2 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7050047208549386874?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7050047208549386874?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/gHaPhyNGv_M/articulo-en-novatica-comprometiendo-la.html" title="Artículo en Novática: comprometiendo la seguridad de reCAPTCHA" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>2</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2012/05/articulo-en-novatica-comprometiendo-la.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D08AQXk7eyp7ImA9WhVVE0s.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-39634937118164987</id><published>2012-05-07T06:44:00.000+02:00</published><updated>2012-05-07T06:44:00.703+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-05-07T06:44:00.703+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Imaging" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Recommender Systems" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>In-progress competitions in Data and Text Mining</title><content type="html">&lt;p&gt;The following competitions in Data and Text Mining are open to participation:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="http://www.multimediaeval.org/" target="_blank"&gt;MediaEval 2012 - Multimedia Benchmark Evaluation&lt;/a&gt; - Several tasks including: Placing Task, Social Event Detection Task, Spoken Web Search Task, Tagging Task, Affect Task: Violent Scenes Detection, Visual Privacy Task.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.kaggle.com/c/msdchallenge" target="_blank"&gt;Million Song Dataset Challenge&lt;/a&gt;: a large-scale, open evaluation of personalized music recommendation algorithms.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://clef2012.org/" target="_blank"&gt;CLEF 2012&lt;/a&gt;: Several tasks related to Information Access Evaluation in Multilinguality, Multimodality, and Visual Analytics, including:&lt;/li&gt;
&lt;li style="LIST-STYLE-TYPE: none"&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="http://www.promise-noe.eu/chic-2012/home" target="_blank"&gt;CHiC&lt;/a&gt; : Cultural Heritage in CLEF, a pilot evaluation lab that aims at moving towards a systematic and large-scale evaluation of cultural heritage digital libraries and information access systems.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://ifs.tuwien.ac.at/~clef-ip/" target="_blank"&gt;CLEF-IP&lt;/a&gt;: IR in the patents domain, which provides a large collection of XML documents representing patents and patent images.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.imageclef.org/" target="_blank"&gt;ImageCLEF&lt;/a&gt;: Cross Language Image Retrieval, which evaluates the cross-language annotation and retrieval of images by focusing on the combination of textual and visual evidence.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://inex.mmci.uni-saarland.de/" target="_blank"&gt;INEX&lt;/a&gt; : INitiative for the Evaluation of XML Retrieval&lt;/li&gt;
&lt;li&gt;&lt;a href="http://pan.webis.de/" target="_blank"&gt;PAN&lt;/a&gt;: Uncovering Plagiarism, Authorship, and Social Software Misuse. This includes the sexual predator identification task.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://celct.fbk.eu/QA4MRE/" target="_blank"&gt;QA4MRE&lt;/a&gt;: Question Answering for Machine Reading Evaluation, its goal is to evaluate Machine Reading abilities through Question Answering and Reading Comprehension Tests.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.limosine-project.eu/events/replab2012" target="_blank"&gt;RepLab&lt;/a&gt;: An Evaluation Campaign for Online Reputation Management Systems.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="http://games.ws.dei.polimi.it/competitions/scr/" target="_blank"&gt;Simulated car racing championship&lt;/a&gt;: its goal is to design a controller for a racing car that will compete on a set of unknown tracks first alone (against the clock) and then against other drivers.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://pro.europeana.eu/web/guest/hack4europe-2012" target="_blank"&gt;Europeana Hack4Europe! 2012&lt;/a&gt;: inviting developers and designers to try out their ideas for creative re-use of the Europeana content and build applications showcasing the social and business value of open cultural data.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://explochallenge.inria.fr/" target="_blank"&gt;Yahoo! News&lt;/a&gt;: ICML exploration and exploitation challenge on recommending Yahoo! news item using click logs.&lt;/li&gt;
&lt;/ul&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/A7a3Rp96458" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/39634937118164987/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=39634937118164987&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/39634937118164987?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/39634937118164987?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/A7a3Rp96458/in-progress-competitions-in-data-and.html" title="In-progress competitions in Data and Text Mining" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2012/05/in-progress-competitions-in-data-and.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkIAQXczeyp7ImA9WhVTFE0.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-7728517665240036340</id><published>2012-02-28T05:49:00.000+01:00</published><updated>2012-02-28T05:49:00.983+01:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-02-28T05:49:00.983+01:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Phishing" /><category scheme="http://www.blogger.com/atom/ns#" term="CFP" /><category scheme="http://www.blogger.com/atom/ns#" term="Security" /><category scheme="http://www.blogger.com/atom/ns#" term="Social Networks" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Events" /><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="Virus" /><category scheme="http://www.blogger.com/atom/ns#" term="Privacy" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>CFP: RECSI 2012 : XII Spanish Meeting on Criptology and Information Security</title><content type="html">&lt;p&gt;&lt;strong&gt;&lt;a href="http://recsi2012.mondragon.edu/en?set_language=en" target="_blank"&gt;&lt;strong&gt;XII Spanish Meeting on Criptology and Information Security - RECSI 2012
&lt;br/&gt;&lt;/strong&gt;&lt;/a&gt;&lt;/strong&gt;Donostia-San Sebastian, Spain
&lt;br/&gt;
September 4-7, 2012&lt;/p&gt;
&lt;p&gt;The Information Society is a reality involving principal actors: Companies, Government, Scientific and Education Agents and Society as a whole. Scientific actors provide knowledge, that, with the support of the Government and a transformation to technology by Companies, must respond to the demand of the society in short, medium and long term. Computer Security is needed to ensure the correct use of Information Technology and Communication by Society.&lt;/p&gt;
&lt;p&gt;The Spanish Meeting on Cryptology and Information Security (RECSI) is the Spanish reference scientific Conference in the field. Every two years the most important national researchers attend this event, and renowned foreign researchers are invited. In the meeting recent advances in security will be presented to the main agents of the Information Society, in order to provide a forum for exchanging ideas, increase knowledge and share experiences in the field of Security.&lt;/p&gt;
&lt;p&gt;For this reason, Mondragon University is organizing the RECSI in order to bring together most of the research groups and the most important companies that work in: design of methods to protect the information (cryptography), analyze these methods to discover vulnerabilities (cryptanalysis), and create and analyze methods for the protection of computer systems and communication networks (Information Security).&lt;/p&gt;
&lt;p&gt;In September 2012, its twelfth edition will take place in Donostia-San Sebastián. Previous editions were held in Palma de Mallorca (1991), Madrid (1992), Barcelona (1994), Valladolid (1996), Torremolinos (1998), Santa Cruz de Tenerife (2000), Oviedo (2002), Leganés (2004), Barcelona (2006), Salamanca (2008) and Tarragona (2010).&lt;/p&gt;
&lt;p&gt;All papers about information security are relevant, and selected topics include:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Cryptography and cryptanalysis&lt;/li&gt;
&lt;li&gt;Authentication and digital signatures&lt;/li&gt;
&lt;li&gt;Cryptography Applications&lt;/li&gt;
&lt;li&gt;Privacy and Anonymity&lt;/li&gt;
&lt;li&gt;Watermarking and steganography&lt;/li&gt;
&lt;li&gt;Access Control&lt;/li&gt;
&lt;li&gt;Intrusion detection and traps&lt;/li&gt;
&lt;li&gt;Malware Analysis&lt;/li&gt;
&lt;li&gt;Spam detection&lt;/li&gt;
&lt;li&gt;Security in Social Networks&lt;/li&gt;
&lt;li&gt;Security in Embedded Systems&lt;/li&gt;
&lt;li&gt;Computer Forensics&lt;/li&gt;
&lt;/ul&gt;
&lt;p style="TEXT-ALIGN: left"&gt;&lt;strong&gt;Important Dates&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;div style="TEXT-ALIGN: left"&gt;Deadline for paper submission: April 2, 2012&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div style="TEXT-ALIGN: left"&gt;Acceptance notification: May 1, 2012&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div style="TEXT-ALIGN: left"&gt;Conference: September 4-7, 2012&lt;/div&gt;
&lt;/li&gt;
&lt;/ul&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/qZZmeBrsxxU" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/7728517665240036340/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=7728517665240036340&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7728517665240036340?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7728517665240036340?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/qZZmeBrsxxU/cfp-recsi-2012-xii-spanish-meeting-on.html" title="CFP: RECSI 2012 : XII Spanish Meeting on Criptology and Information Security" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2012/02/cfp-recsi-2012-xii-spanish-meeting-on.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CkEBQ3s7eSp7ImA9WhRbGEo.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-5212883425709041998</id><published>2012-02-10T06:46:00.000+01:00</published><updated>2012-02-10T11:44:12.501+01:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-02-10T11:44:12.501+01:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="CFP" /><category scheme="http://www.blogger.com/atom/ns#" term="Events" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="Children Protection" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Filtering" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>PAN 2012 Author Identification - Sexual Predator task, references and resources</title><content type="html">&lt;p&gt;The &lt;a href="http://pan.webis.de/" target="_blank"&gt;&lt;strong&gt;PAN Labs (Uncovering Plagiarism, Authorship, and Social Software Misuse)&lt;/strong&gt;&lt;/a&gt; is a series of scientific competitions that have been performed during the recent years, focused on applying automated text analysis to the detection of &lt;strong&gt;plagiarism, authorship attribution&lt;/strong&gt;, and related tasks. Unlike other, more traditional text classification tasks like Text Categorization, the problems are modeled using &lt;strong&gt;style attributes&lt;/strong&gt; (instead of content words), like frequencies of particular syntactic tags, specific collocations, approximate string matching, etc. As in other scientific competitions, the organizers provide a labeled tratining set of texts in order to refine both the input/output format and the algorithms, and the participants are required to run their software on a test set with unknown labels.&lt;/p&gt;
&lt;p&gt;The PAN 2012 Lab will be held in Rome in September in conjunction with the &lt;a href="http://clef2012.org/" target="_blank"&gt;CLEF 2012 conference&lt;/a&gt;. It features three tasks, one of them being &lt;a href="http://www.uni-weimar.de/medien/webis/research/events/pan-12/pan12-web/authorship.html" target="_blank"&gt;Author Identification&lt;/a&gt;. This task focuses on &lt;strong&gt;identifying sexual predators in chat logs&lt;/strong&gt;, and on authorship verification. The training data will be released on Mar 16, 2012.&lt;/p&gt;
&lt;p&gt;For those willing to participate in this competition, I provide a series of resources that may help them.&lt;/p&gt;
&lt;p&gt;First, the &lt;a href="http://perverted-justice.com/index.php" target="_blank"&gt;&lt;strong&gt;Perverted Justice&lt;/strong&gt;&lt;/a&gt; website, run by the &lt;a href="http://www.pjfi.org/" target="_blank"&gt;Perverted Justice Foundation Inc.&lt;/a&gt;, features a large number of English-language chat logs from real sexual predators talking to volunteers acting as female youngsters. These archives are public and legal according to USA Laws. There is no danger in using them for research purposes.&lt;/p&gt;
&lt;p&gt;Secondly, here is a &lt;strong&gt;list of papers&lt;/strong&gt; that may be of interest for those willing to prepare an algorithm or system to join the PAN 2012 author identification task on detecting sexual predators. Not all of them are related to sexual predators; some address other child security problems on the Internet, such as cyberbullying:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Myriam Munezero, Tuomo Kakkonen and Calkin Montero, "Towards automatic detection of antisocial behavior from texts", IJCNLP 2011 Proceedings of the Workshop on Sentiment Analysis where AI meets Psychology (SAAIP). &lt;a href="http://www.ijcnlp2011.org/proceeding/workshop/WS8_SAAIP/SAAIP-2011.pdf"&gt;http://www.ijcnlp2011.org/proceeding/workshop/WS8_SAAIP/SAAIP-2011.pdf&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;McGhee, India, Jennifer Bayzick, April Kontostathis, Lynne Edwards, Alexandra McBride, and Emma Jakubowski. (2011). &lt;a href="http://webpages.ursinus.edu/akontostathis/KontostathisMSMSubmittedNov2010.pdf"&gt;Learning to Identify Internet Sexual Predation&lt;/a&gt;. International Journal on Electronic Commerce. Volume 15, Number 3. Spring 2011&lt;/p&gt;
&lt;p&gt;Karthik Dinakar, Birago Jones, Catherine Havasi, Henry Lieberman, Rosalind Picard, "TimeOut: Commonsense Reasoning for Detection, Prevention, and Mitigation of Cyberbullying", ACM Transactions on Interactive Intelligent Systems, 2011, &lt;a href="http://web.media.mit.edu/~lieber/Publications/Bullying-TiiS.pdf"&gt;http://web.media.mit.edu/~lieber/Publications/Bullying-TiiS.pdf&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Tibor Bosse and Sven Stam, "A Normative Agent System to Prevent Cyberbullying", In IEEE/WIC/ACM International Conferences on Web Intelligence and Intelligent Agent Technology 2011, &lt;a href="http://www.cs.vu.nl/~tbosse/papers/IAT11-cyberbullying.pdf"&gt;http://www.cs.vu.nl/~tbosse/papers/IAT11-cyberbullying.pdf&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Jennifer Bayzick, April Kontostathis and Lynne Edwards, "Detecting the Presence of Cyberbullying Using Computer Software", Poster presentation at WebSci11, June 14-17, 2011, Koblenz Germany. &lt;a href="http://www.websci11.org/fileadmin/websci/Posters/63_paper.pdf"&gt;http://www.websci11.org/fileadmin/websci/Posters/63_paper.pdf&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Dinakar K., Reichart R., Lieberman H., "Modeling the detection of textual cyberbullying", International Conference on Weblog and Social Media - Social Mobile Web Workshop, Barcelona, Spain 2011. &lt;a href="http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/paper/download/3841/4384"&gt;http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/paper/download/3841/4384&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Michal Ptaszynski, Pawel Dybala, Tatsuaki Matsuba, Fumito Masui, Rafal Rzepka, Kenji Araki, and Yoshio Momouchi, "In the Service of Online Order: Tackling Cyber-Bullying with Machine Learning and Affect Analysis", International Journal of Computational Linguistics Research, Vol. 1 , Issue 3, pp. 135-154, 2010. &lt;a href="http://arakilab.media.eng.hokudai.ac.jp/~ptaszynski/data/Ptaszynski_IJCLR2010-Cyberbullying_2011.02.23.pdf"&gt;http://arakilab.media.eng.hokudai.ac.jp/~ptaszynski/data/Ptaszynski_IJCLR2010-Cyberbullying_2011.02.23.pdf&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Michal Ptaszynski, Pawel Dybala, Tatsuaki Matsuba, Fumito Masui, Rafal Rzepka and Kenji Araki, "Machine Learning and Affect Analysis Against Cyber-Bullying", In Proceedings of The Thirty Sixth Annual Convention of the Society for the Study of Artificial Intelligence and Simulation of Behaviour (AISB'10), 29th March - 1st April 2010, De Montfort University, Leicester, UK, pp. 7-16, 2010. &lt;a href="http://arakilab.media.eng.hokudai.ac.jp/~ptaszynski/data/AISB2010_Cyberbullying_paper.pdf"&gt;http://arakilab.media.eng.hokudai.ac.jp/~ptaszynski/data/AISB2010_Cyberbullying_paper.pdf&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Kontostathis, April, Lynne Edwards, and Amanda Leatherman. (2009). &lt;a href="http://webpages.ursinus.edu/akontostathis/TextMining2009BookChapter.pdf"&gt;Text Mining and Cybercrime&lt;/a&gt; In Text Mining: Application and Theory. Michael W. Berry and Jacob Kogan, Eds., John Wiley &amp;amp; Sons, Ltd. 2009. &lt;a href="http://www.chatcoder.com/Data/TextMiningAndCyberCrimeData.rar"&gt;Link to data used in paper&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Kontostathis, April, Lynne Edwards, Jen Bayzick, India McGhee, Amanda Leatherman and Kristina Moore. (2009). &lt;a href="http://webpages.ursinus.edu/akontostathis/KontostathisMSMFinal.pdf"&gt;Comparison of Rule-based to Human Analysis of Chat Logs.&lt;/a&gt; 1st International Workshop on Mining Social Media (MSM09). Seville, Spain. Nov 2009.&lt;/p&gt;
&lt;p&gt;Kontostathis, April, Lynne Edwards, and Amanda Leatherman. (2009). &lt;a href="http://webpages.ursinus.edu/akontostathis/KontostathisTextMining2009Final.pdf"&gt;ChatCoder: Toward the Tracking and Categorization of Internet Predators.&lt;/a&gt; In Proc. Text Mining Workshop 2009 held in conjunction with the Ninth SIAM International Conference on Data Mining (SDM 2009). Sparks, NV. May 2009.&lt;/p&gt;
&lt;p&gt;D. Yin, Z. Xue, L. Hong, B. D. Davison, A. Kontostathis, and L. Edwards, "Detection of Harassment on Web 2.0", In CAW 2.0 '09: Proceedings of the 1st Content Analysis in Web 2.0 Workshop, Madrid, Spain, 2009. &lt;a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.151.8839&amp;amp;rep=rep1&amp;amp;type=pdf"&gt;http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.151.8839&amp;amp;rep=rep1&amp;amp;type=pdf&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/aJkeNrxLcEM" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/5212883425709041998/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=5212883425709041998&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/5212883425709041998?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/5212883425709041998?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/aJkeNrxLcEM/pan-2012-author-identification-sexual.html" title="PAN 2012 Author Identification - Sexual Predator task, references and resources" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2012/02/pan-2012-author-identification-sexual.html</feedburner:origLink></entry><entry gd:etag="W/&quot;A0ABRng6eCp7ImA9WhRREE4.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-7760593993210653027</id><published>2011-11-23T10:13:00.000+01:00</published><updated>2011-11-23T10:15:57.610+01:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-11-23T10:15:57.610+01:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Resources" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Evaluation" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Biomedicine" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Datasets, databases and resources: MSH WSD, BioNOT, Gazetiki, DBpedia Spotlight, Google BigQuery, Common Crawl</title><content type="html">&lt;p&gt;Some datasets and resources I have recently found (although they may be old):&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;&lt;a href="http://wsd.nlm.nih.gov/collaboration.shtml"&gt;MSH WSD&lt;/a&gt;&lt;/strong&gt;: a data set for Word Sense Disambiguation WSD based on a method that can be used to automatically develop a WSD test collection using the Unified Medical Language System (UMLS) Metathesaurus and the manual MeSH indexing of MEDLINE.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a href="http://snake.ims.uwm.edu/bionot/index.php" target="_blank"&gt;BioNOT&lt;/a&gt;&lt;/strong&gt;: a searchable database of negated biomedical sentences. The database consists of more than 32 million negated sentences at PubMed.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a href="http://georama-project.labs.exalead.com/gazetiki.htm" target="_blank"&gt;Gazetiki&lt;/a&gt;&lt;/strong&gt;: a geographical database that contains 8323702 geographical names coming from Geonames and from different Web sources, with the latter representing over 1 million items, with the addition of a popularity score which was calculated based on the usage of a place name in a geotagged dataset.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a href="http://dbpedia.org/spotlight" target="_blank"&gt;DBpedia Spotlight&lt;/a&gt;&lt;/strong&gt;: a tool for automatically annotating mentions of DBpedia resources in text, providing a solution for linking unstructured information sources to the Linked Open Data cloud through DBpedia. DBpedia Spotlight performs named entity extraction, including entity detection and Name Resolution.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a href="http://code.google.com/apis/bigquery/" target="_blank"&gt;Google BigQuery Service&lt;/a&gt;&lt;/strong&gt;: a SQL-like tool for analyzing massive datasets, as a web service that enables you to do interactive analysis of massively large datasets-up to billions of rows.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.commoncrawl.org/" target="_blank"&gt;&lt;strong&gt;Common Crawl&lt;/strong&gt;&lt;/a&gt;: a freely accessible index of 5 billion web pages, their page rank, their link graphs and other metadata, hosted on Amazon EC2, was announced today by the Common Crawl Foundation.&lt;/li&gt;
&lt;/ul&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/sAc4kXljyUw" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/7760593993210653027/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=7760593993210653027&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7760593993210653027?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7760593993210653027?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/sAc4kXljyUw/datasets-databases-and-resources-msh.html" title="Datasets, databases and resources: MSH WSD, BioNOT, Gazetiki, DBpedia Spotlight, Google BigQuery, Common Crawl" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/11/datasets-databases-and-resources-msh.html</feedburner:origLink></entry><entry gd:etag="W/&quot;CEEFRHs-cSp7ImA9WhRSGEs.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-838585299454748513</id><published>2011-11-21T09:02:00.000+01:00</published><updated>2011-11-21T09:03:35.559+01:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-11-21T09:03:35.559+01:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Evaluacion" /><category scheme="http://www.blogger.com/atom/ns#" term="Recuperación de Información" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" 
/><title>Seminario MAVIR: Búsqueda de personas en la Web</title><content type="html">&lt;p&gt;&lt;strong&gt;TÍTULO&lt;/strong&gt;: Búsqueda de personas en la Web
&lt;br/&gt;
&lt;strong&gt;PONENTE&lt;/strong&gt;: Julio Gonzalo (NLP&amp;amp;IR-UNED)
&lt;br/&gt;
&lt;strong&gt;FECHA Y HORA&lt;/strong&gt;: miércoles 23 de noviembre de 2011, a las 12h00&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;ABSTRACT&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Buscar información sobre personas es una actividad frecuente en la Red, que va desde encontrar un número de teléfono hasta analizar la reputación online de una persona a partir de miles de fuentes. Pero utilizar un nombre de persona como consulta tiene un problema crucial: la ambigüedad. Los buscadores convencionales devuelven una lista de resultados en la que, muy a menudo, se mezcla información sobre muchas personas que se llaman igual. Un buscador ideal agruparía los resultados que se refieren a cada persona, y extraería dinámicamente un perfil de cada persona a partir del contenido de las páginas encontradas.&lt;/p&gt;
&lt;p&gt;Web People Search es el término acuñado para este reto, y una parte sustancial de la investigación en el tema se ha vertebrado en torno a las campañas de evaluación competitiva WePS. En esta charla resumiremos los progresos alcanzados hasta ahora, y utilizaremos WePS como un ejemplo del uso de campañas de evaluación como motor de la investigación en Tecnologías de la Lengua. Por el camino, nuestras reflexiones ayudarán a explicar por qué los espías se niegan a usar la medida F de Van Rijsbergen, y qué podrían usar a cambio.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;BIO&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Julio Gonzalo es coordinador del grupo de investigación NLP&amp;amp;IR de la UNED. Sus intereses de investigación se sitúan en la intersección entre el Procesamiento del Lenguaje Natural y la Recuperación de
&lt;br/&gt;
Información e incluyen, en particular, la búsqueda de información multilingüe e interactiva, la organización de resultados de búsqueda, la búsqueda semántica, y las métricas y metodologías de evaluación en
&lt;br/&gt;
estas áreas. Ha sido recientemente general co-chair del CLEF 2011, area chair de EACL 2012 y EMNLP 2010, y co-organizador de las campañas de evaluación WePS (2007, 2009 y 2010).&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;LUGAR DE CELEBRACIÓN&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Sala de Grados
&lt;br/&gt;
Escuela Politécnica Superior, UAM
&lt;br/&gt;
c/ Francisco Tomás y Valiente, 11
&lt;br/&gt;
Campus de Cantoblanco
&lt;br/&gt;
28049 Madrid&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;+ INFO&lt;/strong&gt;: &lt;a href="http://www.mavir.net/talks/100-jgonzalo-nov2011" target="_blank"&gt;http://www.mavir.net/talks/100-jgonzalo-nov2011&lt;/a&gt;&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/NuhrvJUVfyc" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/838585299454748513/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=838585299454748513&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/838585299454748513?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/838585299454748513?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/NuhrvJUVfyc/titulo-busqueda-de-personas-en-la-web.html" title="Seminario MAVIR: Búsqueda de personas en la Web" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/11/titulo-busqueda-de-personas-en-la-web.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0YHRXo5fyp7ImA9WhdaF00.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-7773864689605700151</id><published>2011-10-27T10:45:00.001+02:00</published><updated>2011-10-27T10:45:34.427+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-10-27T10:45:34.427+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="CFP" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Online Advertising" /><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="Internet" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>CFP: WWW 2012, Internet Monetization and Incentives Track</title><content type="html">&lt;p/&gt;
&lt;p&gt;&lt;strong&gt;Call for Papers: WWW 2012, Internet Monetization and Incentives Track&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;The Web has become a major economic phenomenon, acting both as a conduit for traditional endeavours and transactions such as business-to-business and business-to-consumer commerce, and as an arena for a specific variety of other economic activities such as Web advertising, digital payment systems, and bandwidth provisioning. The WWW track on Internet Monetization is a forum for theoretical and applied research related to web-specific economic activities.&lt;/p&gt;
&lt;p&gt;The track will be interdisciplinary in nature. Relevant topics include (but are not limited to) :&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;div&gt;Computational advertising: sponsored search, content match, graphical ads delivery, targeting&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Machine learning and data mining applied to auction theory and user modeling in the context of Internet monetization&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Internet auctions, markets, and exchanges&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Economic aspects of online reviews, reputations, and ratings&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Monetizing digital media, user generated content, and the social web&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;User-experience design aspects of Web monetization mechanisms&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Web analytics for e-commerce&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Economics of information/digital goods&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Advertising infrastructure: tools, platforms, networks, exchanges, automation, audience intelligence&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Economic approaches to spam/fraud control&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Social and crowdsourcing commerce&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;E-commerce issues in cloud computing and Web apps&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Mobile web advertising and location-based e-commerce&lt;/div&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Important dates&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;All submission deadlines are at 9:00pm PST.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;div&gt;November 1st, 2011 Abstracts for papers due&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;November 7th, 2011 Papers due&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;January 30th, 2012 Paper notifications out&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;February 28th, 2012 Camera ready papers due&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;April 16th, 2012 Conference begins&lt;/div&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;All submissions must be formatted according to the ACM SIG Proceedings Template.&lt;/p&gt;
&lt;p&gt;Submission can be made at: &lt;a href="https://www.easychair.org/account/signin.cgi?conf=www2012"&gt;https://www.easychair.org/account/signin.cgi?conf=www2012&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Via &lt;a href="http://www.sigir.org/sigirlist/" target="_blank"&gt;SIGIRList&lt;/a&gt;.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/5q7kcZ_1904" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/7773864689605700151/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=7773864689605700151&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7773864689605700151?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/7773864689605700151?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/5q7kcZ_1904/cfp-www-2012-internet-monetization-and.html" title="CFP: WWW 2012, Internet Monetization and Incentives Track" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/10/cfp-www-2012-internet-monetization-and.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DUcMRns_fyp7ImA9WhdaEEg.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-492154649456286751</id><published>2011-10-19T22:44:00.001+02:00</published><updated>2011-10-19T22:44:47.547+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-10-19T22:44:47.547+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Papers" /><category scheme="http://www.blogger.com/atom/ns#" term="Resources" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Personal" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>ACM Author-Izer: it is for you, it is for the ACM</title><content type="html">&lt;p&gt;A win-win, absoultely. The ACM has developed a new service, now live, named "&lt;strong&gt;&lt;a href="http://www.acm.org/publications/acm-author-izer-service" target="_blank"&gt;Author-Izer&lt;/a&gt;&lt;/strong&gt;". A relatively long explanation of the service is there, but I strongly recommend &lt;a href="http://www.acm.org/publications/Author-Izer-2011-update.pptx" target="_blank"&gt;to take a look at the presentation&lt;/a&gt;; it will give you an idea of what you get, and how to get it. However, and in few words, you make the &lt;strong&gt;final version of your papers at ACM publications available for free&lt;/strong&gt; to your readers, and in turn, you &lt;strong&gt;allow the ACM to get a more precise account of cites and downloads&lt;/strong&gt;. A win-win, because you benefit from knowing (and posting) your (registered) cites and downloads, as well.&lt;/p&gt;
&lt;p&gt;It would be redundant to explain the process here, but although it seems easy, it takes a while. And that is important, because you may want to get your papers made public for free &lt;strong&gt;instantly&lt;/strong&gt;. But well, it does not work so quickly. Some hints of the process:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;All right, you are told to create an &lt;a href="http://dl.acm.org/" target="_blank"&gt;ACM Digital Library&lt;/a&gt; Web user account. That is easy, just sign up. But the user name is automatically generated, so do not forget it, or you will have to either answer the secret question you chose at registration and check your email (delay!), or re-create the account.&lt;/li&gt;
&lt;li&gt;Then you are requested to make at least one change to your "&lt;strong&gt;author profile&lt;/strong&gt;". But where is it? You cannot find it on the ACM DL page, not even when you are logged in. That is because there is no link between you as a Web user and you as an author. So the quickest way to find the profile is to search for your name in the DL, and click on your author name in one of your papers. Then you edit it (suggestion: fill in the form completely), and your changes stay pending until somebody at the DL authorizes them. Wait. It took a day in my case, but it can be hours if you are in the same time zone as the ACM - that is, the US.&lt;/li&gt;
&lt;li&gt;Once your changes are admitted, you can access your author profile page. This should be mine: &lt;a href="http://dl.acm.org/author_page.cfm?id=81474676938&amp;amp;coll=DL&amp;amp;dl=GUIDE" target="_blank"&gt;ACM DL author profile for Jose Maria Gomez Hidalgo&lt;/a&gt;. If you are logged in as a user, you will see a new export link at your pubs in the ACM journals/conferences: "ACM Author-Izer Service". You can export all of them with the same link at the beginning of the page, but unless you have dozens and dozens, I do not recommend it. It is easier to go one by one.&lt;/li&gt;
&lt;li&gt;When you click one of the Author-Izer links, you get a popup for filling in some details, and you get the code to insert in your papers page. Beware, you have to write the URL of the page that will be hosting your paper page, or it will not work. In my case, I tried several times until I got it: &lt;a href="http://www.esp.uem.es/jmgomez/papers/"&gt;http://www.esp.uem.es/jmgomez/papers/&lt;/a&gt;. Copy the code into your page, and that is it. It is done instantly.&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;So that is all. Now you can have a link to your camera-ready version, and the citation stats, in your page. Simple but with some delays. As a note, every time you make a change at your author profile, it is manually reviewed, so it may take time to get it live.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/6SW_d9xIcNI" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/492154649456286751/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=492154649456286751&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/492154649456286751?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/492154649456286751?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/6SW_d9xIcNI/acm-author-izer-it-is-for-you-it-is-for.html" title="ACM Author-Izer: it is for you, it is for the ACM" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/10/acm-author-izer-it-is-for-you-it-is-for.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkIARXw8cSp7ImA9WhdbFEQ.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-4289938016807761068</id><published>2011-10-13T10:24:00.001+02:00</published><updated>2011-10-13T10:29:04.279+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-10-13T10:29:04.279+02:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Resources" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>JRC-Names - A freely available, highly multilingual named entity resource</title><content type="html">&lt;p&gt;&lt;strong&gt;JRC-Names&lt;/strong&gt; is a highly multilingual named entity resource for person and organisation names ('entities'). It consists of large lists of names and their many spelling variants (up to hundreds for a single person), including across scripts (Latin, Greek, Arabic, Cyrillic, Japanese, Chinese, etc.). The named entity resource file with the list of spelling variants is accompanied by Java-implemented demonstrator software that (a) allows to produce - for any input name - a list of known spelling variants, and that (b) analyses UTF8-encoded text files to find known entity mentions, returning the name variant found, the preferred display name for that entity, the unique name identifier for that name, the position of the entity name in the text, and its length in characters.&lt;/p&gt;
&lt;p&gt;To see examples, go to any of the over one million entity pages on EMM-NewsExplorer (e.g. that for Muammar Gaddafi at &lt;a href="http://emm.newsexplorer.eu/NewsExplorer/entities/en/262.html"&gt;http://emm.newsexplorer.eu/NewsExplorer/entities/en/262.html&lt;/a&gt;) to see the list of spelling variants automatically collected for that entity.&lt;/p&gt;
&lt;p&gt;JRC-Names is a &lt;em&gt;technical&lt;/em&gt; resource that can be used to find names even if they are spelled differently and to normalise name spellings in databases or other repositories. It is also a useful ingredient for IT systems that process text, e.g. for text mining, machine translation, social network generation, and other applications involving named entities.&lt;/p&gt;
&lt;p&gt;JRC-Names is a by-product of the analysis of about 100,000 news reports per day by the *Europe Media Monitor* (EMM) family of applications (freely accessible at &lt;a href="http://emm.newsbrief.eu/overview.html"&gt;http://emm.newsbrief.eu/overview.html&lt;/a&gt;). It was mostly compiled automatically, by analysing hundreds of millions of news articles since the year 2004 in up to twenty languages, identifying names of entities (mostly persons, but also organisations, event names, and more), and detecting which of these newly found names are variant spellings of each other. Most name variants in JRC-Names are thus spellings that were found in real-life text (including frequent spelling mistakes). Additionally, for a subset of the collection of entities, software automatically extracted spelling variants in many further languages (e.g. Chinese, Thai, Japanese, ...) from the cross-lingual links in Wikipedia. For highly frequent or otherwise important names, the named entity resource was additionally manually verified. As JRC-Names was mostly produced automatically, it will contain some errors.&lt;/p&gt;
&lt;p&gt;At http://langtech.jrc.ec.europa.eu/, you can find more information on the JRC's multilingual language technology activity, a download link for JRC-Names and a reference paper explaining the named entity resource, as well as a page pointing to other multilingual resources.&lt;/p&gt;
&lt;p&gt;Via &lt;a href="https://listserv.uned.es/mailman/listinfo/mavir-collab"&gt;MAVIR&lt;/a&gt; and &lt;a href="http://mailman.elsnet.org/mailman/listinfo/elsnet-list"&gt;Elsnet&lt;/a&gt; lists.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/Z4ENaDelLAU" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/4289938016807761068/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=4289938016807761068&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/4289938016807761068?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/4289938016807761068?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/Z4ENaDelLAU/rc-names-freely-available-highly.html" title="JRC-Names - A freely available, highly multilingual named entity resource" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/10/rc-names-freely-available-highly.html</feedburner:origLink></entry><entry gd:etag="W/&quot;DkABRH49eSp7ImA9WhdbFEQ.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-6894374282843910911</id><published>2011-10-13T09:00:00.001+02:00</published><updated>2011-10-13T10:32:35.061+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-10-13T10:32:35.061+02:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Eventos" 
/><category scheme="http://www.blogger.com/atom/ns#" term="Text Mining" /><category scheme="http://www.blogger.com/atom/ns#" term="Recuperación de Información" /><category scheme="http://www.blogger.com/atom/ns#" term="Internet" /><category scheme="http://www.blogger.com/atom/ns#" term="NLP" /><category scheme="http://www.blogger.com/atom/ns#" term="Information Retrieval" /><title>VI Jornadas MAVIR: Tecnologías de Acceso a la Información: Estado actual y reto</title><content type="html">&lt;p&gt;&lt;strong&gt;VI Jornadas MAVIR
&lt;br/&gt;
Tecnologías de Acceso a la Información: Estado actual y retos
&lt;br/&gt;&lt;/strong&gt; 15 y 16 de noviembre de 2011
&lt;br/&gt;
ETSI Informática, URJC&lt;/p&gt;
&lt;p&gt;Registro gratuito: &lt;a href="http://6jmavir.appspot.com"&gt;http://6jmavir.appspot.com&lt;/a&gt;
&lt;br/&gt;
Información completa: &lt;a href="http://www.mavir.net/events/96-jornadas-mavir-2011"&gt;http://www.mavir.net/events/96-jornadas-mavir-2011&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;PROGRAMA&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;&lt;em&gt;martes 15/11/2011. Jornada Científica y Empresarial
&lt;br/&gt;
Salón de Grados, Edificio Interdepartamental II, URJC&lt;/em&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;9h00: Recepción y entrega de documentación&lt;/p&gt;
&lt;p&gt;9h30: Graph Distance Distribution for Social Network Mining
&lt;br/&gt;
Paolo Boldi (Università degli Studi di Milano)&lt;/p&gt;
&lt;p&gt;11h00: pausa café&lt;/p&gt;
&lt;p&gt;11h30: Ceremonia de Apertura de las Jornadas y Entrega del Premio MAVIR
&lt;br/&gt;
Raquel Martínez, Coordinadora Adjunta del Consorcio MAVIR
&lt;br/&gt;
Abraham Duarte, Investigador Principal del grupo GAVAB-URJC&lt;/p&gt;
&lt;p&gt;12h30: Exposición del Premio MAVIR 2011&lt;/p&gt;
&lt;p&gt;13h00: pausa almuerzo&lt;/p&gt;
&lt;p&gt;15h00: Understanding Text with Knowledge Bases and Random Walks
&lt;br/&gt;
Eneko Agirre (Universidad del País Vasco/Euskal Herriko Unibertsitatea)&lt;/p&gt;
&lt;p&gt;16h30: Mesa Redonda Empresas&lt;/p&gt;
&lt;p&gt;17h30: fin del primer día&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;&lt;em&gt;miércoles 16/11/2011. Jornada Académica
&lt;br/&gt;
Salón de Grados, Edificio Interdepartamental II, URJC&lt;/em&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;9h30: Recepción y entrega de documentación&lt;/p&gt;
&lt;p&gt;10h00: Presentación de las líneas de investigación actuales del Consorcio MAVIR
&lt;br/&gt;
Ronda de presentaciones a cargo de investigadores de los grupos
&lt;br/&gt;
CybermetricsLab-CSIC, HTL&amp;amp;IR-UAM, LABDA-UC3M, GSI-UEM, NLP&amp;amp;IR-UNED,
&lt;br/&gt;
THALES-UPM y GAVAB-URJC.&lt;/p&gt;
&lt;p&gt;13h30: clausura de las jornadas&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;&lt;strong&gt;PRESENTACIÓN&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;La Universidad Rey Juan Carlos acoge los próximos 15 y 16 de noviembre de 2011 la sexta edición de las Jornadas MAVIR. Este año, bajo el título "Tecnologías de Acceso a la Información: Estado actual y retos", reunimos charlas científicas a cargo de ponentes de prestigio internacional y presentaciones de empresas de base tecnológica que trabajan en las líneas de investigación prioritarias del consorcio. Entre los temas que se tratarán destacan el análisis de grafos para el análisis de las redes sociales, las técnicas de semántica computacional aplicadas a la comprensión de información textual y el estado actual de las tecnologías de acceso inteligente a la información.&lt;/p&gt;
&lt;p&gt;Como en años anteriores, aprovechamos la celebración de las jornadas para entregar la quinta edición del Premio MAVIR al mejor Trabajo de Fin de Carrera o Tesis de Máster en las áreas de investigación del consorcio. El premio está patrocinado por las empresas Bitext, Corex, Daedalus e iSOCO y cuenta con una dotación en metálico de 1.600 €.&lt;/p&gt;
&lt;p&gt;El Consorcio MAVIR es una red de investigación co-financiada por la Comunidad de Madrid y el Fondo Social Europeo bajo los programas de I+D en TIC MA2VICMR (2010-2013) y MAVIR (2006-2009). El núcleo del consorcio está formado por un equipo multidisciplinar de más de 50 ingenieros, científicos, lingüistas y documentalistas provenientes de grupos consolidados, 15 PYMEs de base tecnológica y empresas integradoras. Las actividades de I+D del consorcio se centran en el campo de las tecnologías lingüísticas aplicadas a los sistemas inteligentes de acceso y tratamiento de la información multimedia y multilingüe.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;DESTINATARIOS&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;div&gt;Empresas de base tecnológica relacionadas con las Tecnologías del Lenguaje Humano, los Sistemas Inteligentes de Acceso a la Información, los sistemas de Reconocimiento Automático del Habla, los buscadores y la Web, especialmente aquellas con interés en el acceso y gestión de contenidos multimedia y multilingües.&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Empresas integradoras con intereses en Sistemas Inteligentes de Acceso a la Información.&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Grupos de investigación y spin-offs con intereses en Transferencia de Tecnología en TIC.&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Organismos públicos y privados para los cuales la presencia en la Web es un factor estratégico.&lt;/div&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;div&gt;Investigadores y estudiantes interesados en áreas como el acceso a la información multimedia y multilingüe, el procesamiento del lenguaje natural, la creación de recursos lingüísticos, la extracción de información, los sistemas de búsqueda de respuestas, el reconocimiento del habla y la Web semántica.&lt;/div&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;LUGAR DE CELEBRACIÓN&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Salón de Grados, Edificio Interdepartamental II
&lt;br/&gt;
ETSI Informática, Universidad Rey Juan Carlos
&lt;br/&gt;
c/ Tulipán, s/n
&lt;br/&gt;
28933 Móstoles, Madrid&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;REGISTRO&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;La asistencia es abierta y gratuita. Dado lo limitado del aforo, es necesario registrarse y reservar plaza en el formulario de registro: &lt;a href="http://registro.mavir.net"&gt;http://registro.mavir.net&lt;/a&gt;&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/rsVQRmjBMAw" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/6894374282843910911/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=6894374282843910911&amp;isPopup=true" title="1 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/6894374282843910911?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/6894374282843910911?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/rsVQRmjBMAw/vi-jornadas-mavir-tecnologias-de-acceso.html" title="VI Jornadas MAVIR: Tecnologías de Acceso a la Información: Estado actual y reto" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>1</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/10/vi-jornadas-mavir-tecnologias-de-acceso.html</feedburner:origLink></entry><entry gd:etag="W/&quot;D0AARXczeyp7ImA9WhZbEkQ.&quot;"><id>tag:blogger.com,1999:blog-36589303.post-3273505205448274624</id><published>2011-06-17T09:29:00.001+02:00</published><updated>2011-06-17T09:29:04.983+02:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2011-06-17T09:29:04.983+02:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="Security" /><category scheme="http://www.blogger.com/atom/ns#" term="Spam" /><category scheme="http://www.blogger.com/atom/ns#" term="English" /><title>Comment in New Scientist feature on Crowsourced SMS Spam filter</title><content type="html">&lt;p&gt;&lt;a href="http://www.newscientist.com/search?rbauthors=Jacob+Aron" target="_blank"&gt;Jacob Aron&lt;/a&gt; from &lt;a href="http://www.newscientist.com/" target="_blank"&gt;New Scientist&lt;/a&gt; interviewed me some time ago regarding the following paper:&lt;/p&gt;
&lt;blockquote style="MARGIN-RIGHT: 0px" dir="ltr"&gt;
&lt;p&gt;Kuldeep Yadav, Ponnurangam Kumaraguru, Atul Goyal, Ashish Gupta, Vinayak Naik. &lt;strong&gt;&lt;a href="http://docs.google.com/viewer?url=https://sites.google.com/site/atulgoyalpersona/uploads/hotmobile11-paper15.pdf" target="_blank"&gt;&lt;strong&gt;SMSAssassin: Crowdsourcing Driven Mobile-based System for SMS Spam Filtering&lt;/strong&gt;&lt;/a&gt;&lt;/strong&gt;.
&lt;br/&gt;
12th Workshop on Mobile Computing Systems and Applications (HotMobile 2011).&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;You can find my opinion at the feature: &lt;a href="http://www.newscientist.com/article/dn20175-crowdsourced-software-could-stop-sms-spam.html" target="_blank"&gt;Crowdsourced software could stop SMS spam&lt;/a&gt;.&lt;/p&gt;&lt;img src="http://feeds.feedburner.com/~r/NihilObstat/~4/W-BuzxCsTKQ" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://jmgomezhidalgo.blogspot.com/feeds/3273505205448274624/comments/default" title="Enviar comentarios" /><link rel="replies" type="text/html" href="http://www.blogger.com/comment.g?blogID=36589303&amp;postID=3273505205448274624&amp;isPopup=true" title="0 comentarios" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3273505205448274624?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/36589303/posts/default/3273505205448274624?v=2" /><link rel="alternate" type="text/html" href="http://feedproxy.google.com/~r/NihilObstat/~3/W-BuzxCsTKQ/comment-in-new-scientist-feature-on.html" title="Comment in New Scientist feature on Crowsourced SMS Spam filter" /><author><name>Jose Maria Gomez Hidalgo</name><uri>https://plus.google.com/111508951071404188069</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="//lh3.googleusercontent.com/-MHSBnZhAOjQ/AAAAAAAAAAI/AAAAAAAABek/fe6w-wiXWpM/s512-c/photo.jpg" /></author><thr:total>0</thr:total><feedburner:origLink>http://jmgomezhidalgo.blogspot.com/2011/06/comment-in-new-scientist-feature-on.html</feedburner:origLink></entry></feed>
