<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/rss2full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" version="2.0"><channel><atom:id>tag:blogger.com,1999:blog-3639231664593965268</atom:id><lastBuildDate>Thu, 01 Mar 2012 09:41:52 +0000</lastBuildDate><category>Google Maps</category><category>FP7</category><category>DSpace</category><category>Digital Libraries</category><category>Yahoo PlaceFinder</category><category>Athos Memory</category><category>Selenium</category><category>iPhone simulator</category><category>E-Learning</category><category>Data transformations</category><category>Europeana</category><category>Michelin Maps</category><category>Federated search</category><category>Music Library Lilian Voudouri</category><category>Linked Data</category><category>PDF Downloader</category><category>Wikipedia</category><category>Downloading</category><category>Agents</category><category>Z39.50</category><category>TEL-MAP</category><category>Scraping</category><category>Veria Central Public Library</category><category>Institutional repositories</category><category>Διαύγεια</category><category>Geographic data</category><category>wget</category><category>XSLT</category><category>openarchives.gr</category><category>CAQDA</category><category>Qualitative Analysis</category><category>Robots.txt</category><category>XML</category><category>Web services</category><category>Dublin Core</category><category>Social sites</category><category>Mobile devices</category><category>Forums</category><category>Mobile apps</category><category>APIs</category><category>ΥπερΔιαύγεια</category><category>dbWiz</category><category>myVisitPlanner</category><category>Tech 
Box</category><category>Ethnography</category><category>Wrappers</category><category>Netnography</category><category>Search engines</category><category>Geo-location</category><category>Open Archives</category><category>JavaScript</category><category>spynner</category><category>OAI-PMH</category><title>deixto.com/blog</title><description /><link>http://deixto.blogspot.com/</link><managingEditor>noreply@blogger.com (kntonas)</managingEditor><generator>Blogger</generator><openSearch:totalResults>20</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.feedburner.com/deixtocom/blog" /><feedburner:info xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" uri="deixtocom/blog" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-8274736727794590973</guid><pubDate>Sun, 19 Feb 2012 06:08:00 +0000</pubDate><atom:updated>2012-02-22T22:49:23.616+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Europeana</category><category domain="http://www.blogger.com/atom/ns#">Linked Data</category><category domain="http://www.blogger.com/atom/ns#">OAI-PMH</category><category domain="http://www.blogger.com/atom/ns#">Digital Libraries</category><category domain="http://www.blogger.com/atom/ns#">Dublin Core</category><title>Linked Data &amp; DEiXTo</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;As explained in a&amp;nbsp;&lt;a href="http://deixto.blogspot.com/2012/01/open-archives-digital-libraries.html" target="_blank"&gt;previous post&lt;/a&gt;,&amp;nbsp;&lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt;&amp;nbsp;can scrape the content of digital 
libraries, archives and multimedia collections lacking an &lt;a href="http://en.wikipedia.org/wiki/Application_programming_interface" target="_blank"&gt;API&lt;/a&gt;&amp;nbsp;and enable their metadata&amp;nbsp;transformation (through post-processing and&amp;nbsp;custom Perl code)&amp;nbsp;to&amp;nbsp;&lt;a href="http://dublincore.org/" target="_blank"&gt;Dublin Core&lt;/a&gt;&amp;nbsp;and subsequently in&amp;nbsp;&lt;a href="http://www.openarchives.org/pmh/" target="_blank"&gt;OAI-PMH&lt;/a&gt;&amp;nbsp;or another suitable form, e.g.&amp;nbsp;&lt;a href="http://www.europeana.eu/portal/" target="_blank"&gt;Europeana&lt;/a&gt;&amp;nbsp;Semantic Elements (&lt;a href="http://www.europeana.eu/schemas/ese/" target="_blank"&gt;ESE&lt;/a&gt;).&lt;br /&gt;
&amp;nbsp; &amp;nbsp; Meanwhile,&amp;nbsp;the Web has become a dynamic collaboration platform that allows everyone to meet, read and more importantly write. Thus, it steadily approaches the vision of &lt;a href="http://www.w3.org/People/Berners-Lee/" target="_blank"&gt;Tim Berners-Lee&lt;/a&gt; (the inventor of the World Wide Web): the &lt;a href="http://linkeddata.org/" target="_blank"&gt;Linked Data&lt;/a&gt; Web, a place where related data are linked and information is represented in a more structured and easily machine-processable way.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; &lt;a href="http://www.w3.org/DesignIssues/LinkedData.html" target="_blank"&gt;Linked Data&lt;/a&gt; refers to a set of best practices for publishing and connecting structured data on the Web. Its key technologies are &lt;a href="http://en.wikipedia.org/wiki/Uniform_resource_identifier" target="_blank"&gt;URIs&lt;/a&gt; (a generic method to identify resources on the Internet), the&amp;nbsp;&lt;a href="http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol" target="_blank"&gt;Hypertext Transfer Protocol&lt;/a&gt;&amp;nbsp;(HTTP) and &lt;a href="http://www.w3.org/TR/rdf-primer/" target="_blank"&gt;RDF&lt;/a&gt; (a data model and a general method for conceptual description of things in the real world). It is an exciting topic of interest and it's expected to make great progress in the next few years. A video that does a nice job of explaining what Linked Open Data is all about can be found here: &lt;a href="http://vimeo.com/36752317"&gt;http://vimeo.com/36752317&lt;/a&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-3MzqfPm192A/TxiME826ErI/AAAAAAAAAIA/LTi_QBTEnr0/s1600/lod-datasets_2009-07-14_cropped.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="297" src="http://4.bp.blogspot.com/-3MzqfPm192A/TxiME826ErI/AAAAAAAAAIA/LTi_QBTEnr0/s400/lod-datasets_2009-07-14_cropped.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&amp;nbsp; &amp;nbsp;&amp;nbsp;Over the last decade, the&amp;nbsp;Open Archives Initiative Protocol for Metadata Harvesting (&lt;a href="http://www.openarchives.org/pmh/" target="_blank"&gt;OAI-PMH&lt;/a&gt;)&amp;nbsp;has become the de facto standard for metadata exchange in digital libraries and it's playing an increasingly important role.&amp;nbsp;However, it has two major drawbacks: it does not make its resources accessible via dereferenceable URIs and it provides only restricted means of selective access to metadata.&amp;nbsp;Therefore, there is a strong need for&amp;nbsp;efficient&amp;nbsp;tools that would allow&amp;nbsp;metadata repositories to expose their content&amp;nbsp;according to the&amp;nbsp;&lt;span class="s1"&gt;Linked Data&lt;/span&gt;&amp;nbsp;&lt;a href="http://www.w3.org/DesignIssues/LinkedData.html" target="_blank"&gt;guidelines&lt;/a&gt;. This would make&amp;nbsp;digitized items and media objects accessible via HTTP URIs and&amp;nbsp;queryable&amp;nbsp;via the&amp;nbsp;&lt;a href="http://www.w3.org/TR/rdf-sparql-query/" target="_blank"&gt;SPARQL&lt;/a&gt;&amp;nbsp;protocol.&lt;br /&gt;
&amp;nbsp; &amp;nbsp; &lt;a href="http://www.linkedin.com/in/bernhardhaslhofer" target="_blank"&gt;Dr&amp;nbsp;Haslhofer&lt;/a&gt; has performed significant research and work towards this direction. He has&amp;nbsp;developed (among others) the &lt;a href="http://www.mediaspaces.info/tools/oai2lod/" target="_blank"&gt;OAI2LOD Server&lt;/a&gt;&amp;nbsp;based on the &lt;a href="http://sourceforge.net/projects/d2rq-map/" target="_blank"&gt;&lt;span class="s1"&gt;D2R Server&lt;/span&gt;&lt;/a&gt; implementation and wrote the &lt;a href="https://github.com/behas/ese2edm" target="_blank"&gt;ESE2EDM&lt;/a&gt;&amp;nbsp;converter, a collection of ruby scripts that can convert given&amp;nbsp;XML-based ESE&amp;nbsp;source files into the RDF-based Europeana Data Model (&lt;a href="http://pro.europeana.eu/web/guest/edm-documentation" target="_blank"&gt;EDM&lt;/a&gt;). These remarkable tools could turn out very useful for making large volumes of information Linked-Data ready, with all the advantages this brings.&lt;br /&gt;
&amp;nbsp; &amp;nbsp; Linked&amp;nbsp;Open&amp;nbsp;Data can change the computer world as we know it. So, there is a lot of potential in combining &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; with Linked Data technologies. Their blend could eventually produce an innovative and useful outcome. Many already believe&amp;nbsp;that Linked Data is the next big thing. Time will tell. Meanwhile,&amp;nbsp;&lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; could definitely help you&amp;nbsp;generate structured data in a variety of formats from unstructured HTML pages, whether&amp;nbsp;your ultimate goal is&amp;nbsp;Linked Data or not.&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-8274736727794590973?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2012/02/linked-data-deixto.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-3MzqfPm192A/TxiME826ErI/AAAAAAAAAIA/LTi_QBTEnr0/s72-c/lod-datasets_2009-07-14_cropped.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-2194704879425998273</guid><pubDate>Sat, 11 Feb 2012 07:42:00 +0000</pubDate><atom:updated>2012-02-11T14:19:33.144+02:00</atom:updated><title>DEiXTo components clarified</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;From the emails and feedback received, it seems that many people get a bit confused about the&amp;nbsp;utility&amp;nbsp;and functionality of the &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; GUI tool compared to the &lt;a href="http://www.perl.org/" target="_blank"&gt;Perl&lt;/a&gt; command line executor (CLE). 
DEiXToBot is even more confusing for quite a few users. So, let's clarify things.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The GUI tool&amp;nbsp;is freeware (available&amp;nbsp;at no cost but without any source code, at least yet) and it allows you to visually build and execute extraction rules for web pages of interest with point and click convenience. It offers you an embedded web browser and a friendly graphical interface so that you can highlight an element/ record instance as the mouse moves over it. The GUI tool is a Windows-only application that harnesses Internet Explorer's HTML parser and render engine.&amp;nbsp;&amp;nbsp;It is worth noting that it can support simple&amp;nbsp;&lt;a href="http://deixto.blogspot.com/2011/12/cooperating-deixto-agents.html" target="_blank"&gt;cooperative extraction scenarios&lt;span id="goog_1052480954"&gt;&lt;/span&gt;&lt;/a&gt;&amp;nbsp;as well as periodic, scheduled execution through batch files and the Windows Task Scheduler.&amp;nbsp;Perhaps its main drawback is that it can execute just one pattern on a page although for several cases (maybe for the majority) one and only extraction rule is enough to get the job done.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; On the other hand, the command line executor, or CLE for short, is implemented in Perl and it is freely distributed under the &lt;a href="http://www.gnu.org/licenses/gpl.html" target="_blank"&gt;GNU General Public License&lt;/a&gt;&amp;nbsp;v3, thus its source code is included. Its purpose is to execute wrapper project files (.wpf) that have previously been created with the GUI tool. It runs on a DOS prompt window or on a Linux/ Mac terminal. 
&amp;nbsp;Besides the code though, we have built two standalone executables so that you can run CLE either on a Windows or a &lt;a href="http://www.gnu.org/gnu/linux-and-gnu.html" target="_blank"&gt;GNU/Linux&lt;/a&gt;&amp;nbsp;machine&amp;nbsp;without having Perl or any prerequisite modules&amp;nbsp;installed. CLE is faster, offers more output formats and has some add&lt;span id="goog_987015250"&gt;&lt;/span&gt;&lt;span id="goog_987015251"&gt;&lt;/span&gt;&lt;a href="http://www.blogger.com/"&gt;&lt;/a&gt;itional features such as an efficient &lt;a href="http://deixto.wikispaces.com/message/view/home/32988104" target="_blank"&gt;post-processing mechanism&lt;/a&gt; and database support.&amp;nbsp;However, it shares the same shortcoming as the GUI tool: it&amp;nbsp;supports&amp;nbsp;just one pattern on a page.&amp;nbsp;Finally, it relies on DEiXToBot, a "homemade" package that facilitates the&amp;nbsp;execution of&amp;nbsp;GUI &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; generated wrappers.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp;&amp;nbsp;DEiXToBot is&amp;nbsp;the third and probably the most powerful and well-crafted software component of the DEiXTo scraping suite and it is available under the GPL v3 license. It is a Perl module based on &lt;a href="http://search.cpan.org/~kntonas/WWW-Mechanize-Sleepy-0.7/Sleepy.pm" target="_blank"&gt;WWW::Mechanize::Sleepy&lt;/a&gt;, a handy web browser Perl object, and several other CPAN modules. It allows extensive customization and tailor-made solutions since it facilitates the combination of &lt;i&gt;multiple&lt;/i&gt; extraction rules/ patterns as well as the post-processing of their results through custom code. Therefore, it can deal with complex cases and cover more advanced web scraping needs. 
But it requires programming skills in order to use it.&amp;nbsp;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The bottom line is that DEiXToBot is the essence of our long experience. The GUI tool might be more suitable for most every-day users (due to its visual convenience) but when things get&amp;nbsp;difficult or the situation requires a more&amp;nbsp;advanced&amp;nbsp;solution (e.g. scheduled or on-demand execution and coordination of multiple wrappers on a &lt;a href="http://www.gnu.org/gnu/linux-and-gnu.html" target="_blank"&gt;GNU/Linux&lt;/a&gt; server), a customized DEiXToBot-based script is your choice. You can use the GUI tool first to create the necessary patterns and then deploy a Perl script that uses them to extract structured data from the pages of the target website. So, if you are familiar with Perl, you should not find it very hard to write your first &lt;a href="http://deixto.com/" target="_blank"&gt;deixto&lt;/a&gt;-based spider/ crawler!&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-2194704879425998273?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2012/02/deixto-components-clarified.html</link><author>noreply@blogger.com (kntonas)</author><thr:total>1</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-1491882698361958417</guid><pubDate>Sat, 28 Jan 2012 00:03:00 +0000</pubDate><atom:updated>2012-01-29T22:38:59.191+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Federated search</category><category domain="http://www.blogger.com/atom/ns#">APIs</category><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">dbWiz</category><category domain="http://www.blogger.com/atom/ns#">Search engines</category><category 
domain="http://www.blogger.com/atom/ns#">Z39.50</category><title>Federated searching &amp; dbWiz</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;Nowadays, most university and college students, professors&amp;nbsp;as well as&amp;nbsp;researchers&amp;nbsp;are increasingly&amp;nbsp;seeking&amp;nbsp;information&amp;nbsp;and&amp;nbsp;finding&amp;nbsp;answers&amp;nbsp;on the open Web. Google has become the dominant search tool for&amp;nbsp;almost&amp;nbsp;everyone. Its popularity is enormous, no need to wonder or analyze why. It's just great and it returns "good enough" results really fast.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-9NP2Wae8dHk/TyKTRp5jDvI/AAAAAAAAAIg/pqsuWP9KPCA/s1600/logo3w.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="68" src="http://3.bp.blogspot.com/-9NP2Wae8dHk/TyKTRp5jDvI/AAAAAAAAAIg/pqsuWP9KPCA/s200/logo3w.png" width="200" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; However, libraries, in their effort to win some&amp;nbsp;patrons&amp;nbsp;back, have tried to offer a decent searching alternative by developing a new model: federated search engines. &lt;a href="http://en.wikipedia.org/wiki/Federated_search" target="_blank"&gt;Federated searching&lt;/a&gt; (also known as metasearch or cross searching) allows users to search simultaneously multiple web resources and&amp;nbsp;subscription-based bibliographic databases from a single interface. To achieve that, parallel processes are executed in real time and retrieve results from each separate source. 
Then, the results returned&amp;nbsp;get grouped together and presented&amp;nbsp;to the user&amp;nbsp;in a unified way.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The mechanisms used for pulling the data from the target sources are broadly two: either through an &lt;a href="http://en.wikipedia.org/wiki/Application_programming_interface" target="_blank"&gt;Application Programming Interface&lt;/a&gt; (API) or via &lt;a href="http://en.wikipedia.org/wiki/Web_scraping" target="_blank"&gt;scraping&lt;/a&gt; the native web interface/ site of each database.&amp;nbsp;The first method is undoubtedly better but very often a search API is not available. In such cases, &lt;a href="http://en.wikipedia.org/wiki/Internet_bot" target="_blank"&gt;web robots&lt;/a&gt; (or agents) come into play and capture information of interest, typically by simulating a human browsing through the target webpages.&lt;br /&gt;
&amp;nbsp; &amp;nbsp; Especially in academia, there are numerous online bibliographic databases. Some of them offer &lt;a href="http://en.wikipedia.org/wiki/Z39.50"&gt;Z39.50&lt;/a&gt;&amp;nbsp;or API access. However, a large number still do not provide protocol-based search functionality. Thus, scraping techniques should be deployed for those (unless the vendor disallows bots).&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;a href="http://1.bp.blogspot.com/-XG3GsrNthpo/TyKBF3OPyDI/AAAAAAAAAII/yCMkT0vmLDw/s1600/dbwiz.png" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em; text-align: center;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-XG3GsrNthpo/TyKBF3OPyDI/AAAAAAAAAII/yCMkT0vmLDw/s1600/dbwiz.png" /&gt;&lt;/a&gt;&amp;nbsp; &amp;nbsp;When starting my programming adventure with &lt;a href="http://www.perl.org/" target="_blank"&gt;Perl&lt;/a&gt; back in 2006, in the context of my former full-time job at the &lt;a href="http://www.lib.uom.gr/index.php?lang=utf-8" target="_blank"&gt;Library of University of Macedonia&lt;/a&gt;&amp;nbsp;(Thessaloniki,&amp;nbsp;Greece), I had the chance (and luck) to run across &lt;a href="http://researcher.sfu.ca/dbwiz" target="_blank"&gt;dbWiz&lt;/a&gt;, a remarkable &lt;a href="http://www.opensource.org/" target="_blank"&gt;open source&lt;/a&gt;, federated search tool developed by the &lt;a href="http://www.lib.sfu.ca/" target="_blank"&gt;Simon Fraser University&amp;nbsp;(SFU)&amp;nbsp;Library&lt;/a&gt;&amp;nbsp;in Canada. I was fascinated with Perl as well as dbWiz's internal design and implementation. So, this is how I met and fell in love with Perl.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; dbWiz offered a friendly and usable admin interface that allowed you to create search categories and select from a global list of resources which databases would be active and searchable. 
If you had to add a new resource though, you would have to write your own plugin (Perl knowledge and programming skills were required). Some of the dbWiz search plugins were based upon Z39.50 whereas others (the majority) relied on &lt;a href="http://en.wikipedia.org/wiki/Regular_expression" target="_blank"&gt;regular expressions&lt;/a&gt; and &lt;a href="http://search.cpan.org/~jesse/WWW-Mechanize-1.71/lib/WWW/Mechanize.pm" target="_blank"&gt;WWW::Mechanize&lt;/a&gt;&amp;nbsp;(a handy web browser Perl object).&lt;br /&gt;
&amp;nbsp; &amp;nbsp; The federated search engine developed while working&amp;nbsp;at the University of Macedonia (2006-2008)&amp;nbsp;was named "&lt;a href="http://pantou.lib.uom.gr/modperl/dbwiz2.pl" target="_blank"&gt;Pantou&lt;/a&gt;" and became a valuable everyday tool for students and professors of the University. The results of this work &lt;a href="http://www.lib.uom.gr/images/stories/pdf/dimosieuseis/federated_search.pdf" target="_blank"&gt;were presented&lt;/a&gt; at the&amp;nbsp;&lt;a href="http://libconf2007.unipi.gr/index.php?lang=en" target="_blank"&gt;16th Panhellenic Academic Libraries Conference &lt;/a&gt;(Piraeus, 1-3 October 2007). Unfortunately, its maintenance stopped at the end of 2010 due to the economic crisis and severe cuts in funding. Consequently, a few months later some of its plugins started falling apart.&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-cpRSKLd_TCk/TyMFJlWO76I/AAAAAAAAAIo/x8EVqJ60h8Q/s1600/ScreenShot_Pantou.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="302" src="http://2.bp.blogspot.com/-cpRSKLd_TCk/TyMFJlWO76I/AAAAAAAAAIo/x8EVqJ60h8Q/s400/ScreenShot_Pantou.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Generally, delving into dbWiz taught me a lot of lessons such as web development, Perl programming and &lt;a href="http://www.gnu.org/gnu/linux-and-gnu.html" target="_blank"&gt;GNU/Linux&lt;/a&gt; administration. I loved it! Meanwhile, in my effort to improve the relatively hard and tedious procedure of creating new dbWiz plugins, I put into practice an early version of GUI &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; (which was my MSc thesis being fulfilled in the same period at the &lt;a href="http://www.auth.gr/home/index_en.html" target="_blank"&gt;Aristotle University of Thessaloniki&lt;/a&gt;). The result was a &lt;a href="http://lib-code.lib.sfu.ca/projects/dbwiz/browser/trunk/DBWIZ_search/lib/DBWIZ/Search/Internet/DEiXTo.pm?rev=691" target="_blank"&gt;new Perl module&lt;/a&gt; that allowed the execution of &lt;a href="http://www.w3.org/DOM/" target="_blank"&gt;W3C DOM&lt;/a&gt;-based, XML patterns (built with the GUI DEiXTo) inside dbWiz and eliminated, at least to a large extent, the need for heavy use of regular expressions. That module, which was the first predecessor of today's DEiXToBot package,&amp;nbsp;&lt;a href="http://lib-code.lib.sfu.ca/projects/dbwiz/browser/trunk/DBWIZ_search/lib/DBWIZ/Search/Internet/DEiXTo.pm?rev=691" target="_blank"&gt;got included in the official dbWiz distribution&lt;/a&gt; after contacting the dbWiz development team in 2007. 
Unfortunately, SFU Library &lt;a href="http://lib-forums.lib.sfu.ca/viewtopic.php?f=1&amp;amp;t=329" target="_blank"&gt;ended the support&lt;/a&gt; and development of dbWiz in 2010.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Looking back, I can now say with quite a bit of certainty, that &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt;&amp;nbsp;(more than ever before) can power federated search tools and help them extend their reach to previously inaccessible resources. As far as the search engines war is concerned, Google seems to triumph but nobody can say for sure what is going to happen in the next few years to come. Time will tell..&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-1491882698361958417?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2012/01/federated-searching-dbwiz.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-9NP2Wae8dHk/TyKTRp5jDvI/AAAAAAAAAIg/pqsuWP9KPCA/s72-c/logo3w.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-7092960453956243431</guid><pubDate>Thu, 19 Jan 2012 22:02:00 +0000</pubDate><atom:updated>2012-01-28T19:06:19.669+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">DSpace</category><category domain="http://www.blogger.com/atom/ns#">Music Library Lilian Voudouri</category><category domain="http://www.blogger.com/atom/ns#">openarchives.gr</category><category domain="http://www.blogger.com/atom/ns#">Institutional repositories</category><category domain="http://www.blogger.com/atom/ns#">OAI-PMH</category><category domain="http://www.blogger.com/atom/ns#">Digital Libraries</category><category 
domain="http://www.blogger.com/atom/ns#">Dublin Core</category><category domain="http://www.blogger.com/atom/ns#">Data transformations</category><category domain="http://www.blogger.com/atom/ns#">Open Archives</category><title>Open Archives &amp; Digital Libraries</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;&lt;a href="http://4.bp.blogspot.com/-JBOZ8oDt4zg/TxcmN7n9cvI/AAAAAAAAAH4/XZ6vXiRbaqM/s1600/OA100.gif" imageanchor="1" style="clear: right; float: right; margin-bottom: 1em; margin-left: 1em;"&gt;&lt;br /&gt;
&lt;img border="0" src="http://4.bp.blogspot.com/-JBOZ8oDt4zg/TxcmN7n9cvI/AAAAAAAAAH4/XZ6vXiRbaqM/s1600/OA100.gif" /&gt;&lt;/a&gt;The &lt;a href="http://www.openarchives.org/" target="_blank"&gt;Open Archives Initiative&lt;/a&gt; (OAI) develops and promotes interoperability standards that aim to facilitate the efficient dissemination of content. OAI has its roots in the open access and &lt;a href="http://en.wikipedia.org/wiki/Institutional_repository" target="_blank"&gt;institutional repository&lt;/a&gt; movements and its cornerstone is the&amp;nbsp;Protocol for Metadata Harvesting (&lt;a href="http://www.openarchives.org/OAI/openarchivesprotocol.html" target="_blank"&gt;OAI-PMH&lt;/a&gt;) which allows data providers/ repositories to expose their content in a structured format. A client then can make OAI-PMH service requests to harvest that metadata through HTTP.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; &lt;a href="http://openarchives.gr/"&gt;openarchives.gr&lt;/a&gt; is a great federated search engine harvesting &lt;i&gt;57 &lt;/i&gt;Greek digital libraries and institutional repositories (as of January 2012). It currently provides access to almost half a million(!) documents (mainly undergraduate theses and Master/ PhD dissertations) and its index gets updated on a daily basis. It&amp;nbsp;began its operation back in 2006 after being designed and implemented by&amp;nbsp;&lt;a href="http://vbanos.gr/" target="_blank"&gt;Vangelis Banos&lt;/a&gt;&amp;nbsp;but&amp;nbsp;since May 2011 it is being hosted, managed and co-developed by the &lt;a href="http://www.ekt.gr/" target="_blank"&gt;National Documentation Centre&lt;/a&gt; (EKT). 
What makes this amazing searching tool even more remarkable is the fact that it is entirely built on &lt;a href="http://www.opensource.org/" target="_blank"&gt;open source&lt;/a&gt;/ &lt;a href="http://www.gnu.org/philosophy/free-sw.html" target="_blank"&gt;free software&lt;/a&gt;.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;a href="http://3.bp.blogspot.com/-SUPy8mlo9jU/TxccCIC7rYI/AAAAAAAAAHw/OJ5usbcHbrk/s1600/logo_en.png" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-SUPy8mlo9jU/TxccCIC7rYI/AAAAAAAAAHw/OJ5usbcHbrk/s1600/logo_en.png" /&gt;&lt;/a&gt;&amp;nbsp; &amp;nbsp; A tricky point that needs some clarification is that when a user searches &lt;a href="http://openarchives.gr/"&gt;openarchives.gr&lt;/a&gt;, the search is not submitted in real time to the target sources. Instead, it is performed locally on the&amp;nbsp;openarchives.gr server&amp;nbsp;where full copies of the repositories/ libraries are stored (and updated at regular time intervals).&lt;br /&gt;
&amp;nbsp; &amp;nbsp; The majority of the sources searched by openarchives.gr are OAI-PMH compliant repositories (such as &lt;a href="http://www.dspace.org/" target="_blank"&gt;DSpace&lt;/a&gt; or &lt;a href="http://www.eprints.org/" target="_blank"&gt;EPrints&lt;/a&gt;). Therefore, their data are periodically retrieved via their OAI-PMH endpoint. However, it is worth mentioning that non OAI-PMH digital libraries have also been included in its database. This was made possible through scraping their websites with &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; and&amp;nbsp;transforming&amp;nbsp;their&amp;nbsp;metadata&amp;nbsp;into&amp;nbsp;&lt;a href="http://dublincore.org/" target="_blank"&gt;Dublin Core&lt;/a&gt;. So, more than &lt;i&gt;16.000&lt;/i&gt; records from &lt;i&gt;6&lt;/i&gt; significant online digital libraries (such as the&amp;nbsp;&lt;a href="http://www.lykeionellinidon.gr/lyceumportal/" target="_blank"&gt;Lyceum Club of Greek Women&lt;/a&gt;&amp;nbsp;and the&amp;nbsp;&lt;a href="http://digma.mmb.org.gr/Default.aspx" target="_blank"&gt;Music Library&lt;/a&gt;&amp;nbsp;of Greece “Lilian Voudouri”) were inserted in openarchives.gr with the use of DEiXTo wrappers and custom Perl code.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Finally, it is known that digital collections have flourished over the last few years and&amp;nbsp;enjoy growing popularity. However, most of them do NOT provide their contents in OAI-PMH or another appropriate metadata format. Actually, many of them (especially legacy systems) do NOT even offer an &lt;a href="http://en.wikipedia.org/wiki/Application_programming_interface" target="_blank"&gt;API&lt;/a&gt; or an &lt;a href="http://en.wikipedia.org/wiki/Search/Retrieve_Web_Service" target="_blank"&gt;SRW/U&lt;/a&gt; interface. 
Consequently, we believe that there is much room for &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; to help cultural and educational organizations (e.g., museums, archives, libraries and multimedia collections) to export, present and&amp;nbsp;distribute&amp;nbsp;their&amp;nbsp;digitized&amp;nbsp;items and rich content to the outside world, in an efficient and structured way, through scraping and repurposing their data.&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-7092960453956243431?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2012/01/open-archives-digital-libraries.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-JBOZ8oDt4zg/TxcmN7n9cvI/AAAAAAAAAH4/XZ6vXiRbaqM/s72-c/OA100.gif" height="72" width="72" /><thr:total>2</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-6296717665748183658</guid><pubDate>Tue, 17 Jan 2012 13:05:00 +0000</pubDate><atom:updated>2012-01-28T18:14:53.959+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">Forums</category><category domain="http://www.blogger.com/atom/ns#">Netnography</category><category domain="http://www.blogger.com/atom/ns#">Qualitative Analysis</category><category domain="http://www.blogger.com/atom/ns#">Social sites</category><category domain="http://www.blogger.com/atom/ns#">Ethnography</category><category domain="http://www.blogger.com/atom/ns#">CAQDA</category><title>Netnography &amp; Scraping</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;&lt;a href="http://en.wikipedia.org/wiki/Netnography" 
target="_blank"&gt;Netnography&lt;/a&gt;&amp;nbsp;or digital ethnography, is (or should be) the correct translation of ethnographic methods to online environments such as bulletin boards and social sites. It is more or less doing the same thing that ethnographers do in actual places like squares, pubs, clubs, etc:&amp;nbsp;observe what people say and do, and try to participate as much as possible in order to better understand what's involved in action and discourses. Using ethnography&amp;nbsp;may answer a lot of what, when, who and how questions defining several everyday problems. However,&amp;nbsp;netnography&amp;nbsp;differs in many ways compared to ethnography; especially in the fashion it is conducted.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Forums, Wikis as well as the blogosphere are good online equivalents of public squares and pubs. There are no physical identities, but online&amp;nbsp;ones; there are no faces, but avatars; there is no gender, age or&amp;nbsp;any reliable info about physical identities, but there are voices&amp;nbsp;discussing and arguing about topics of common interest.&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-G4XEUIdlsVI/TxVgZWVXFJI/AAAAAAAAAHo/jrprte9TgGQ/s1600/soc-icons.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="86" src="http://2.bp.blogspot.com/-G4XEUIdlsVI/TxVgZWVXFJI/AAAAAAAAAHo/jrprte9TgGQ/s400/soc-icons.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The more popular a forum is, the more difficult it gets to follow it&amp;nbsp;netnographically. A netnographer has to use a Computer Assisted Qualitative Data Analysis (&lt;a href="http://en.wikipedia.org/wiki/Computer_assisted_qualitative_data_analysis_software" target="_blank"&gt;CAQDA&lt;/a&gt;) tool (such as &lt;a href="http://rqda.r-forge.r-project.org/" target="_blank"&gt;RQDA&lt;/a&gt;) on&amp;nbsp;certain parts of the texts collected during his&amp;nbsp;research. In a forum use case, these texts would be posts and threads.&amp;nbsp;If the researcher had to browse the forum and manually copy and paste its content, a huge amount of effort would be required. However, this obstacle could be overcome through scraping the forum with a web data extraction tool such as &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt;.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; A scraped forum is a jewel: perfectly ordered textual data corresponding to each thread, ready for further analysis. So, this is where DEiXTo comes into play and may boost the research process&amp;nbsp;significantly. 
To our knowledge,&amp;nbsp;&lt;a href="http://www.linkedin.com/in/jlchulilla" target="_blank"&gt;Dr&amp;nbsp;Juan Luis Chulilla Cano&lt;/a&gt;, CEO of &lt;a href="http://www.onlineandoffline.net/" target="_blank"&gt;Online and Offline Ltd&lt;/a&gt;., has been successfully&amp;nbsp;utilizing&amp;nbsp;scraping techniques so as to capture the threads of popular Spanish forums (and their metadata) and transform them into a structured format, suitable for&amp;nbsp;post-processing. Typically, such sites have a common presentation style for their threads and offer rich metadata. Thus, they are potential goldmines upon which various methodologies can be tested and applied so as to discover knowledge and trends and draw useful conclusions.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Finally, netnography and anthropology seem to be gaining momentum over the last few years. They are really interesting as well as challenging fields and scraping could evolve to an important ally. It is worth mentioning that quite a few IT vendors and firms employ ethnographers for R&amp;amp;D and testing of new products. Therefore, there is a lot of potential in using computer aided techniques in the context of&amp;nbsp;netnography. So, if you are coming from social sciences&amp;nbsp;and creating wrappers/ extraction rules is not your second nature, why don't you &lt;a href="http://deixto.com/contact.php" target="_blank"&gt;drop us an email&lt;/a&gt;? Perhaps we could help you gather quite a few tons of usable data with DEiXTo! 
&lt;i&gt;Unless terms of use or copyright restrictions forbid it..&lt;/i&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-6296717665748183658?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2012/01/netnography-scraping.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-G4XEUIdlsVI/TxVgZWVXFJI/AAAAAAAAAHo/jrprte9TgGQ/s72-c/soc-icons.png" height="72" width="72" /><thr:total>1</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-6589159885559155952</guid><pubDate>Thu, 12 Jan 2012 22:03:00 +0000</pubDate><atom:updated>2012-01-28T18:41:44.050+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Geographic data</category><category domain="http://www.blogger.com/atom/ns#">Geo-location</category><category domain="http://www.blogger.com/atom/ns#">Yahoo PlaceFinder</category><category domain="http://www.blogger.com/atom/ns#">Web services</category><category domain="http://www.blogger.com/atom/ns#">Google Maps</category><title>Geo-location data, Yahoo! 
PlaceFinder &amp; Google Maps API</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div class="p1" style="text-align: justify;"&gt;Location-aware applications have known huge success over the last few years and geographic data have been used extensively in a wide variety of ways.&amp;nbsp;Meanwhile,&amp;nbsp;there are numerous places of interest out there, such as&amp;nbsp;shopping malls, airports, restaurants, museums, transit stations and for most of them their addresses are publicly available on the Web.&amp;nbsp;Therefore, you could use &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt;&amp;nbsp;(or a web data extraction tool of your choice) in order to scrape the desired location information for any points of interest and then postprocess it so as to produce geographic data for further use.&lt;/div&gt;&lt;div class="p1"&gt;&lt;a href="http://3.bp.blogspot.com/-p8TFGvAqodY/Tw4K8JV_B9I/AAAAAAAAAGc/OLsfdL69Bfc/s1600/yahoo.png" imageanchor="1" style="clear: right; float: right; margin-bottom: 1em; margin-left: 1em;"&gt;&lt;img border="0" height="40" src="http://3.bp.blogspot.com/-p8TFGvAqodY/Tw4K8JV_B9I/AAAAAAAAAGc/OLsfdL69Bfc/s200/yahoo.png" width="200" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp;&amp;nbsp;&lt;a href="http://developer.yahoo.com/geo/placefinder/" target="_blank"&gt;Yahoo! PlaceFinder&lt;/a&gt; is a great web service that supports world-wide geocoding of street addresses and place names. It allows developers to convert addresses and places into geographic coordinates (and vice versa). Thus, you can send an HTTP request with a street address to it and get the latitude and longitude back! It's amazing how well it works. 
Of course, the more complete and detailed the address, the more precise the coordinates returned.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;div style="text-align: justify;"&gt;&lt;span style="text-align: left;"&gt;&amp;nbsp; &amp;nbsp; In the context of this post, we thought it would be nice, mostly for demonstration purposes, to build a map of&amp;nbsp;&lt;/span&gt;&lt;a href="http://en.wikipedia.org/wiki/Thessaloniki" style="text-align: left;" target="_blank"&gt;Thessaloniki&lt;/a&gt;&lt;span style="text-align: left;"&gt;&amp;nbsp;museums using the&amp;nbsp;&lt;a href="http://code.google.com/apis/maps/documentation/javascript/" target="_blank"&gt;Google Maps API&lt;/a&gt;&amp;nbsp;and geo-location data generated with&amp;nbsp;&lt;/span&gt;Yahoo! PlaceFinder&lt;span style="text-align: left;"&gt;. The source of data for our demo was&amp;nbsp;&lt;/span&gt;&lt;a href="http://odysseus.culture.gr/index_en.html" style="text-align: left;" target="_blank"&gt;Odysseus&lt;/a&gt;&lt;span style="text-align: left;"&gt;, the WWW server of the Hellenic Ministry of Culture that provides a full list of Greek museums, monuments and&amp;nbsp;archaeological&amp;nbsp;sites.&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-cwodswb2NJc/Tw4NXEZQPZI/AAAAAAAAAGk/jwmqy6ux-_Y/s1600/odysseus.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="226" src="http://3.bp.blogspot.com/-cwodswb2NJc/Tw4NXEZQPZI/AAAAAAAAAGk/jwmqy6ux-_Y/s400/odysseus.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; So, we&amp;nbsp;searched for museums&amp;nbsp;located in the city of Thessaloniki (&lt;span style="text-align: left;"&gt;the second-largest city in Greece and the capital of the region of Central Macedonia)&lt;/span&gt;&lt;span style="text-align: 
left;"&gt;&amp;nbsp;&lt;/span&gt;and&amp;nbsp;extracted&amp;nbsp;through&amp;nbsp;&lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt;&amp;nbsp;the street addresses&amp;nbsp;of the ten results returned. In the picture below you can see a sample screenshot from the "INFORMATION" section of the &lt;a href="http://www.lemmth.gr/c/portal_public/layout?p_l_id=1.2&amp;amp;setlanguage=en_US" target="_blank"&gt;Folk Art and Ethnological Museum of Macedonia and Thrace&lt;/a&gt;&amp;nbsp;Odysseus&amp;nbsp;&lt;a href="http://odysseus.culture.gr/h/1/eh155.jsp?obj_id=3273" target="_blank"&gt;detailed webpage&lt;/a&gt;&amp;nbsp;(from which the address of this specific museum was scraped):&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/--fSlOunxu3g/Tw85BFJ47JI/AAAAAAAAAHU/g63xtnfLpaA/s1600/lemm_odysseus.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="148" src="http://2.bp.blogspot.com/--fSlOunxu3g/Tw85BFJ47JI/AAAAAAAAAHU/g63xtnfLpaA/s400/lemm_odysseus.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;span style="text-align: left;"&gt;&amp;nbsp; &amp;nbsp; After capturing the name and location of each museum and exporting them to a simple tab delimited&amp;nbsp;&lt;/span&gt;&lt;span style="text-align: left;"&gt;text&amp;nbsp;&lt;/span&gt;&lt;span style="text-align: left;"&gt;file, we wrote a Perl script harnessing the&amp;nbsp;&lt;/span&gt;&lt;span style="text-align: left;"&gt;&lt;a href="http://search.cpan.org/~gray/Geo-Coder-PlaceFinder-0.05/lib/Geo/Coder/PlaceFinder.pm" target="_blank"&gt;Geo::Coder::PlaceFinder&lt;/a&gt;&amp;nbsp;CPAN module in order to automatically find their geo-location coordinates and create an XML output file containing all the necessary information (through &lt;a href="http://search.cpan.org/~josephw/XML-Writer-0.614/Writer.pm" target="_blank"&gt;XML::Writer&lt;/a&gt;). Part of this XML document is displayed right below:&lt;/span&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-AhOT0aCmUwQ/Tw8qEo8TDZI/AAAAAAAAAHE/wMyLry5C1YI/s1600/xml_museums.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="180" src="http://2.bp.blogspot.com/-AhOT0aCmUwQ/Tw8qEo8TDZI/AAAAAAAAAHE/wMyLry5C1YI/s400/xml_museums.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;span style="text-align: left;"&gt;&amp;nbsp; &amp;nbsp; After having all the metadata we needed in this XML file, we utilized the &lt;a href="http://code.google.com/apis/maps/documentation/javascript/" target="_blank"&gt;Google Maps JavaScript API v3&lt;/a&gt; and created a&amp;nbsp;&lt;a href="http://deixto.com/thessaloniki_museums_map.html" target="_blank"&gt;map&lt;/a&gt;&amp;nbsp;(centered on Thessaloniki)&amp;nbsp;displaying&amp;nbsp;all city museums! To accomplish that goal, we followed the helpful guidelines given in this &lt;a href="http://www.svennerberg.com/2009/07/google-maps-api-3-markers/" target="_blank"&gt;very informative post&lt;/a&gt;&amp;nbsp;about Google Maps markers and wrote a short script that parsed the XML contents (via &lt;a href="http://search.cpan.org/~shlomif/XML-LibXML-1.90/LibXML.pod" target="_blank"&gt;XML::LibXML&lt;/a&gt;) and produced a web page with the desired Google Map object embedded (including markers for each museum). Finally, t&lt;/span&gt;&lt;span style="text-align: left;"&gt;he&amp;nbsp;&lt;a href="http://deixto.com/thessaloniki_museums_map.html" target="_blank"&gt;end result&lt;/a&gt;&amp;nbsp;was pretty satisfying (after some extra manual effort to be absolutely honest):&lt;/span&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-peWujv1efr8/Tw8siES41nI/AAAAAAAAAHM/ujHAJZ-KwPw/s1600/Google_map_thessaloniki_museums.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://1.bp.blogspot.com/-peWujv1efr8/Tw8siES41nI/AAAAAAAAAHM/ujHAJZ-KwPw/s400/Google_map_thessaloniki_museums.png" width="388" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;span style="text-align: left;"&gt;&amp;nbsp; &amp;nbsp; This is kind of cool, isn't it? Of course, the same procedure could be applied on a larger scale (e.g. for creating a map of Greece with ALL museums and/or monuments available) or expanded to other points of interest (whatever you can imagine, from schools and educational institutions to cinemas, supermarkets, shops or bank ATMs). In conclusion, we think that the combination of &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; with other powerful tools and technologies can sometimes yield&amp;nbsp;an innovative and hopefully useful outcome. 
Since you have the raw web data at your disposal (captured with DEiXTo), your imagination (and perhaps &lt;a href="http://deixto.blogspot.com/2011/12/robotstxt-access-restrictions.html" target="_blank"&gt;copyright restrictions&lt;/a&gt;) is the only limit!&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-6589159885559155952?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2012/01/geo-location-data-yahoo-placefinder.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-p8TFGvAqodY/Tw4K8JV_B9I/AAAAAAAAAGc/OLsfdL69Bfc/s72-c/yahoo.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-580092648260829667</guid><pubDate>Wed, 04 Jan 2012 07:57:00 +0000</pubDate><atom:updated>2012-01-28T18:57:32.721+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">Geo-location</category><category domain="http://www.blogger.com/atom/ns#">Wikipedia</category><category domain="http://www.blogger.com/atom/ns#">Michelin Maps</category><category domain="http://www.blogger.com/atom/ns#">Wrappers</category><title>DEiXTo powers Michelin Maps and Guides!</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;One of the biggest success stories of &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; is that it was used a few months ago by the&amp;nbsp;&lt;a href="http://www.michelin.co.uk/travel" target="_blank"&gt;Maps and Guides&lt;/a&gt;&amp;nbsp;UK&amp;nbsp;division of Michelin in order to build a &lt;a href="http://www.michelinonline.co.uk/travel/france-coverage.htm" 
target="_blank"&gt;France gazetteer&lt;/a&gt; web application.&amp;nbsp;If you are going on holiday to France, probably you will need hotel and restaurant guides, maps, atlases and tourist guides relevant to where you are staying or the places you will visit. So, the free online Michelin database can help you find out which ones are for you.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The contribution of DEiXTo in the context of the implementation of this useful service was that it scraped &amp;nbsp;from&amp;nbsp;Wikipedia&amp;nbsp;geo-location data as well as other metadata fields for 36.000+ French communes. In France the smallest administrative region is the commune and Wikipedia happened to have all of this relevant information freely available!&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-d8GPQj-QLAA/TvmkphCGwnI/AAAAAAAAAFg/18S_NcpioEA/s1600/michelin_maps_and_guides.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="270" src="http://3.bp.blogspot.com/-d8GPQj-QLAA/TvmkphCGwnI/AAAAAAAAAFg/18S_NcpioEA/s640/michelin_maps_and_guides.png" width="570" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The &lt;a href="http://en.wikipedia.org/wiki/Lists_of_communes_of_France" target="_blank"&gt;starting target page&lt;/a&gt; contained a list of 95 (or so) departments, each of which containing a large number of communes.&amp;nbsp;Thus, every department detailed page would in turn list all its communes and their corresponding hyperlinks/ URLs. A sample department page looks like &lt;a href="http://en.wikipedia.org/wiki/Communes_of_the_Ain_department" target="_blank"&gt;this&lt;/a&gt;. And last, at a level below, we have the actual pages of interest with all the details needed about each commune. 
You can see a sample commune Wikipedia page by clicking&amp;nbsp;&lt;a href="http://en.wikipedia.org/wiki/L%27Abergement-Cl%C3%A9menciat" target="_blank"&gt;here&lt;/a&gt;&amp;nbsp;and a screenshot from it at the picture below. Meanwhile, this "scenario" also serves as a good example of collaborative wrappers where the output of a wrapper (a txt file with URLs) gets passed as input to a second one.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-DXditPGSsmQ/TvmnoWq3v7I/AAAAAAAAAFs/mLzv5DBYNbo/s1600/sample_commune.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="300" src="http://4.bp.blogspot.com/-DXditPGSsmQ/TvmnoWq3v7I/AAAAAAAAAFs/mLzv5DBYNbo/s400/sample_commune.png" width="177" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; It should be noted though that there were slight variations in the layout and structure of the target pages. However, the algorithm &lt;a href="http://deixto.com/" target="_blank"&gt;DEiXTo&lt;/a&gt; uses is quite efficient and robust and usually can deal with such cases. To be more specific, the scraper that was deployed, extracted from&amp;nbsp;each commune page the following metadata: region, department, arrondissement, canton and importantly the latitude and longitude.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The precision and recall that DEiXTo achieved with these commune pages was amazing (very close to 100%) and as a result the database was finally enriched with the large volumes of information captured. We are really happy that Michelin was able to&amp;nbsp;successfully&amp;nbsp;utilize DEiXTo and create a free and useful online service. So, if you plan a trip to France, you know where to find an informative online map/ guide! 
:)&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-580092648260829667?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/deixto-powers-michelin-maps-and-guides.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-d8GPQj-QLAA/TvmkphCGwnI/AAAAAAAAAFg/18S_NcpioEA/s72-c/michelin_maps_and_guides.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-7308533962503699448</guid><pubDate>Mon, 02 Jan 2012 12:15:00 +0000</pubDate><atom:updated>2012-01-28T18:11:34.748+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Agents</category><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">Wrappers</category><title>Cooperating DEiXTo agents</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;Basically there are two major, broad categories of cooperating &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; wrappers. In the first one, the wrappers are executed and applied on the same, single page so as to capture bits of interest that are scattered all over this particular target page. On the other hand, the second category comprises cases where the output of a wrapper serves as input for a second one. For the latter, typically the output of the first wrapper is a txt file containing the target URLs leading to pages with detailed information.&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-eCBcxNJooAc/TvroeQwCd4I/AAAAAAAAAGU/vWU78D6fv6w/s1600/deixto-agents.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="90" src="http://2.bp.blogspot.com/-eCBcxNJooAc/TvroeQwCd4I/AAAAAAAAAGU/vWU78D6fv6w/s400/deixto-agents.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The first category is &lt;i&gt;not supported directly&lt;/i&gt; by the GUI tool. However, DEiXToBot&amp;nbsp;&amp;nbsp;&lt;span class="Apple-style-span" style="text-align: left;"&gt;(a &lt;a href="http://search.cpan.org/~kntonas/WWW-Mechanize-Sleepy-0.7/Sleepy.pm"&gt;Mechanize&lt;/a&gt; agent object capable of executing extraction rules&amp;nbsp;previously&amp;nbsp;built with the GUI tool)&amp;nbsp;&lt;/span&gt;allows the combination of multiple extraction rules/ patterns on the same page and their results through Perl code. So, if you have come across a complex, data-rich page and you are fluent with Perl and DEiXToBot's interface, you can build the necessary tree patterns separately with the GUI tool and then write a highly efficient set of cooperating Perl robots aiming at capturing all the desired data. It is not easy though since it requires programming skills and custom code.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; As far as the second type of collaboration is concerned, we have&amp;nbsp;stumbled&amp;nbsp;upon numerous cases where a first wrapper collects the detailed target URLs from listing pages and passes them to a second wrapper which in turn takes over and gathers all data of interest from the pages containing the full text/ description. 
A typical case would be a blog or a news site or an e-shop, where a first agent could scrape the URLs of the detailed pages and a second one would visit each one of them extracting every single piece of desired information. If you wonder how you can set a DEiXTo wrapper to visit multiple target pages, this can be done either through a text file containing their addresses or via a list. Both ways can be specified in the Project Info tab of the DEiXTo GUI tool.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;/div&gt;&lt;div&gt;&lt;div class="separator" style="clear: both; font-family: arial, helvetica, sans-serif; line-height: 19px; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-4eAiAG6A0J0/TvbTKmr_xZI/AAAAAAAAAFU/7GRQDIrV7PY/s1600/targets_box.png" imageanchor="1" style="background-color: white; margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="189" src="http://2.bp.blogspot.com/-4eAiAG6A0J0/TvbTKmr_xZI/AAAAAAAAAFU/7GRQDIrV7PY/s200/targets_box.png" width="200" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: justify;"&gt;&lt;span class="Apple-style-span" style="font-family: arial, helvetica, sans-serif;"&gt;&lt;span class="Apple-style-span" style="line-height: 19px;"&gt;&amp;nbsp; &amp;nbsp;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;Moreover, for the first wrapper which is intended to scrape&amp;nbsp;the URLs, you only have to create a pattern that locates the links towards the detailed pages. Usually this is easy and straightforward. You should just point at a representative link,&amp;nbsp;use it as a record instance and set the &lt;i style="text-align: left;"&gt;A&lt;/i&gt; rule node as "checked" (right click on the &lt;i style="text-align: left;"&gt;A&lt;/i&gt;&lt;span class="Apple-style-span" style="text-align: left;"&gt; node and select "Match and Extract Content"). 
The resulting pattern will be something like this:&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-W1ViTnb3DGY/TvqpLiy_9NI/AAAAAAAAAF4/zwz81d-OSBg/s1600/sample_A_pattern.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/-W1ViTnb3DGY/TvqpLiy_9NI/AAAAAAAAAF4/zwz81d-OSBg/s1600/sample_A_pattern.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Then, via executing the rule you can extract the "&lt;i&gt;href&lt;/i&gt;" attribute (essentially the URI) of each matching link and export the results to a txt file, say target_urls.txt, which subsequently will be fed to the next wrapper.&amp;nbsp;Please note that if you provide just the &lt;i&gt;A&lt;/i&gt; rule node as a pattern, you will capture ALL the hyperlinks found on the page but we guess you don't want that (we want &lt;i&gt;only&lt;/i&gt; those leading to the detailed pages).&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp;In conclusion, &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; can power schemes of&amp;nbsp;cooperative robots and achieve very high precision. Especially for more advanced cases, synergies of multiple wrappers are always needed. Their coordination though usually needs some careful&amp;nbsp;thought&amp;nbsp;and effort. Should you have any questions, please do not hesitate to &lt;a href="http://deixto.com/contact.php"&gt;contact us&lt;/a&gt;!&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-7308533962503699448?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/cooperating-deixto-agents.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-eCBcxNJooAc/TvroeQwCd4I/AAAAAAAAAGU/vWU78D6fv6w/s72-c/deixto-agents.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-5272203444742284637</guid><pubDate>Tue, 27 Dec 2011 22:33:00 +0000</pubDate><atom:updated>2012-01-28T18:04:29.733+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Agents</category><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">Robots.txt</category><title>Robots.txt &amp; access restrictions</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;A really serious matter that often many people ignore (deliberately or not) is the access and copyright restrictions that several website owners/ administrators impose. 
A lot of websites want robots&amp;nbsp;entirely&amp;nbsp;out.&amp;nbsp;The method they use to keep &lt;b&gt;&lt;i&gt;cooperating&lt;/i&gt;&lt;/b&gt; web robots out of certain site content&amp;nbsp;is a &lt;a href="http://www.robotstxt.org/"&gt;robots.txt&lt;/a&gt; file that resides in their root directory and functions as a request that&amp;nbsp;visiting bots ignore specified files or directories.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-7HJiEy3-dgg/TvWs1GqKMHI/AAAAAAAAAE8/zaZjqYwSjAY/s1600/robotstxtwrap.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-7HJiEy3-dgg/TvWs1GqKMHI/AAAAAAAAAE8/zaZjqYwSjAY/s1600/robotstxtwrap.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;For example, the next 2 lines indicate that robots should not visit any pages on the site:&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;User-agent: *&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;Disallow: /&lt;/span&gt;&lt;br /&gt;
&amp;nbsp; &amp;nbsp; However, a large number of scraping agents violate the restrictions set by the vendors and content providers. This is a very important issue and it raises significant legal concerns.&amp;nbsp;Undoubtedly, there has been an ongoing raging war between bots and websites with strict terms of use. The latter deploy various technical measures to stop robots (an excellent white paper about detecting and blocking site scraping attacks is &lt;a href="http://www.imperva.com/docs/WP_Detecting_and_Blocking_Site_Scraping_Attacks.pdf" target="_blank"&gt;here&lt;/a&gt;) and sometimes even take legal action and resort to courts. There have been many cases over the last years with contradictory decisions. So, the whole issue is quite unclear.&lt;span class="Apple-style-span" style="text-align: left;"&gt;&amp;nbsp;&lt;/span&gt;You can read more about it in the relevant section of the &lt;a href="http://en.wikipedia.org/wiki/Web_scraping#Legal_issues"&gt;"Web scraping"&lt;/a&gt; Wikipedia article.&amp;nbsp;Both sides have their arguments, so it's not at all an easy verdict.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; The &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; command line executor by default respects the robots.txt file of potential target websites (through the use of the&amp;nbsp;&lt;a href="http://search.cpan.org/~gaas/WWW-RobotRules-6.01/lib/WWW/RobotRules.pm"&gt;WWW::RobotRules&lt;/a&gt;&amp;nbsp;Perl module). Nevertheless, you can override this configuration (at your own risk!) by setting the -nice parameter to 0. 
It is strongly&amp;nbsp;recommended&amp;nbsp;though that you comply with&amp;nbsp;webmasters' requests and keep out of pages that have access restrictions.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Generally speaking, data &lt;a href="http://en.wikipedia.org/wiki/Copyright"&gt;copyright&lt;/a&gt; is a &lt;b&gt;&lt;i&gt;HUGE&lt;/i&gt;&lt;/b&gt; issue, especially in today's Web 2.0 era, and has sparked endless discussions and spawned numerous articles, opinions, licenses, disputes and legitimacy issues.&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-_DXPbX5G4q0/Tvq2dzNY6dI/AAAAAAAAAGI/q95DrD9TYRg/s1600/150px-Copyright.svg.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="60" src="http://4.bp.blogspot.com/-_DXPbX5G4q0/Tvq2dzNY6dI/AAAAAAAAAGI/q95DrD9TYRg/s200/150px-Copyright.svg.png" width="60" /&gt;&lt;/a&gt;&lt;/div&gt;&amp;nbsp; &amp;nbsp; By the way, it is worth mentioning that currently there is a strong movement in favor of openness in data, standards and software. And according to many, openness fosters innovation and promotes&amp;nbsp;transparency&amp;nbsp;and&amp;nbsp;collaboration.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&amp;nbsp; &amp;nbsp; Finally, we would like to advise everyone using&amp;nbsp;web data extraction tools to comply with the terms of use that the websites set and think twice before deploying a scraper, especially if the data is going to be used for commercial purposes. A good practice is to contact the webmaster and ask for permission to access and use their content. 
Quite a few times the website might be interested in such a cooperation mostly for marketing and advertising reasons.&amp;nbsp;So, as soon as you get a "green light", start building your scraper with &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt;&amp;nbsp;and we are here to help you!&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-5272203444742284637?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/robotstxt-access-restrictions.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-7HJiEy3-dgg/TvWs1GqKMHI/AAAAAAAAAE8/zaZjqYwSjAY/s72-c/robotstxtwrap.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-2863751430269207301</guid><pubDate>Sun, 25 Dec 2011 09:34:00 +0000</pubDate><atom:updated>2012-01-28T18:36:13.126+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">wget</category><category domain="http://www.blogger.com/atom/ns#">Downloading</category><title>Downloading images with DEiXTo and wget</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span class="Apple-style-span" style="background-color: white;"&gt;&lt;span class="Apple-style-span" style="font-family: arial, helvetica, sans-serif;"&gt;&lt;span class="Apple-style-span" style="line-height: 19px;"&gt;Many people often download pictures and photos from various websites of interest. Sometimes though, the number of images that someone wants to download from certain pages is large. So large that doing it manually is almost&amp;nbsp;prohibitive.&amp;nbsp;Therefore, an automation tool is often needed to save users time and repetitive effort. Of course, towards this goal, &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; can help.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;&amp;nbsp; &amp;nbsp; Let's suppose that you want to get all images from a specific web page (with respect to terms of use). You can easily build a simple extraction rule by pointing at an image, using it as a record instance and setting the IMG rule node as "checked" (right click on the IMG node and select "Match and Extract Content"). The resulting pattern will be like this:&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-5MdHMGgZHZw/TvbQgqfD8dI/AAAAAAAAAFI/mvPdP7JieKw/s1600/img_pattern.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em; text-align: center;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-5MdHMGgZHZw/TvbQgqfD8dI/AAAAAAAAAFI/mvPdP7JieKw/s1600/img_pattern.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;&amp;nbsp; &amp;nbsp; Then, by executing the rule, you can extract the "src" attribute (essentially the URI) of each image found on the page and export the results to a txt file, let's say image_urls.txt. And last, you can use&amp;nbsp;&lt;/span&gt;&lt;span class="il" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;&lt;a href="http://www.gnu.org/software/wget/"&gt;GNU Wget&lt;/a&gt;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;&amp;nbsp;(a great free command line tool) in order to retrieve th&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;e files. You can download a Windows (win32) version of wget&amp;nbsp;&lt;/span&gt;&lt;a href="http://users.ugent.be/~bpuype/wget/" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;here&lt;/a&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;.&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;For example, on Windows you can then just open a DOS command prompt window, change the current working directory to the folder where wget is stored (via the 'cd' command) and enter:&lt;/span&gt;&lt;br /&gt;
&lt;strong style="background-color: white; line-height: 19px; text-align: -webkit-auto;"&gt;&lt;strong&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;&lt;span class="il"&gt;wget&lt;/span&gt;.exe -i image_urls.txt&amp;nbsp;&lt;/span&gt;&lt;/strong&gt;&lt;/strong&gt;&lt;br /&gt;
&lt;strong style="background-color: white; line-height: 19px; text-align: -webkit-auto;"&gt;&lt;span class="Apple-style-span" style="font-family: arial, helvetica, sans-serif; font-weight: normal;"&gt;where image_urls.txt is the file containing the URIs of images.&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: arial, helvetica, sans-serif; font-weight: normal;"&gt;And voilà! The wget utility will download all the images of the target page for you!&lt;/span&gt;&lt;/strong&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;&amp;nbsp; &amp;nbsp; What about getting images from multiple pages? You will have to explicitly provide the target URLs either through an input txt file or via a list. Both ways can be specified in the Project Info tab of the &lt;/span&gt;&lt;a href="http://deixto.com/" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;DEiXTo&lt;/a&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt; GUI tool.&lt;/span&gt;&lt;/div&gt;
&lt;div style="font-family: arial, helvetica, sans-serif; line-height: 19px; text-align: justify;"&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-4eAiAG6A0J0/TvbTKmr_xZI/AAAAAAAAAFU/7GRQDIrV7PY/s1600/targets_box.png" imageanchor="1" style="background-color: white; margin-left: 1em; margin-right: 1em; text-align: center;"&gt;&lt;img border="0" height="189" src="http://2.bp.blogspot.com/-4eAiAG6A0J0/TvbTKmr_xZI/AAAAAAAAAFU/7GRQDIrV7PY/s200/targets_box.png" width="200" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div class="separator" style="clear: both; font-family: arial, helvetica, sans-serif; line-height: 19px; text-align: justify;"&gt;
&lt;span class="Apple-style-span" style="background-color: white;"&gt;&amp;nbsp; &amp;nbsp; Thus, if you have the target URLs at hand or you can extract them with another wrapper (generating a txt file), then &amp;nbsp;you can just pass them as input to the new image wrapper and the latter will do the laborious work for you.&lt;/span&gt;&lt;/div&gt;
&lt;div style="font-family: arial, helvetica, sans-serif; line-height: 19px; text-align: justify;"&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: normal;"&gt;&lt;span class="Apple-style-span" style="line-height: 19px;"&gt;&amp;nbsp; &amp;nbsp; In case all the above are a bit unclear, we have built a &lt;/span&gt;&lt;/span&gt;&lt;a href="http://deixto.com/imdb_starwars.wpf" style="background-color: white;"&gt;sample wrapper project file&lt;/a&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: normal;"&gt;&lt;span class="Apple-style-span" style="line-height: 19px;"&gt; (imdb_starwars.wpf) that downloads all Star Wars (1977) thumbnail photos from the corresponding &lt;/span&gt;&lt;/span&gt;&lt;a href="http://www.imdb.com/title/tt0076759/mediaindex" style="background-color: white;"&gt;imdb page&lt;/a&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: normal;"&gt;&lt;span class="Apple-style-span" style="line-height: 19px;"&gt;. Please note that we set the agent to follow the Next page link so as to gather all thumbnails since they are scattered across multiple pages. However, if you would like to get the large size photos you will have to add another scraping layer for extracting the links of the pages containing the full size pictures.&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style="text-align: justify;"&gt;
&lt;span class="Apple-style-span" style="background-color: white;"&gt;&lt;span class="Apple-style-span" style="font-family: arial, helvetica, sans-serif;"&gt;&lt;span class="Apple-style-span" style="line-height: 19px;"&gt;&amp;nbsp; &amp;nbsp; Anyway, in order to run the sample wrapper for the&amp;nbsp;thumbnails,&amp;nbsp;you should open the wpf (through the Open button in the Project Info tab) and then press the "Go!" button. Alternatively, you can use the command line executor at a DOS prompt:&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;br /&gt;
&lt;b style="background-color: white; font-family: 'Courier New', Courier, monospace; line-height: 19px;"&gt;deixto_executor.exe&amp;nbsp;imdb_starwars.wpf&lt;/b&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;Finally, you will have to pass the image_urls.txt output file to wget in order to download all thumbnails and get the job done! &lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, helvetica, sans-serif; line-height: 19px;"&gt;May the Force be with you! :)&lt;/span&gt;&lt;/div&gt;
&lt;span class="Apple-style-span" style="background-color: white;"&gt;
&lt;/span&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-2863751430269207301?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/downloading-images-with-deixto-and-wget.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-5MdHMGgZHZw/TvbQgqfD8dI/AAAAAAAAAFI/mvPdP7JieKw/s72-c/img_pattern.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-1723411798346401639</guid><pubDate>Thu, 22 Dec 2011 19:11:00 +0000</pubDate><atom:updated>2012-01-28T18:47:54.709+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">Tech Box</category><category domain="http://www.blogger.com/atom/ns#">Mobile apps</category><category domain="http://www.blogger.com/atom/ns#">Mobile devices</category><category domain="http://www.blogger.com/atom/ns#">XML</category><category domain="http://www.blogger.com/atom/ns#">iPhone simulator</category><category domain="http://www.blogger.com/atom/ns#">Data transformations</category><category domain="http://www.blogger.com/atom/ns#">XSLT</category><title>Can DEiXTo power mobile apps? Yes, it can!</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;Web content scraped with &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; can be presented in a wide variety of formats. 
However, the most common choice is p&lt;/span&gt;&lt;span style="background-color: white; font-family: arial, sans-serif;"&gt;robably&lt;/span&gt;&lt;span style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;XML since it facilitates heavy post-processing and further transformations so as to make the data suit your needs.&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;A potentially interesting scenario would be to output bits of interest from a target website into an XML file and then transform it to HTML through &lt;/span&gt;&lt;a href="http://www.w3schools.com/xsl/" style="background-color: white; font-family: arial, sans-serif;"&gt;XSLT&lt;/a&gt;&amp;nbsp;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;(Extensible Stylesheet Language Transformations).&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;This could be very practical and useful for creating in real time a customized, "shortened" version of a target web page specifically for mobile devices (e.g. Android and iOS devices).&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp; &amp;nbsp; As you all know smartphones and tablets over the last few years have changed the computer world.&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;So, we thought it would be challenging and hopefully useful to build a web service capable of&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;repurposing specified pages&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;on the fly&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;(&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;through the use of&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;a &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt;-based agent)&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;, keeping only the important/ interesting stuff and&amp;nbsp;&lt;/span&gt;&lt;span style="background-color: white; font-family: arial, sans-serif;"&gt;returning it in a mobile-compatible fashion,&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;suitable to&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;fit small screens by 
harnessing&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;XML, XSLT and CSS. We did &lt;i&gt;not&lt;/i&gt;&amp;nbsp;fully implement the service but we got a simple &lt;i&gt;prototype&lt;/i&gt; ready to try our idea. And the results were quite encouraging!&lt;/span&gt;&lt;br /&gt;
&lt;div style="text-align: center;"&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-UuXYDHO-H5E/TvF3HNRSEPI/AAAAAAAAAEw/fdT1SgiVTf0/s1600/techbox_deixto.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="400" src="http://3.bp.blogspot.com/-UuXYDHO-H5E/TvF3HNRSEPI/AAAAAAAAAEw/fdT1SgiVTf0/s400/techbox_deixto.png" width="205" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp; &amp;nbsp; For the needs of our demo we used&amp;nbsp;&lt;/span&gt;&lt;a href="http://www.the-techbox.com/" style="background-color: white; font-family: arial, sans-serif;"&gt;Tech Box&lt;/a&gt;,&amp;nbsp;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;a popular technology news blog&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;covering a plethora of&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;interesting&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;and fun topics around&lt;/span&gt;&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp;the IT industry. Tech Box currently has a remarkable &lt;a href="http://itunes.apple.com/us/app/tech.box/id484597892?ls=1&amp;amp;mt=8"&gt;iPhone app&lt;/a&gt; available allowing the user to browse through the latest news as well as save and share favorite stories. 
The tech.box mobile app, built by &lt;a href="http://nsloom.com/"&gt;NSLoom&lt;/a&gt;&amp;nbsp;(an innovative start-up creating seamless iOS applications), is really awesome. However, there is always room for improvement and often web scraping has something new to offer and extend current functionality.&amp;nbsp;&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="background-color: white; font-family: arial, sans-serif;"&gt;&amp;nbsp; &amp;nbsp; In the context of the demo, we supposed that we wanted to search the Tech Box site with &lt;a href="http://www.the-techbox.com/?s=iPhone&amp;amp;lang=en"&gt;"iPhone" as a keyword&lt;/a&gt;.&amp;nbsp;&lt;/span&gt;&lt;span style="background-color: white; font-family: arial, sans-serif;"&gt;So, we built a quick (and dirty) test scraper able to extract all the records found on a result page and generate an XML document with the data captured. With the use of an elegant XSLT and a CSS we achieved a nice, usable and easy to navigate structure, suitable for a smartphone screen (illustrated in the picture above).&amp;nbsp;&lt;/span&gt;&lt;span style="background-color: white; font-family: arial, sans-serif;"&gt;You can see &lt;b&gt;&lt;i&gt;&lt;a href="http://deixto.com/mobile/techbox.html"&gt;live&lt;/a&gt;&lt;/i&gt;&lt;/b&gt; what the output XML file (containing 10 sample&amp;nbsp;headlines) looks like on an online iPhone simulator at the following address:&lt;/span&gt;&lt;br /&gt;
&lt;div style="text-align: center;"&gt;&lt;span style="font-family: 'Courier New', Courier, monospace;"&gt;&lt;a href="http://deixto.com/mobile/techbox.html"&gt;&lt;b&gt;http://deixto.com/mobile/techbox.html&lt;/b&gt;&lt;/a&gt;&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt;&amp;nbsp; &amp;nbsp; The concept of the proposed web service is the following: suppose that you are&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt;an app developer or&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt;a website owner/ administrator and that you need to display content inside your app or the mobile version of your site, either from a website of your control (meaning that you have access to its backend) or from another, "external" site (with respect to copyright and access restrictions). Often, though, it's not easy to retrieve data from the target website or simply you don't know how to do it.. Therefore, a service that could listen to requests for certain pages/ URIs and return their important data in a suitable form could potentially be very useful. 
For example, ideally an HTTP request like &lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;http://deixto.com/webservice.pl?uri="http://example.com/.."&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt;&amp;nbsp;would result in a good-looking XML chunk (formatted with XSLT and CSS) containing the data scraped from the original page (specified with the &lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;uri&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt; parameter)&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt;.&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: arial, sans-serif;"&gt;&amp;nbsp; &amp;nbsp; Finally, we would like to bring forward the fact that &lt;a href="http://deixto.com/"&gt;DEiXToBot&lt;/a&gt;&amp;nbsp;contains best of breed Perl technology and allows extensive customization. Thus, it facilitates tailor-made solutions so as to make the data captured fully fit your project's needs. And towards this direction, deploying XSLT and XML-related technologies in general can really boost the utility and value of scraping and DEiXTo in particular!&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-1723411798346401639?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/can-deixto-power-mobile-apps-yes-it-can.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-UuXYDHO-H5E/TvF3HNRSEPI/AAAAAAAAAEw/fdT1SgiVTf0/s72-c/techbox_deixto.png" height="72" width="72" /><thr:total>2</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-6762757001021784194</guid><pubDate>Sat, 17 Dec 2011 09:20:00 +0000</pubDate><atom:updated>2012-01-28T18:06:37.573+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">APIs</category><category domain="http://www.blogger.com/atom/ns#">PDF Downloader</category><category domain="http://www.blogger.com/atom/ns#">Διαύγεια</category><category domain="http://www.blogger.com/atom/ns#">ΥπερΔιαύγεια</category><title>APIs vs Scraping - Cl@rity &amp; Yperdiavgeia</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;Typically there are two main&amp;nbsp;mechanisms&amp;nbsp;to search and retrieve data from a website: either through an &amp;nbsp;Application 
Programming Interface commonly known as an API (if available) or via screen scraping. The first one is better, faster and more reliable. However, there is&amp;nbsp;not always&amp;nbsp;a search API available or, even if one exists, it may not&amp;nbsp;fully&amp;nbsp;cover your needs. In such cases, web robots, also called agents, are usually used in order to simulate a person searching the target website/ online database through a web browser&amp;nbsp;and capture bits of interest by&amp;nbsp;utilizing&amp;nbsp;scraping techniques.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;An API that has attracted some attention over the last few months in Greece is the &lt;a href="http://opendata.diavgeia.gov.gr/"&gt;Opendata API&lt;/a&gt; that the &lt;a href="http://diavgeia.gov.gr/en"&gt;"Cl@rity" program&lt;/a&gt; ("Διαύγεια" in Greek) is offering. Since the 1st of October 2010, all Greek Ministries are obliged to upload their decisions and expenditure on the Internet, through the Cl@rity program. Cl@rity is one of the major transparency initiatives of the &lt;a href="http://www.ypes.gr/"&gt;Ministry of Interior, Decentralization and e-Government&lt;/a&gt;. Each document uploaded is digitally signed and given a transaction unique number automatically by the system.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-7s0S8T059-o/TuhRxrJ1gyI/AAAAAAAAAD0/bwetIYwdMIw/s1600/diavgeia_logo.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-7s0S8T059-o/TuhRxrJ1gyI/AAAAAAAAAD0/bwetIYwdMIw/s1600/diavgeia_logo.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;The &lt;a href="http://opendata.diavgeia.gov.gr/"&gt;Opendata API&lt;/a&gt; offers a variety of search parameters such as organization, type, tag (subject), ada (the unique number assigned), signer and date. However, there are still a lot of parameters and functionality missing such as full text search as well as searching by certain criteria like beneficiary's name, VAT registration number (ΑΦΜ in Greek), document title and other metadata fields.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;A remarkable alternative for searching effectively through the documents of the&amp;nbsp;Greek&amp;nbsp;public organizations is &lt;a href="http://yperdiavgeia.gr/"&gt;yperdiavgeia.gr&lt;/a&gt;&amp;nbsp;("ΥπερΔιαύγεια" in Greek), a web-based platform built by the expert in digital libraries and institutional repositories&amp;nbsp;&lt;a href="http://vbanos.gr/"&gt;Vangelis Banos&lt;/a&gt;. Yperdiavgeia is a mirror site of Cl@rity that gets updated on a daily basis and it provides a powerful and robust&amp;nbsp;&lt;a href="http://yperdiavgeia.gr/docs/opensearch"&gt;OpenSearch API&lt;/a&gt;&amp;nbsp;which is far more usable and easy to harness.&amp;nbsp;Its great advantage is that it facilitates full text searching.&amp;nbsp;Currently, it lacks some parameters support but it seems that they are going to be added soon since it is under active development.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-tEiXt6bNqUY/TuxU3WbwrRI/AAAAAAAAAEA/G1Da1hcKl40/s1600/yperdiavgeia_sample.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="146" src="http://3.bp.blogspot.com/-tEiXt6bNqUY/TuxU3WbwrRI/AAAAAAAAAEA/G1Da1hcKl40/s400/yperdiavgeia_sample.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;Even though both APIs mentioned above are really remarkable (especially for communicating and exchanging data with third party programs) there is still some room for&amp;nbsp;utilizing&amp;nbsp;scraping techniques and coming up with some "magic". In a &lt;a href="http://deixto.blogspot.com/2011/06/pdf.html"&gt;previous post&lt;/a&gt; we had described in detail an application we developed mostly for downloading a user-specified number of the latest PDF documents of a specific organization uploaded to Cl@rity. 
We believe that this little &lt;a href="http://deixto.com/diavgeia-downloader.zip"&gt;utility&lt;/a&gt; we created&amp;nbsp;(offering&amp;nbsp;both a GUI as well as a command line version)&amp;nbsp;can be quite useful for many people working in the public sector and potentially save a lot of time and effort. For further information about it, please check out &lt;a href="http://deixto.blogspot.com/2011/06/pdf.html"&gt;this post&lt;/a&gt;&amp;nbsp;(although it is written in Greek).&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
So, in this short post we just wanted to point out that there are quite a lot of great APIs&amp;nbsp;out there, provided mostly by large organizations (e.g., firms,&amp;nbsp;governments,&amp;nbsp;cultural institutions and digital libraries - collections) as well as the major players of the IT industry such as &lt;a href="http://code.google.com/more/table/"&gt;Google&lt;/a&gt;, &lt;a href="http://aws.amazon.com/"&gt;Amazon&lt;/a&gt;, etc., offering amazing features and functionality. Nevertheless, scraping the native web interface of a target site can still be useful and sometimes come up with a solution that overcomes difficulties and/or inefficiencies of APIs and yield an innovative outcome. Moreover, there are numerous websites that do not offer an API, thus a scraper could perhaps be deployed in case data&amp;nbsp;searching, gathering or&amp;nbsp;exporting&amp;nbsp;is needed. Therefore, the "battle" between APIs and scraping still rages... and we are eager to see how things will evolve. 
Truth be told, we love them both!&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-6762757001021784194?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/apis-vs-scraping-clrity-yperdiavgeia.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-7s0S8T059-o/TuhRxrJ1gyI/AAAAAAAAAD0/bwetIYwdMIw/s72-c/diavgeia_logo.jpg" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-6026838091513190758</guid><pubDate>Tue, 13 Dec 2011 21:23:00 +0000</pubDate><atom:updated>2012-01-28T19:06:33.851+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">DSpace</category><category domain="http://www.blogger.com/atom/ns#">APIs</category><category domain="http://www.blogger.com/atom/ns#">Scraping</category><category domain="http://www.blogger.com/atom/ns#">Institutional repositories</category><category domain="http://www.blogger.com/atom/ns#">OAI-PMH</category><category domain="http://www.blogger.com/atom/ns#">Dublin Core</category><title>DSpace &amp; Institutional Repositories</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;Institutional Repositories (IRs) have emerged over the last few years and became very popular in the academic library world. The system that has dominated the "market" is &lt;a href="http://www.dspace.org/" target="_blank"&gt;DSpace&lt;/a&gt;, an exciting, functionality-rich, open source software package that is installed at over &lt;a href="http://www.dspace.org/whos-using-dspace" target="_blank"&gt;1.100&lt;/a&gt; institutions around the globe. 
It offers an &lt;a href="http://www.openarchives.org/OAI/openarchivesprotocol.html" target="_blank"&gt;OAI-PMH&lt;/a&gt; web service for harvesting the metadata of&amp;nbsp;the repository&amp;nbsp;and getting its entire contents in Dublin Core format. However, OAI-PMH does not provide advanced search by certain criteria such as title, author, supervisor, etc.&amp;nbsp;Even the &lt;a href="https://wiki.duraspace.org/display/DSPACE/REST+API" target="_blank"&gt;REST API&lt;/a&gt; which is under construction does not facilitate searching with these metadata fields, at least to the best of our knowledge. Moreover, the default &lt;a href="https://jira.duraspace.org/secure/attachment/10740/opensearch.txt" target="_blank"&gt;DSpace&amp;nbsp;OpenSearch&lt;/a&gt;&amp;nbsp;support still seems incomplete and a bit buggy. Therefore, a potential solution for searching in real time a DSpace repository could be submitting a query and scraping the results returned through its native web interface. This could probably be useful for building a federated search engine or perhaps for creating a mobile app (currently there is no mobile version for DSpace).&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Having in mind the lack of an "advanced", multiple-criteria enabled, search&amp;nbsp;API /&amp;nbsp;mechanism by DSpace, we thought it would be interesting and perhaps useful to write a test scraper that could submit queries to a DSpace repository and fetch the search results through its website. So, we built a&amp;nbsp;simple,&amp;nbsp;DOM-based, extraction rule (wrapper) with the DEiXTo GUI tool and then wrote a short DEiXToBot-based script that&amp;nbsp;submits a sample query&amp;nbsp;to&amp;nbsp;&lt;a href="http://dspace.lib.uom.gr/" target="_blank"&gt;Psepheda&lt;/a&gt;&amp;nbsp;(the IR of University of Macedonia) and scrapes the results returned. The following picture illustrates the 10 first results for a sample query by title.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-rlPibLT00eM/Tueg7pK3mYI/AAAAAAAAADs/nfUlqs8_1gk/s1600/dspace_title_search_results.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="299" src="http://3.bp.blogspot.com/-rlPibLT00eM/Tueg7pK3mYI/AAAAAAAAADs/nfUlqs8_1gk/s1600/dspace_title_search_results.png" width="515" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;To get a better idea of how a Perl, DEiXToBot-based script works, below you can find the code that scrapes the 10 first items containing "programming" in title. The pattern used captures five metadata fields: detailed URL, title, date, authors and supervisor, and prints them on the screen. Of course this script can be easily extended to submit user specified queries as well as navigate through all the result pages by following the Next page link ("επόμενη" is the inner text of this link in Greek).&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;use DEiXToBot;&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;use Encode;&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;my $agent = DEiXToBot-&amp;gt;new();&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;$agent-&amp;gt;get('http://dspace.lib.uom.gr/simple-search?query=((title:programming))');&lt;/span&gt;&lt;br /&gt;
&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;$agent-&amp;gt;load_pattern('dspace_pattern.xml');&lt;/span&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;$agent-&amp;gt;ignore_tags( [ 'em' ] );&lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;$agent-&amp;gt;build_dom();&lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;$agent-&amp;gt;extract_content();&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;for my $record (@{$agent-&amp;gt;records}) {&lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;&amp;nbsp; &amp;nbsp; print encode_utf8(join("\n",@{$record})),"\n\n";&lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace; font-size: x-small;"&gt;}&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div&gt;&lt;div style="text-align: justify;"&gt;DEiXToBot is written in Perl, thus it is portable and can run on multiple operating systems provided you have all the prerequisite Perl modules installed.&amp;nbsp;You can download the lines of code given above along with the necessary pattern by clicking &lt;a href="http://deixto.com/dspace_deixto.zip"&gt;here&lt;/a&gt;. This short script serves as a good, simple example for&amp;nbsp;utilizing&amp;nbsp;the power and flexibility of DEiXToBot (a Mechanize agent object, essentially a browser emulator, which is able to execute patterns/ extraction rules&amp;nbsp;previously&amp;nbsp;built with the GUI tool).&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;Generally, IRs have huge potential and in the next few years they are expected to play an&amp;nbsp;increasingly&amp;nbsp;important role in storing and preserving digital content, academic or not.&amp;nbsp;By the way, a great federated search engine harvesting numerous Greek digital libraries and institutional repositories is &lt;a href="http://openarchives.gr/" target="_blank"&gt;openarchives.gr&lt;/a&gt;&amp;nbsp;which is&amp;nbsp;mostly based upon OAI-PMH. It harnesses innovative technologies and has grown a lot since 2006 when it was first launched.&lt;br /&gt;
&lt;br /&gt;
Last but not least, DEiXTo was used quite long ago by "&lt;a href="http://pantou.lib.uom.gr/" target="_blank"&gt;Pantou&lt;/a&gt;", the federated search engine of the University of Macedonia, in order to scrape (in real time) multiple online resources simultaneously via their web interface/ site. It is worth noting that a &lt;a href="http://lib-code.lib.sfu.ca/projects/dbwiz/browser/trunk/DBWIZ_search/lib/DBWIZ/Search/Internet/DEiXTo.pm?rev=691" target="_blank"&gt;predecessor&lt;/a&gt; of the current DEiXToBot module,&amp;nbsp;back in 2007,&amp;nbsp;was included in the official &lt;a href="http://researcher.sfu.ca/dbwiz" target="_blank"&gt;dbWiz&lt;/a&gt;&amp;nbsp;distribution, a remarkable, open source, federated search&amp;nbsp;software&amp;nbsp;package&amp;nbsp;upon which pantou was built.&lt;/div&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-6026838091513190758?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/12/dspace-irs.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-rlPibLT00eM/Tueg7pK3mYI/AAAAAAAAADs/nfUlqs8_1gk/s72-c/dspace_title_search_results.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-8311653756313934665</guid><pubDate>Mon, 28 Nov 2011 15:50:00 +0000</pubDate><atom:updated>2012-01-28T18:03:46.528+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">myVisitPlanner</category><title>myVisitPlanner &amp; DEiXTo</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;We are very happy to announce that DEiXTo is going to power myVisitPlanner, an exciting&amp;nbsp;project funded by the Greek Ministry of Education 
and Lifelong Learning, under the national action "COOPERATION 2011". myVisitPlanner&amp;nbsp;is coordinated by the &lt;a href="http://www.uom.gr/index.php?newlang=eng"&gt;University of Macedonia&lt;/a&gt; and it is&amp;nbsp;aiming at&amp;nbsp;creating a personalized system for cultural&amp;nbsp;itineraries&amp;nbsp;planning. The&amp;nbsp;consortium&amp;nbsp;participants include &lt;a href="http://www.ceti.gr/index2.php?lang=en"&gt;Athena - Research and Innovation Center&lt;/a&gt;, &lt;a href="http://www.gnomon.com.gr/"&gt;GNOMON Informatics SA&lt;/a&gt;, the &lt;a href="http://www.emthrace.org/en/"&gt;Ethnological Museum of Thrace&lt;/a&gt; and the &lt;a href="http://www.anko.gr/"&gt;West Macedonia Development Company (ANKO) SA&lt;/a&gt;.&amp;nbsp;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;DEiXTo-based wrappers are going to be deployed in order to&amp;nbsp;automatically&amp;nbsp;retrieve regional cultural&amp;nbsp;itineraries and points&amp;nbsp;of&amp;nbsp;interest&amp;nbsp;from various, heterogeneous target websites. For a pre-defined set of webpages, DEiXTo scrapers will run periodically and monitor them for new content and upcoming cultural events. However, besides the traditional DOM-based tree patterns, new&amp;nbsp;“smart” and innovative techniques,&amp;nbsp;such as text mining and&amp;nbsp;NLP algorithms, will be used&amp;nbsp;so as to also contend with sites of&amp;nbsp;unknown structure and layout.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;myVisitPlanner officially started a few months ago and its duration is 36 months. We are really glad&amp;nbsp;that we are participating in this challenging and exciting project and we hope that DEiXTo&amp;nbsp;will help myVisitPlanner towards implementing its ambitious goals.&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-8311653756313934665?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/11/myvisitplanner.html</link><author>noreply@blogger.com (kntonas)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-7489962161980537841</guid><pubDate>Tue, 08 Nov 2011 20:36:00 +0000</pubDate><atom:updated>2012-01-28T18:22:02.047+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">E-Learning</category><category domain="http://www.blogger.com/atom/ns#">FP7</category><category domain="http://www.blogger.com/atom/ns#">TEL-MAP</category><title>TEL-MAP &amp; DEiXTo</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;DEiXTo was recently used successfully  in the context of the &lt;a href="http://telmap.org/"&gt;TEL-MAP&lt;/a&gt; FP7 project in order to scrape the metadata of several European Technology Enhanced Learning (TEL) projects from the &lt;a href="http://cordis.europa.eu/home_en.html"&gt;CORDIS &lt;/a&gt;website as well as from the &lt;a href="http://ec.europa.eu/index_en.htm"&gt;European Commission&lt;/a&gt; website.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;TEL-MAP is a Coordination and Support Action funded by the European Commission  under the Technology-Enhanced Learning programme. It is coordinated by the Brunel University of London and it focuses on  exploratory / roadmapping activities for fundamentally new forms of  learning to support the adoption of those new forms, via awareness  building and knowledge management on the results of EU RTD projects in  TEL and socio-economic evaluations in education.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-0wsj6DQvrXA/Trk38O07OrI/AAAAAAAAAC8/qutF3sZ1FJI/s1600/telmap-logo.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="38" src="http://4.bp.blogspot.com/-0wsj6DQvrXA/Trk38O07OrI/AAAAAAAAAC8/qutF3sZ1FJI/s1600/telmap-logo.png" width="133" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;Currently, the TEL-MAP team is building a new portal called "&lt;a href="http://www.learningfrontiers.eu/"&gt;Learning Frontiers&lt;/a&gt;". This portal aspires to become a widely recognized, single-point-of-access source of information for all European TEL. Among others, it offers a Projects space that contains a lot of detailed information about numerous TEL EU-funded projects and their participants (essentially the data scraped from the target web pages). 
It is really worth noting that through using the geo-location info of each participant / organization, Learning Frontiers provides an interactive map of TEL in Europe!&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-UfyUTqpG6Lg/Trk4e8LooPI/AAAAAAAAADE/JzPdZ3IT1KI/s1600/map-learningfrontiers.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="237" src="http://2.bp.blogspot.com/-UfyUTqpG6Lg/Trk4e8LooPI/AAAAAAAAADE/JzPdZ3IT1KI/s400/map-learningfrontiers.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;We are really happy that we have helped the Learning Frontiers  portal a little towards the implementation of its ambitious vision to increase EU-wide and global dissemination, adoption and impact of EU TEL. We wish them best success!&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-7489962161980537841?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/11/tel-map-deixto.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-0wsj6DQvrXA/Trk38O07OrI/AAAAAAAAAC8/qutF3sZ1FJI/s72-c/telmap-logo.png" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-7785458361000302731</guid><pubDate>Mon, 31 Oct 2011 18:45:00 +0000</pubDate><atom:updated>2012-01-28T18:07:33.406+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Europeana</category><category domain="http://www.blogger.com/atom/ns#">Athos Memory</category><category domain="http://www.blogger.com/atom/ns#">Veria Central Public Library</category><category 
domain="http://www.blogger.com/atom/ns#">Data transformations</category><title>DEiXTo &amp; Athos Memory</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;&lt;span class="s1" style="font-family: Times,'Times New Roman',serif; font-size: small; text-align: justify;"&gt;In the context of our collaboration with the awarded&amp;nbsp;&lt;a href="http://www.libver.gr/"&gt;&lt;span class="s2"&gt;Veria Central Public Library&lt;/span&gt;&lt;/a&gt;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;, a really remarkable Greek, online digital collection, &lt;/span&gt;&lt;span style="font-size: small;"&gt;&lt;a href="http://www.athosmemory.com/en" style="font-family: Times,'Times New Roman',serif; text-align: justify;"&gt;Athos Memory&lt;/a&gt;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small; text-align: justify;"&gt;, has been scraped through DEiXTo in order to be added to the &lt;/span&gt;&lt;span style="font-size: small;"&gt;&lt;a href="http://www.europeana.eu/portal" style="font-family: Times,'Times New Roman',serif; text-align: justify;"&gt;European Library&lt;/a&gt;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small; text-align: justify;"&gt;. Athos Memory has been a giant effort of the monastic community of the Holy Mountain to preserve and disseminate the unique religious tradition of the Eastern Orthodox Church on this peninsula of Chalcidice. Numerous people have worked tirelessly for years to make&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small; text-align: justify;"&gt; this endeavour possible. 
We would really like to congratulate and thank them for their great efforts and for providing open access to this magnificent collection.&lt;/span&gt;&lt;/div&gt;&lt;div class="p1"&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;span style="font-size: small;"&gt;&lt;a href="http://www.athosmemory.com/"&gt;&lt;img border="0" height="28" src="http://4.bp.blogspot.com/-c1cbFciceFg/Tq7h1CMBgoI/AAAAAAAAACc/PE1zU6DgeGs/s320/athosmemory.jpg" width="320" /&gt;&lt;/a&gt;&lt;/span&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;The metadata of &lt;i&gt;27.223&lt;/i&gt; photographs, documents and digitalized manuscripts from Athos, the Sacred Mountain of Christianity,&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;have been transformed into Europeana Semantic Elements (ESE) format so that they could then be inserted into the &lt;/span&gt;&lt;span style="font-size: small;"&gt;&lt;a href="http://aggregator.libver.gr/" style="font-family: Times,'Times New Roman',serif;"&gt;Hellenic Aggregator&lt;/a&gt;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;'s database.&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="p1"&gt;&lt;div style="text-align: justify;"&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="p1"&gt;&lt;div style="text-align: justify;"&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;To give you a better idea of the transformation process, check out the picture below. It's a screenshot of a typical item of Athos Memory archives.&lt;/span&gt;&lt;br /&gt;
&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;span style="font-size: small;"&gt;&lt;a href="http://www.athosmemory.com/index.php?option=com_vivliothiki&amp;amp;func=detail&amp;amp;id=10976"&gt;&lt;img border="0" height="320" src="http://1.bp.blogspot.com/-qyYVJs_-DME/Tq7mc1NBMrI/AAAAAAAAACs/MdD7eruQHsA/s320/athos-record.png" width="300" /&gt;&lt;/a&gt;&lt;/span&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;span style="font-size: small;"&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;&lt;div class="p1"&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;Now, this record, after extracting and repurposing its metadata based on Dublin Core, gets the following form, suitable for exporting it:&lt;/span&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;span style="font-size: small;"&gt;&lt;a href="http://2.bp.blogspot.com/-StdOVBEsaKk/Tq7ruudBQKI/AAAAAAAAAC0/6-9k5sSnOUk/s1600/athos-ese.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="220" src="http://2.bp.blogspot.com/-StdOVBEsaKk/Tq7ruudBQKI/AAAAAAAAAC0/6-9k5sSnOUk/s320/athos-ese.png" width="320" /&gt;&lt;/a&gt;&lt;/span&gt;&lt;/div&gt;&lt;div class="p1" style="text-align: justify;"&gt;&lt;span class="Apple-style-span" style="font-family: Times,'Times New Roman',serif; font-size: small;"&gt;Finally, it should be noted that this was the fourth digital library that was included in the Europeana with the help of DEiXTo. 
And we are eager to add more online resources and help Europeana enrich further its huge cultural and scientific collection!&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-7785458361000302731?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/10/deixto-athos-memory.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-c1cbFciceFg/Tq7h1CMBgoI/AAAAAAAAACc/PE1zU6DgeGs/s72-c/athosmemory.jpg" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-3675199846417053949</guid><pubDate>Sat, 08 Oct 2011 06:38:00 +0000</pubDate><atom:updated>2012-01-28T19:05:47.168+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">Europeana</category><category domain="http://www.blogger.com/atom/ns#">Music Library Lilian Voudouri</category><category domain="http://www.blogger.com/atom/ns#">Veria Central Public Library</category><category domain="http://www.blogger.com/atom/ns#">Data transformations</category><title>DEiXTo &amp; Veria Central Public Library</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div style="text-align: justify;"&gt;One of DEiXTo's most important success stories is our collaboration with &lt;a href="http://www.libver.gr/"&gt;Veria Central Public Library&amp;nbsp;&lt;/a&gt;in the context of the &lt;a href="http://www.europeanalocal.eu/"&gt;EuropeanaLocal&lt;/a&gt; project.&amp;nbsp;Veria Central Public Library is a really remarkable library that embraces technology and constitutes a&amp;nbsp;successful model for libraries in Greece and around the world. 
That's why it received a &lt;a href="http://www.gatesfoundation.org/press-releases/Pages/access-to-learning-award-2010-veria-public-library-greece-081210.aspx" target="_blank"&gt;$1 million international award&lt;/a&gt; from the Bill &amp;amp; Melinda Gates Foundation in 2010.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;&lt;a href="http://www.libver.gr/" imageanchor="1" style="clear: right; float: right; margin-bottom: 1em; margin-left: 1em;"&gt;&lt;img border="0" height="55" src="http://2.bp.blogspot.com/-5bMKPhARjrY/To_z9UQxgkI/AAAAAAAAACM/3RSRVym6o-c/s200/libver-logo.gif" width="100" /&gt;&lt;/a&gt;&lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; powers the &lt;a href="http://aggregator.libver.gr/"&gt;Hellenic Aggregator&lt;/a&gt; for Europeana, created by Veria Central Public Library.&amp;nbsp;DEiXToBot based Perl scripts have enabled the metadata extraction of the &lt;a href="http://digma.mmb.org.gr/"&gt;Music Library of Greece “Lilian Voudouri”&lt;/a&gt;, the &lt;a href="http://www.edutv.gr/"&gt;Greek Educational TV&lt;/a&gt; and &lt;a href="http://www.corgialenios.gr/library/"&gt;Corgialenios Digital Library&lt;/a&gt; in a format suitable for further processing. Once extracted, their rich content was repurposed through customized Perl code and&amp;nbsp;transformed into Europeana Semantic Elements (&lt;a href="http://www.europeana.eu/schemas/ese/" target="_blank"&gt;ESE&lt;/a&gt;) format so that it could then be inserted into the aggregator's database.&lt;br /&gt;
&lt;br /&gt;
This is the reason why DEiXTo was cited at the &lt;a href="http://blog.libver.gr/en/?p=311" target="_blank"&gt;Symposium "Europeana in Greece"&lt;/a&gt; that took place on 19 October 2010 in Athens, Greece, as well as at the &lt;a href="http://library.panteion.gr/19libconf/index_en.php" target="_blank"&gt;19th Hellenic Academic Libraries Conference&lt;/a&gt; (3-5 November 2010, Athens).&lt;br /&gt;
&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;a href="http://www.europeana.eu/portal/" imageanchor="1" style="clear: right; float: right; margin-bottom: 1em; margin-left: 1em;"&gt;&lt;img border="0" height="38" src="http://4.bp.blogspot.com/-bENzeRFeNu8/To_2MdQWQyI/AAAAAAAAACQ/9pjJfa6Yvzk/s200/think_culture_logo_top_6.jpg" width="30" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div style="text-align: justify;"&gt;Hopefully, more digital libraries/archives will use &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; in the next few months in order to be able to export their metadata to the&amp;nbsp;great &lt;a href="http://www.europeana.eu/portal/"&gt;Europeana&lt;/a&gt; collection (more than 15 million items from 1.500 institutions!). And we are more than glad to help Europeana enrich its content even more!&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-3675199846417053949?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/10/deixto-veria-central-public-library.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-5bMKPhARjrY/To_z9UQxgkI/AAAAAAAAACM/3RSRVym6o-c/s72-c/libver-logo.gif" height="72" width="72" /><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-1889649176113674002</guid><pubDate>Sat, 17 Sep 2011 14:23:00 +0000</pubDate><atom:updated>2012-01-28T18:36:41.517+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">JavaScript</category><category domain="http://www.blogger.com/atom/ns#">Selenium</category><category domain="http://www.blogger.com/atom/ns#">spynner</category><title>DEiXToBot &amp; Lack of JavaScript Support</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;br /&gt;
&lt;div class="p1"&gt;Perhaps the major drawback of DEiXToBot (the Perl browser emulator object capable of executing GUI DEiXTo generated patterns) is the lack of JavaScript support, which derives&amp;nbsp;from the fact that WWW::Mechanize doesn't execute JavaScript.&amp;nbsp;In many cases though, a solution is possible by figuring out what the JavaScript code is doing&amp;nbsp;and simulating it via Perl programming.&amp;nbsp;But in certain, more difficult cases that depend heavily on JavaScript, this is very hard, if not impossible, because essentially you cannot reach the actual HTML source code of interest.&lt;/div&gt;&lt;div class="p1"&gt;&lt;br /&gt;
However, a&amp;nbsp;workaround that sometimes works is to download&amp;nbsp;the target pages of interest locally (after executing their JavaScript code) and then pass them to DEiXToBot for offline scraping.&lt;/div&gt;&lt;div class="p1"&gt;Two remarkable tools for getting complex JavaScript-enabled pages for this purpose are:&lt;/div&gt;&lt;div class="p1"&gt;- &lt;a href="http://seleniumhq.org/"&gt;Selenium&lt;/a&gt;, an amazing web browser automation tool and&lt;/div&gt;&lt;div class="p1"&gt;- &lt;a href="http://pypi.python.org/pypi/spynner"&gt;spynner&lt;/a&gt;, a powerful web browsing module with Ajax support for Python&lt;/div&gt;&lt;div class="p1"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="p1"&gt;Please note that these two great tools also work fine on GNU/Linux, which is really important,&amp;nbsp;especially for server use and scheduled, periodic execution of wrappers.&lt;/div&gt;&lt;div class="p1"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="p1"&gt;So, once a target page is stored locally to your disk, the DEiXToBot agent can easily get the page&amp;nbsp;through the file:// scheme and extract bits of interest in the usual manner.&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-1889649176113674002?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/09/deixtobot-lack-of-javascript-support.html</link><author>noreply@blogger.com (kntonas)</author><thr:total>0</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-4953097419004121506</guid><pubDate>Sun, 26 Jun 2011 14:02:00 +0000</pubDate><atom:updated>2012-01-28T18:36:13.128+02:00</atom:updated><category domain="http://www.blogger.com/atom/ns#">PDF Downloader</category><category domain="http://www.blogger.com/atom/ns#">Διαύγεια</category><category domain="http://www.blogger.com/atom/ns#">Downloading</category><title>Μεταφόρτωση PDF αρχείων από τον Δικτυακό τόπο της Διαύγειας</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div&gt;&lt;div style="text-align: justify;"&gt;Εδώ και σχεδόν 9 μήνες, στο πλαίσιο της λειτουργίας του προγράμματος «Διαύγεια», όλα τα κυβερνητικά όργανα, οι φορείς του στενού και ευρύτερου δημόσιου τομέα και οι Ανεξάρτητες αρχές υποχρεούνται πλέον να αναρτούν το σύνολο των αποφάσεων και των δαπανών τους στο Διαδίκτυο και συγκεκριμένα στο δικτυακό τόπο της Διαύγειας (http://et.diavgeia.gov.gr).&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Σε καθημερινή βάση λοιπόν, πολλοί εργαζόμενοι στο Δημόσιο έχουν επιφορτιστεί με την ευθύνη να ανεβάζουν στη Διαύγεια μεγάλο όγκο από εντάλματα πληρωμής και αποφάσεις σε pdf μορφή. Και μάλιστα συνήθως μετά την ανάρτησή τους και αφού έχουν πάρει ΑΔΑ (Αριθμός Διαδικτυακής Ανάρτησης), ο εκάστοτε αρμόδιος υπάλληλος πρέπει να τα "κατεβάσει" χειροκίνητα κάνοντας απανωτά κλικ στους αντίστοιχους συνδέσμους "Λήψη Αρχείου" και να τα τυπώσει, κατά βάση για γραφειοκρατικούς λόγους. Η διαδικασία αυτή ωστόσο είναι χρονοβόρα, ιδιαίτερα όταν το πλήθος των αρχείων είναι μεγάλο.&amp;nbsp;Τυγχάνει να το γνωρίζω αυτό από πρώτο χέρι όντας εργαζόμενος στο &lt;a href="http://www.ihu.edu.gr/"&gt;Διεθνές Πανεπιστήμιο Ελλάδος&lt;/a&gt;.&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Στην εικόνα που ακολουθεί φαίνονται μερικές τυπικές εγγραφές στο site της Διαύγειας για κάποιο φορέα. Βλέπετε δεξιά τους υπερσυνδέσμους (links) για μεταφόρτωση των ενταλμάτων πληρωμής.&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-jBqaTm3aO4o/TgWg_kxmnwI/AAAAAAAAABM/abI856WBTpc/s1600/diavgeia.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="197" src="http://4.bp.blogspot.com/-jBqaTm3aO4o/TgWg_kxmnwI/AAAAAAAAABM/abI856WBTpc/s400/diavgeia.png" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
&lt;div style="text-align: justify;"&gt;Σκεφτήκαμε λοιπόν να φτιάξουμε μία μικρή εφαρμογή η οποία θα εντοπίζει τις διευθύνσεις (URLs) των pdf αρχείων πάνω σε μία σελίδα αποτελεσμάτων της Διαύγειας, π.χ. των Χ τελευταίων που ανέβασε κάποιος την προηγούμενη μέρα, και στη συνέχεια θα τα κατεβάζει μαζικά. Ένα τέτοιο πρόγραμμα πιθανότατα θα μπορούσε να βοηθήσει αρκετούς δημόσιους υπαλλήλους να εξοικονομήσουν κόπο και χρόνο.&lt;/div&gt;&lt;br /&gt;
Η εφαρμογή είναι &lt;a href="http://deixto.com/diavgeia-downloader.zip"&gt;διαθέσιμη&lt;/a&gt; τόσο σε εκδοχή για χρήση σε γραμμή εντολών (σε Windows και Linux) όσο και για χρήση  μέσω γραφικής διεπαφής (GUI) σε περιβάλλον Windows (για το δεύτερο, δείτε στο τέλος του post).&lt;br /&gt;
&lt;br /&gt;
Σε &lt;b&gt;γραμμή εντολών&lt;/b&gt;&amp;nbsp;των Windows (DOS prompt) τρέξτε το diavgeia-downloader.exe (ή το&amp;nbsp;diavgeia-downloader-linux αντίστοιχα σε ένα terminal εάν έχετε Linux) περνώντας ως παράμετρο τη διεύθυνση/URL στόχο και βάζοντας στην παράμετρο limit της διεύθυνσης της σελίδας τον επιθυμητό αριθμό pdf αρχείων. Για παράδειγμα, έστω η σελίδα:&lt;br /&gt;
&lt;ul&gt;&lt;li&gt;&lt;a href="http://et.diavgeia.gov.gr/f/pamak/find/unit:4652/from:0/limit:50"&gt;http://et.diavgeia.gov.gr/f/pamak/find/unit:4652/from:0/limit:50&lt;/a&gt; &lt;/li&gt;
&lt;/ul&gt;που περιέχει τις τελευταίες 50 δαπάνες-αποφάσεις του ΕΛΚΕ του Πανεπιστημίου Μακεδονίας. Για λήψη αυτών των αρχείων τοπικά (στο φάκελο στον οποίο βρίσκεται και το εκτελέσιμο) αρκεί να δοθεί η ακόλουθη εντολή:&lt;br /&gt;
&lt;ul&gt;&lt;li&gt;diavgeia-downloader.exe -url &lt;a href="http://et.diavgeia.gov.gr/f/pamak/find/unit:4652/from:0/limit:50"&gt;http://et.diavgeia.gov.gr/f/pamak/find/unit:4652/from:0/limit:50&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;Η εφαρμογή υποστηρίζει δύο επιπλέον προαιρετικές παραμέτρους: [-dir folder] [-sleep N]&lt;br /&gt;
&lt;ul&gt;&lt;li&gt;όπου folder το όνομα του φακέλου στον οποίο θα αποθηκευθούν τα αρχεία και&lt;/li&gt;
&lt;li&gt;N ο αριθμός των δευτερολέπτων των χρονικών παύσεων μεταξύ των εντολών μεταφόρτωσης ώστε να μην επιβαρύνεται ιδιαίτερα ο server της Διαύγειας.&lt;/li&gt;
&lt;/ul&gt;&lt;div style="text-align: justify;"&gt;Μπορείτε να &lt;a href="http://deixto.com/diavgeia-downloader.zip"&gt;κατεβάσετε&lt;/a&gt; τόσο τον πηγαίο κώδικα (σε Perl) όσο και τα εκτελέσιμα αρχεία (για Windows &amp;amp; Linux αντίστοιχα)! Η άδεια χρήσης του προγράμματος είναι η GNU General Public License version 3.&amp;nbsp;Για περιβάλλον Windows μάλιστα υπάρχει και η γραφική διεπαφή (GUI) που κάνει προφανή όλα τα, μάλλον, πολύπλοκα για πολλούς παραπάνω. Πιο κάτω βλέπετε ένα &amp;nbsp;screenshot από το GUI εργαλείο. Η χρήση του είναι πολύ απλή, αρκεί ο χρήστης να δώσει με copy paste τη διεύθυνση της&amp;nbsp;επιθυμητής&amp;nbsp;σελίδας από το site της Διαύγειας και να πατήσει Go!&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-rjAFi9iH17M/TgiISTPHmxI/AAAAAAAAAB0/wUs0VheKhHc/s1600/diavgeia-downloader-gui.PNG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="143" src="http://3.bp.blogspot.com/-rjAFi9iH17M/TgiISTPHmxI/AAAAAAAAAB0/wUs0VheKhHc/s400/diavgeia-downloader-gui.PNG" width="400" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;
Ελπίζουμε η εφαρμογή αυτή να φανεί χρήσιμη. Σχόλια και προτάσεις ευπρόσδεκτα!&lt;/div&gt;&lt;/div&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-4953097419004121506?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/06/pdf.html</link><author>noreply@blogger.com (kntonas)</author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-jBqaTm3aO4o/TgWg_kxmnwI/AAAAAAAAABM/abI856WBTpc/s72-c/diavgeia.png" height="72" width="72" /><thr:total>4</thr:total></item><item><guid isPermaLink="false">tag:blogger.com,1999:blog-3639231664593965268.post-6501061724107921970</guid><pubDate>Tue, 21 Jun 2011 21:54:00 +0000</pubDate><atom:updated>2011-10-08T11:55:14.895+03:00</atom:updated><title>deixto.com/blog</title><description>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div&gt;&lt;div style="text-align: justify;"&gt;Are you looking for a web content extraction tool to scrape data from websites of interest? &lt;a href="http://deixto.com/"&gt;DEiXTo&lt;/a&gt; can probably help you!&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;DEiXTo is an ongoing effort that started back in 2007. It is freely available to download and to our knowledge it is being used by many users as well as some organisations and companies all over the world. Indicatively, DEiXTo's site received more than 5,700 visits from 103 different countries during the last 12 months.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;Today, we launch the blog of DEiXTo aiming a) to keep you apprised of the wealth of applications and the utility of this exciting tool and b) to bring forward interesting topics around web scraping.&lt;/div&gt;&lt;div style="text-align: justify;"&gt;&lt;br /&gt;
&lt;/div&gt;&lt;div style="text-align: justify;"&gt;We really hope that DEiXTo can be useful for you and that this blog helps you in this direction.&lt;/div&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/3639231664593965268-6501061724107921970?l=deixto.blogspot.com' alt='' /&gt;&lt;/div&gt;</description><link>http://deixto.blogspot.com/2011/06/deixtocomblog.html</link><author>noreply@blogger.com (kntonas)</author><thr:total>0</thr:total></item></channel></rss>

