<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:blogger='http://schemas.google.com/blogger/2008' xmlns:georss='http://www.georss.org/georss' xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-16081779</id><updated>2026-05-13T17:44:05.492+01:00</updated><category term="GBIF"/><category term="BHL"/><category term="visualisation"/><category term="BioStor"/><category term="DOI"/><category term="taxonomy"/><category term="EOL"/><category term="Challenge"/><category term="Wikipedia"/><category term="CrossRef"/><category term="phylogeny"/><category term="Mendeley"/><category term="BioNames"/><category term="identifiers"/><category term="TreeBASE"/><category term="github"/><category term="OpenURL"/><category term="wiki"/><category term="RDF"/><category term="DNA barcoding"/><category term="NCBI"/><category term="Wikidata"/><category term="citation"/><category term="iPad"/><category term="Catalogue of Life"/><category term="Google"/><category term="CouchDB"/><category term="Semantic Web"/><category term="data quality"/><category term="metadata"/><category term="PLoS"/><category term="Knowledge Graph"/><category term="LSID"/><category term="TDWG"/><category term="API"/><category term="Genbank"/><category term="Google Maps"/><category term="Linked data"/><category term="SVG"/><category term="annotation"/><category term="biodiversity informatics"/><category term="guest post"/><category term="linking"/><category term="search"/><category term="Elsevier"/><category term="ION"/><category term="OCR"/><category term="classification"/><category term="data"/><category term="georeferencing"/><category term="javascript"/><category term="trees"/><category term="SPARQL"/><category term="Wikispecies"/><category term="bioguid"/><category 
term="iSpecies"/><category term="RSS"/><category term="XML"/><category term="ZooKeys"/><category term="data cleaning"/><category term="matching"/><category term="twitter"/><category term="GUIDs"/><category term="Nature"/><category term="specimen codes"/><category term="taxonomic name"/><category term="treemap"/><category term="DjVu"/><category term="NHM"/><category term="ORCID"/><category term="PDF"/><category term="error"/><category term="interface"/><category term="open data"/><category term="publication"/><category term="specimens"/><category term="Cloudant"/><category term="Creative Commons"/><category term="IPNI"/><category term="JSON"/><category term="Open access"/><category term="Zootaxa"/><category term="article 2.0"/><category term="dark taxa"/><category term="duplicates"/><category term="e-Biosphere"/><category term="mashup"/><category term="names"/><category term="rant"/><category term="ALA"/><category term="DataCite"/><category term="FigShare"/><category term="JSON-LD"/><category term="Mediawiki"/><category term="cloud"/><category term="integration"/><category term="phyloinformatics"/><category term="possible project"/><category term="preprint"/><category term="uBio"/><category term="Australian Faunal Directory"/><category term="BBC"/><category term="Bibliography of Life"/><category term="Bob Mesibov"/><category term="Darwin Core Archive"/><category term="Encylcopedia of Life"/><category term="Handles"/><category term="JSTOR"/><category term="Tree of Life"/><category term="Web Hooks"/><category term="WikiCite"/><category term="Xanadu"/><category term="Zotero"/><category term="database"/><category term="hypothes.is"/><category term="iPhone"/><category term="machine learning"/><category term="screencast"/><category term="talk"/><category term="touch"/><category term="video"/><category term="zoom"/><category term="3D"/><category term="BLAST"/><category term="GBIC2012"/><category term="Google books"/><category term="Graphviz"/><category 
term="ICZN"/><category term="Mac OS X"/><category term="Nature Precedings"/><category term="OAuth"/><category term="PhyLoTA"/><category term="URI"/><category term="XSLT"/><category term="Yahoo"/><category term="Zenodo"/><category term="ZooBank"/><category term="blogs"/><category term="code"/><category term="demo"/><category term="errors"/><category term="fail"/><category term="geocoding"/><category term="geophylogeny"/><category term="gist"/><category term="hOCR"/><category term="mammals"/><category term="note to self"/><category term="pagerank"/><category term="software"/><category term="tags"/><category term="taxonomic concept"/><category term="taxonomic databases"/><category term="text mining"/><category term="timeline"/><category term="tree"/><category term="workshop"/><category term="Apple"/><category term="Atlas of Living Australia"/><category term="BHL-Europe"/><category term="COinS"/><category term="CiteBank"/><category term="DBpedia"/><category term="Darwin Core riplet"/><category term="Dryad"/><category term="ElasticSearch"/><category term="EvolDir"/><category term="Facebook"/><category term="Freebase"/><category term="Gene Wiki"/><category term="GeoJSON"/><category term="GeoRSS"/><category term="Google Docs"/><category term="Google Earth"/><category term="Google Scholar"/><category term="HTTP URI"/><category term="Handle"/><category term="ISSN"/><category term="ITIS"/><category term="KML"/><category term="Kew"/><category term="Leaflet"/><category term="Linkout"/><category term="Microsoft"/><category term="Neo4J"/><category term="OBIS"/><category term="Ozymandias"/><category term="PHP"/><category term="PLoS Hubs"/><category term="Papers"/><category term="Plazi"/><category term="PubMed Central"/><category term="SICI"/><category term="TAXACOM"/><category term="TBMap"/><category term="UUID"/><category term="Uniprot"/><category term="aggregation"/><category term="ajax"/><category term="algorithm"/><category term="bibliometrics"/><category 
term="biodiversity"/><category term="chameleons"/><category term="citation matching"/><category term="clustering"/><category term="data mining"/><category term="design"/><category term="digitisation"/><category term="ePub"/><category term="frogs"/><category term="history flow"/><category term="iEvoBio"/><category term="impact"/><category term="impact factor"/><category term="iphylo"/><category term="jQueryMobile"/><category term="library"/><category term="links"/><category term="lucene"/><category term="mapping"/><category term="maps"/><category term="markdown"/><category term="microcitations"/><category term="modelling"/><category term="museum"/><category term="nanopublication"/><category term="nomenclators"/><category term="plans"/><category term="platform"/><category term="podcast"/><category term="pro-iBiosphere"/><category term="quantum treemap"/><category term="services"/><category term="specimen"/><category term="stackoverflow"/><category term="sucks"/><category term="synonyms"/><category term="tagging"/><category term="user interface"/><category term="version control"/><category term="vision"/><category term="vizbi"/><category term="&quot;author names&quot;"/><category term="&quot;web service&quot;"/><category term="AFD"/><category term="AMNH"/><category term="Apache"/><category term="BIOONE"/><category term="BMC Bioinformatics"/><category term="Biodiversity Data Journal"/><category term="Bitcoin"/><category term="Bouchout Declaration"/><category term="C++"/><category term="CRF"/><category term="CSL"/><category term="CSS"/><category term="Connotea"/><category term="DSpace"/><category term="DeepDyve"/><category term="DiGIR"/><category term="Docker"/><category term="Dublin Core"/><category term="EAV"/><category term="Flickr"/><category term="Google Code"/><category term="Google Refine"/><category term="GrandChallenge"/><category term="GraphQL"/><category term="IFTTT"/><category term="IRMNG"/><category term="ISNI"/><category term="IUCN"/><category term="Index 
Fungorum"/><category term="Internet Explorer"/><category term="JACC"/><category term="JATS"/><category term="JournalMap"/><category term="MIT"/><category term="Mammal Species of the World"/><category term="Mesquite"/><category term="MySQL"/><category term="NDE"/><category term="NESCent"/><category term="NLM DTD"/><category term="NSF"/><category term="OZCAM"/><category term="Open Calais"/><category term="Open Science"/><category term="Open source"/><category term="PRISM"/><category term="Pensoft"/><category term="Perceptive Pixel"/><category term="Plant List"/><category term="PubMed"/><category term="PygmyBrowse"/><category term="Semantic Mediawiki"/><category term="Social media"/><category term="TaxPub"/><category term="TaxonRank"/><category term="Ted Nelson"/><category term="Tony Rees"/><category term="TreeView"/><category term="VIAF"/><category term="Vast Machine"/><category term="Vista"/><category term="Windows"/><category term="Wired"/><category term="WoRMS"/><category term="WorldCat"/><category term="XMP"/><category term="ai"/><category term="altmetric"/><category term="altmetrics"/><category term="ants"/><category term="bats"/><category term="bibliographies"/><category term="biogeography"/><category term="birds"/><category term="bit.ly"/><category term="book"/><category term="browser"/><category term="business model"/><category term="cluster maps"/><category term="community"/><category term="conference"/><category term="copyright"/><category term="crowdsourcing"/><category term="deep zoom"/><category term="difference"/><category term="distribution"/><category term="eLife"/><category term="glitch"/><category term="hack"/><category term="hack4knowledge"/><category term="host"/><category term="iTunes"/><category term="identifier"/><category term="index"/><category term="interview"/><category term="license"/><category term="literature"/><category term="markup"/><category term="material examined"/><category term="md5"/><category term="metacrap"/><category 
term="metagenomics"/><category term="metrics"/><category term="microservices"/><category term="millipedes"/><category term="museum collections"/><category term="ngram"/><category term="obsidian"/><category term="panbiogeography"/><category term="parasite"/><category term="parsing"/><category term="plants"/><category term="power law"/><category term="programming"/><category term="published"/><category term="publishing"/><category term="ranking"/><category term="reconciliation"/><category term="reliability"/><category term="replication"/><category term="scraping"/><category term="scratchpads"/><category term="space tree"/><category term="spider"/><category term="symbiome"/><category term="synonymy"/><category term="taxon concepts"/><category term="teaching"/><category term="tiles"/><category term="timemap"/><category term="topological sorting"/><category term="touch screen"/><category term="tutorial"/><category term="tvwidget"/><category term="type specimens"/><category term="users"/><category term="visualization"/><category term="vocabulary"/><category term="web services"/><category term="wow"/><category term="zoomify"/><category term="&quot;Guy Kawasaki&quot;"/><category term="&quot;Social Graph API&quot;"/><category term="&quot;data wars&quot;"/><category term="&quot;rock pools&quot;"/><category term="&quot;sea level&quot;"/><category term="&quot;table lens&quot;"/><category term="&quot;word for the day&quot;"/><category term="2010"/><category term="2014"/><category term="ABBYY"/><category term="ATOM"/><category term="AVATOL"/><category term="Aedes"/><category term="AgeNames"/><category term="Amazon"/><category term="Android"/><category term="Andy Mabbett"/><category term="Angelina Jolie"/><category term="AntWeb"/><category term="AppleScript"/><category term="AquaMaps"/><category term="ArcGIS"/><category term="Arctos"/><category term="Arthur C Clarke"/><category term="Asterophrys leucopus"/><category term="Atypon"/><category term="Auckland"/><category 
term="Australian Systematic Botany"/><category term="BHL Africa"/><category term="BHL in a box"/><category term="BLR"/><category term="BMC"/><category term="BNCOD2008"/><category term="BOLD"/><category term="Begonia"/><category term="BibJSON"/><category term="Bio2RDF"/><category term="BioStar"/><category term="Blackwell"/><category term="Blue Planet"/><category term="Bowker"/><category term="British Library"/><category term="Broad Institute"/><category term="CD"/><category term="CERN"/><category term="CGI"/><category term="CVS"/><category term="Carmen Electra"/><category term="CartoDB"/><category term="Chaerephon"/><category term="Charles Sherbon"/><category term="Choerephon"/><category term="Chromis"/><category term="Chærephon"/><category term="Chœrephon"/><category term="CiNii"/><category term="Cisco"/><category term="Cladistics"/><category term="Clay Shirky"/><category term="Cool URIs"/><category term="Cooliris"/><category term="D3"/><category term="DAG"/><category term="DOI trees"/><category term="Darwin"/><category term="David Remsen"/><category term="David Shorthouse"/><category term="Defra"/><category term="Demansia"/><category term="Disqus"/><category term="Drupal"/><category term="E O Wilson"/><category term="ENCODE"/><category term="Earth Microbiome Project"/><category term="Edinburgh"/><category term="Edward Taylor"/><category term="Enhydris punctata"/><category term="Entomologica Scandinavica"/><category term="Europe"/><category term="Evolution2010"/><category term="F1000"/><category term="FAIR"/><category term="Fedora"/><category term="Firefox"/><category term="FishBase"/><category term="Fitzalania"/><category term="Flipboard"/><category term="Fluidinfo"/><category term="FundRef"/><category term="Fungi"/><category term="GB21"/><category term="GBIC"/><category term="GBIO"/><category term="GGBN"/><category term="GIS"/><category term="GML"/><category term="GPI"/><category term="GPS"/><category term="Gallica"/><category term="GeoCouch"/><category 
term="Glassella"/><category term="Global Forest Watch"/><category term="Global Plant Initiative"/><category term="God"/><category term="Gondwana"/><category term="Google Analytics"/><category term="Google Charts"/><category term="Google Spreadsheets"/><category term="GrBio"/><category term="Greasemonkey"/><category term="Gregg&#39;s paradox"/><category term="H1N1"/><category term="HS_ALIAS"/><category term="Haystack"/><category term="Hendy"/><category term="Heroku"/><category term="Hipponix"/><category term="Hipponyx"/><category term="Holly Bik"/><category term="HomeBrew"/><category term="Hoolock"/><category term="Hoplocephalus"/><category term="Human trafficking"/><category term="Hurlbert&#39;s index"/><category term="Ideator"/><category term="Insect Systematics and Evolution"/><category term="J-Stage"/><category term="JSONP"/><category term="Japanese"/><category term="Knowledge Navigator"/><category term="LSIDs"/><category term="Leptograpsus"/><category term="Life and Literature"/><category term="Lifemapper"/><category term="Linux"/><category term="MOBOT"/><category term="MPE"/><category term="MSW"/><category term="Mac"/><category term="Mac OSX"/><category term="MacClade"/><category term="Map of Life"/><category term="MarkMail"/><category term="Meier"/><category term="Mekong River Schistosomiasis"/><category term="Melbourne"/><category term="Mesibov"/><category term="Metacafe"/><category term="Micropayment"/><category term="Miller column"/><category term="Molossidae"/><category term="Mount Mabu"/><category term="Mus rutilans"/><category term="NASA"/><category term="NEXUS"/><category term="NGS"/><category term="New Category"/><category term="NoCode"/><category term="NoSQL"/><category term="Nomenclator Zoologicus"/><category term="Nuytsia"/><category term="OAI"/><category term="OCLC"/><category term="ODI"/><category term="OTU"/><category term="OneZoom"/><category term="Open Acess"/><category term="Open Refine"/><category term="Open Tree of Life"/><category 
term="OpenHandle"/><category term="OpenRef"/><category term="OpenRefine"/><category term="OpenStreetMap"/><category term="PLoS Currents"/><category term="PLoS Currents Tree of Life"/><category term="PMID"/><category term="Pagodabox"/><category term="PaleoBioDB"/><category term="Pando"/><category term="PaperID"/><category term="Parallels"/><category term="Penny"/><category term="Perl"/><category term="Peter Norvig"/><category term="Philautus"/><category term="Photosynth"/><category term="Phthiraptera"/><category term="PhyloWS"/><category term="Phytotaxa"/><category term="Pinnixa"/><category term="Pinnotheridae"/><category term="Pit Stop"/><category term="Poly9"/><category term="Prezi"/><category term="Pristimantis"/><category term="ProjectEvoMap"/><category term="Proxy"/><category term="Pteralopex"/><category term="PubPeer"/><category term="Pyramica"/><category term="QR Codes"/><category term="Quora"/><category term="R-tree"/><category term="RAxML"/><category term="ROR"/><category term="RTFM"/><category term="RabbitResearch"/><category term="Raorchestes"/><category term="Raymondia"/><category term="Raymondionymus"/><category term="ReCon16"/><category term="ReaderMeter"/><category term="Readmill"/><category term="Red List"/><category term="Rimba Raya"/><category term="Roger Hyam"/><category term="Ross Mounce"/><category term="Rwanda"/><category term="SEALS"/><category term="SEO"/><category term="SKOS"/><category term="SOAP"/><category term="Science 2.0"/><category term="Science Commons"/><category term="Scispace"/><category term="Scott Federhen"/><category term="Scripting life"/><category term="Sherborn"/><category term="Singapore"/><category term="Solr"/><category term="Species-ID"/><category term="Stamen"/><category term="State of the World&#39;s Plants"/><category term="Stephen Colbert"/><category term="Steve Jobs"/><category term="Strumigenys"/><category term="Sun"/><category term="SysTax"/><category term="Systematic Biology"/><category term="TAPIR"/><category 
term="TV"/><category term="Talk Science"/><category term="Tasmania"/><category term="Taxobox"/><category term="Taylor and Francis"/><category term="Thamnomys rutilans"/><category term="The Plant List"/><category term="TinyURL"/><category term="Top 10"/><category term="TraitBank"/><category term="TreeView X"/><category term="Trove"/><category term="UFT-8"/><category term="URL shortening"/><category term="URN"/><category term="USIN"/><category term="USNM"/><category term="UTF8"/><category term="UTM grid reference"/><category term="Unicorn"/><category term="University of Alaska"/><category term="Venter"/><category term="Veridium"/><category term="VertNet"/><category term="ViBRANT"/><category term="Vince Smith"/><category term="Wallace"/><category term="Wardley map"/><category term="WebDAV"/><category term="Webdot"/><category term="Wellcome"/><category term="Wikibase"/><category term="Wikiometrics"/><category term="Wikisource"/><category term="Wiley"/><category term="Wine"/><category term="Wordle"/><category term="ZFMK"/><category term="Zemanta"/><category term="Zika virus"/><category term="alignment-free phylogeny"/><category term="amber"/><category term="apps"/><category term="art"/><category term="articles"/><category term="author names"/><category term="authorship"/><category term="background"/><category term="bibliographic coupling"/><category term="big data"/><category term="billion"/><category term="bioRxiv"/><category term="bioinformatics"/><category term="biomedical"/><category term="bookmarklets"/><category term="books"/><category term="bryozoa"/><category term="c-squares"/><category term="canonical name"/><category term="canvas"/><category term="carbon offsets"/><category term="cards"/><category term="career suicide"/><category term="character encoding"/><category term="chresonym"/><category term="citation context"/><category term="citation mutation"/><category term="citation needed"/><category term="citations"/><category term="citekey"/><category 
term="climate"/><category term="close to the bone"/><category term="clusterfuck"/><category term="co-occurrence"/><category term="collaboration"/><category term="collections"/><category term="compiling"/><category term="computers"/><category term="conservation status"/><category term="containers"/><category term="contest"/><category term="conversation"/><category term="coronavirus"/><category term="cospeciation"/><category term="coverage"/><category term="crash"/><category term="crazy"/><category term="cryptocurrency"/><category term="curation"/><category term="cyberscience"/><category term="d3js"/><category term="dashboard"/><category term="data coupling"/><category term="data entry"/><category term="data grief"/><category term="data integration"/><category term="data model"/><category term="data preservation"/><category term="datasette"/><category term="dechronization"/><category term="deduplication"/><category term="dggs"/><category term="digital humanities"/><category term="digital library"/><category term="digitising"/><category term="dimly lit taxa"/><category term="dinosaurs"/><category term="disaambiguation"/><category term="displacement"/><category term="document layout"/><category term="dogfooding"/><category term="domain names"/><category term="dot"/><category term="duplication"/><category term="eLife Lens"/><category term="ebio09"/><category term="editing"/><category term="encoding"/><category term="environmental DNA"/><category term="equirectangular projection"/><category term="evolutionary biology"/><category term="exhaustion"/><category term="expert"/><category term="extensions"/><category term="extraction"/><category term="failure"/><category term="false positive"/><category term="family"/><category term="fictional taxa"/><category term="filesystem"/><category term="filtered-push"/><category term="flow"/><category term="folksonomy"/><category term="font"/><category term="fonts"/><category term="forking data"/><category term="fossil"/><category 
term="frankenplace"/><category term="freemium"/><category term="future"/><category term="games"/><category term="gaps"/><category term="generous interface"/><category term="genus"/><category term="geography"/><category term="gibbons"/><category term="git"/><category term="grant"/><category term="graph"/><category term="graph database"/><category term="grid"/><category term="h-score"/><category term="hackathon"/><category term="half-baked"/><category term="help"/><category term="holotypes"/><category term="homonym"/><category term="hyperbolic tree"/><category term="iBOL"/><category term="iBook"/><category term="iBooks"/><category term="iCal"/><category term="iDigBio"/><category term="iNaturalist"/><category term="iSpiders"/><category term="icon"/><category term="idea"/><category term="identfiiers"/><category term="identiifers"/><category term="identity"/><category term="ideology"/><category term="images"/><category term="imagination"/><category term="indirection"/><category term="jQuery"/><category term="jellyfish"/><category term="journals"/><category term="joy"/><category term="lazy load"/><category term="legacy literature"/><category term="lice"/><category term="ligature"/><category term="linked data. 
Zitgist"/><category term="list"/><category term="liverwort"/><category term="llm"/><category term="logo"/><category term="long tail"/><category term="longest common substring"/><category term="macrobiome"/><category term="macroscope"/><category term="mailing list"/><category term="manifesto"/><category term="map"/><category term="matrix"/><category term="megascience"/><category term="memcached"/><category term="merging"/><category term="microbiome"/><category term="microformat"/><category term="mobile tagging"/><category term="mockup"/><category term="mod_rewrite"/><category term="model"/><category term="mollusc"/><category term="mosquitoes"/><category term="museums"/><category term="n-grams"/><category term="namestream"/><category term="natural language"/><category term="natural language queries"/><category term="navigation"/><category term="new species"/><category term="node.js"/><category term="nomenclature"/><category term="number of species"/><category term="obituary"/><category term="ontology"/><category term="orthographic projection"/><category term="output"/><category term="pPod"/><category term="parallax"/><category term="parasites"/><category term="patent"/><category term="paywall"/><category term="phylgeny"/><category term="phylogenetic diversity"/><category term="phylowidget"/><category term="piracy"/><category term="planet management"/><category term="platforms"/><category term="points"/><category term="polar"/><category term="pollution"/><category term="post-taxonomic"/><category term="postphylogenetics"/><category term="precision"/><category term="predictions"/><category term="presentation"/><category term="press"/><category term="prize"/><category term="rank"/><category term="reCAPTCHA"/><category term="rectangle packing"/><category term="red lionfish"/><category term="regular expression"/><category term="release"/><category term="rent"/><category term="research sprint"/><category term="rewrite"/><category term="rtree"/><category 
term="ruby"/><category term="sailfin lizards"/><category term="schema.org"/><category term="schistosomiasis"/><category term="science"/><category term="serverless"/><category term="shape files"/><category term="silos"/><category term="slideshare"/><category term="snakes"/><category term="social citation"/><category term="social graph"/><category term="sociology"/><category term="sparklines"/><category term="speaking"/><category term="species"/><category term="spelling correction"/><category term="spy"/><category term="squid"/><category term="stained glass"/><category term="startup"/><category term="stratigraphy"/><category term="string"/><category term="structured data"/><category term="success"/><category term="summary trees"/><category term="supertree"/><category term="swine flu"/><category term="systematics"/><category term="table"/><category term="tag tree"/><category term="taxonomic intelligence"/><category term="taxonomic names"/><category term="taxonomists"/><category term="test suite"/><category term="text-mining"/><category term="thesis"/><category term="threads"/><category term="to do"/><category term="transitive reduction"/><category term="treatments"/><category term="tree edit"/><category term="triple store"/><category term="trust"/><category term="twittervision"/><category term="unpaywall"/><category term="use case"/><category term="velcro"/><category term="visulaisation"/><category term="vouchers"/><category term="we feel fine"/><category term="weather"/><category term="web"/><category term="whales"/><category term="white paper"/><category term="wikicite2017"/><category term="wordtrees"/><category term="worm"/><category term="Æ"/><title type='text'>iPhylo</title><subtitle type='html'>Rants, raves (and occasionally considered opinions) on phyloinformatics, taxonomy, and biodiversity informatics.  For more ranty and less considered opinions, see my &lt;a href=&quot;https://twitter.com/rdmpage&quot;&gt;Twitter feed&lt;/a&gt;.&lt;br&gt;ISSN 2051-8188. 
Written content on this site is licensed under a &lt;a href=&quot;https://creativecommons.org/licenses/by/4.0/&quot;&gt;Creative Commons Attribution 4.0 International license&lt;/a&gt;.</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='https://iphylo.blogspot.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default?alt=atom'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><link rel='next' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default?alt=atom&amp;start-index=26&amp;max-results=25'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>869</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-16081779.post-8352553611158999053</id><published>2026-05-13T17:42:20.908+01:00</published><updated>2026-05-13T17:44:05.492+01:00</updated><title type='text'>A new way to view the Tree of Life</title><content type='html'>&lt;p&gt;One of the grand challenges of comparative biology is to assemble the &lt;a href=&quot;https://en.wikipedia.org/wiki/Tree_of_life_(biology)&quot;&gt;“tree of life”&lt;/a&gt;, a diagram that connects all species in a single structure (let’s leave aside for now the question of whether a tree is actually the best representation). 
My goal here is to outline a way of navigating the tree of life, specifically the &lt;a href=&quot;https://opentreeoflife.github.io&quot;&gt;Open Tree of Life&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Given a tree with some 2 million species, the obvious question is how can we visualise it? There are several projects that can accommodate trees of this size, such as Vienne’s &lt;a href=&quot;https://lifemap.cnrs.fr&quot;&gt;LifeMap&lt;/a&gt;, Rosindell’s &lt;a href=&quot;https://www.onezoom.org&quot;&gt;OneZoom&lt;/a&gt;, and &lt;a href=&quot;https://www.taxonium.org&quot;&gt;Taxonium&lt;/a&gt;. Each of these viewers is impressive in its own way, but in my opinion each has problems. LifeMap treats the tree as a static structure in 2D space and uses &lt;a href=&quot;https://en.wikipedia.org/wiki/Tiled_web_map&quot;&gt;tiles&lt;/a&gt; to enable the user to zoom in and out in the same way we navigate a digital map. Because trees are mostly empty space it is easy to get lost. OneZoom uses an almost hypnotic fractal tree layout, coupled with zooming in and out - a similar approach to LifeMap but with a different way to render the tree. It is fun, but the fractal pattern distorts aspects of the tree. Taxonium takes a different approach: the complete tree is rendered in 2D and is uniformly zoomed on the y-axis, stretching it out.&lt;/p&gt;
&lt;p&gt;None of these projects has felt satisfactory to me. They often don’t use the screen area efficiently, labels can be hard to read, and they treat tree visualisation as simply scaling or stretching a fixed layout. Open Tree of Life itself has a viewer that tries a different approach to showing the tree, collapsing various nodes, but it feels clunky in comparison to the other viewers. This is a pity, because the Open Tree of Life is a fascinating project, a supertree that is regularly(ish) updated with new phylogenies, and which links to evidence for each node in that tree.&lt;/p&gt;
&lt;p&gt;For a while I’ve been exploring a method called &lt;a href=&quot;https://iphylo.blogspot.com/2021/05/maximum-entropy-summary-trees-to.html&quot;&gt;summary trees&lt;/a&gt; to display large trees, such as taxonomic classifications (based on work by Karloff and Shirley). The key feature of a summary tree is that you collapse a tree to a specified number of nodes (or leaves), which means you can ensure that the tree fits into your display space, and hence that all labels are legible. The trick is to figure out what nodes to collapse. I’ve used the approach of Libin et al. that partitions a tree based on a score given to each node.&lt;/p&gt;
&lt;p&gt;This is a nice idea, but if you fit the tree of life into a browser window, say, 30 lines high, then how do you see the rest of the tree? One approach would be to treat growing the tree as a form of zooming, so that one level of zoom would grow the tree to twice the size, and so on, and you would then have to pan to see the whole tree. I think this has potential for individual phylogenies, but for really big trees you just end up getting lost.&lt;/p&gt;
&lt;p&gt;Instead, what if you clicked on a node in the tree and that node became the root of a new tree that you could explore, and that tree would be guaranteed to fit in your window? So you browse through the tree, making different parts fan out or collapse as needed.&lt;/p&gt;
&lt;p&gt;This seemed appealing, but animating the transition between trees felt rather beyond my programming skills… so I asked ChatGPT and Claude for help. Part of the challenge to problem solving is understanding what the actual problem is. ChatGPT introduced me to the idea of a “transition scene” where you have the before and after trees, and you compute how one transforms into the other. Claude Code made this a reality, and now I could smoothly navigate around the tree. Obviously, starting at the root of the whole tree every time would get tedious, so I added a simple search tool to find a node in the tree to start from.&lt;/p&gt;
&lt;p&gt;So we have the notion of collapsing a tree to a given size (summary trees), a way to decide what nodes to collapse (a combination of a scoring scheme and a &lt;a href=&quot;https://en.wikipedia.org/wiki/Priority_queue&quot;&gt;priority queue&lt;/a&gt;), and we use transition scenes to move between trees. You can see the result of all this here: &lt;a href=&quot;https://iphylo.org/ott-viewer&quot;&gt;https://iphylo.org/ott-viewer&lt;/a&gt;.&lt;/p&gt;


&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjJhZ0e2tPuDlu3vEiD_7p2bkqxbnmGxnm5KQu_wUX871jBZ36YVfVlBOJtrTjSXjDYiftPgR8vxh6h1LfreZEv0o3D86tQwjPGs9iAIq1hxaoYe06OudxXLov_7AjTwo1plP9qtEUYQ78vB7lxfnbWezzMzgq6Km7lk0psoia2Nvgdz_ctNLIW/s1904/ott241840.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1780&quot; data-original-width=&quot;1904&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjJhZ0e2tPuDlu3vEiD_7p2bkqxbnmGxnm5KQu_wUX871jBZ36YVfVlBOJtrTjSXjDYiftPgR8vxh6h1LfreZEv0o3D86tQwjPGs9iAIq1hxaoYe06OudxXLov_7AjTwo1plP9qtEUYQ78vB7lxfnbWezzMzgq6Km7lk0psoia2Nvgdz_ctNLIW/s400/ott241840.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;


&lt;p&gt;Having got a browseable tree working, the next issue is how do you go “back”, and what does “going back” even mean? We can wire up the browser’s back button to take you back to the previous tree, but I wanted something more. I’d come across a paper that described “Hoptrees” which shows your navigation history not as a simple linear list of where you have been, but arranges that history as a tree. This felt like a natural fit for navigating the tree of life, and hence above the tree you will see your navigation history as a simplified version of the larger tree.&lt;/p&gt;
&lt;p&gt;As always there is more that could be done, but this feels like a natural stopping point. The tree browser works, and when I use it I spend less time thinking about the interface and more about the relationships in the tree, and that feels as it should be.&lt;/p&gt;
&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
&lt;p&gt;Brooks, M., West, J. D., Aragon, C. R., &amp;amp; Bergstrom, C. T. (2013). Hoptrees: Branching History Navigation for Hierarchies. In P. Kotzé, G. Marsden, G. Lindgaard, J. Wesson, &amp;amp; M. Winckler (Eds),  &lt;em&gt;Human-Computer Interaction – INTERACT 2013&lt;/em&gt;  (pp. 316–333). Springer.  &lt;a href=&quot;https://doi.org/10.1007/978-3-642-40477-1_20&quot;&gt;https://doi.org/10.1007/978-3-642-40477-1_20&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Karloff, H., &amp;amp; Shirley, K. E. (2013). Maximum Entropy Summary Trees.  &lt;em&gt;Computer Graphics Forum&lt;/em&gt;,  &lt;em&gt;32&lt;/em&gt;(3pt1), 71–80.  &lt;a href=&quot;https://doi.org/10.1111/cgf.12094&quot;&gt;https://doi.org/10.1111/cgf.12094&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Libin, P., Vanden Eynden, E., Incardona, F., Nowé, A., Bezenchek, A., EucoHIV Study Group, Sönnerborg, A., Vandamme, A.-M., Theys, K., &amp;amp; Baele, G. (2017). PhyloGeoTool: Interactively exploring large phylogenies in an epidemiological context.  &lt;em&gt;Bioinformatics&lt;/em&gt;,  &lt;em&gt;33&lt;/em&gt;(24), 3993–3995.  &lt;a href=&quot;https://doi.org/10.1093/bioinformatics/btx535&quot;&gt;https://doi.org/10.1093/bioinformatics/btx535&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Page, R. D. (2012). Space, time, form: Viewing the Tree of Life.  &lt;em&gt;Trends in Ecology &amp;amp; Evolution&lt;/em&gt;,  &lt;em&gt;27&lt;/em&gt;(2), 113–120.&lt;/p&gt;
&lt;p&gt;Sanderson, T. (2022). Taxonium, a web-based tool for exploring large phylogenetic trees.  &lt;em&gt;eLife&lt;/em&gt;,  &lt;em&gt;11&lt;/em&gt;, e82392.  &lt;a href=&quot;https://doi.org/10.7554/eLife.82392&quot;&gt;https://doi.org/10.7554/eLife.82392&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;De Vienne, D. M. (2016). Lifemap: Exploring the Entire Tree of Life.  &lt;em&gt;PLOS Biology&lt;/em&gt;,  &lt;em&gt;14&lt;/em&gt;(12), e2001624.  &lt;a href=&quot;https://doi.org/10.1371/journal.pbio.2001624&quot;&gt;https://doi.org/10.1371/journal.pbio.2001624&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Wong, Y., &amp;amp; Rosindell, J. (2022). Dynamic visualisation of million‐tip trees: The OneZoom project.  &lt;em&gt;Methods in Ecology and Evolution&lt;/em&gt;,  &lt;em&gt;13&lt;/em&gt;(2), 303–313.  &lt;a href=&quot;https://doi.org/10.1111/2041-210X.13766&quot;&gt;https://doi.org/10.1111/2041-210X.13766&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/8352553611158999053'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/8352553611158999053'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2026/05/a-new-way-to-view-tree-of-life.html' title='A new way to view the Tree of Life'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjJhZ0e2tPuDlu3vEiD_7p2bkqxbnmGxnm5KQu_wUX871jBZ36YVfVlBOJtrTjSXjDYiftPgR8vxh6h1LfreZEv0o3D86tQwjPGs9iAIq1hxaoYe06OudxXLov_7AjTwo1plP9qtEUYQ78vB7lxfnbWezzMzgq6Km7lk0psoia2Nvgdz_ctNLIW/s72-c/ott241840.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-2421586361991997402</id><published>2026-05-04T12:30:20.960+01:00</published><updated>2026-05-08T10:51:57.173+01:00</updated><title type='text'>Alpha shapes and DNA barcoding</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2026). Alpha shapes and DNA barcoding. &lt;a href=&quot;https://doi.org/10.59350/qx8j9-vam77&quot;&gt;https://doi.org/10.59350/qx8j9-vam77&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;DNA barcoding generates a lot of specimen data with geographical coordinates (see for example  &lt;a href=&quot;https://iphylo.blogspot.com/2013/12/guest-post-response-to-genbank-data-on.html&quot;&gt;Guest post: response to “Putting GenBank Data on the Map”&lt;/a&gt;). The question naturally arises: “how accurate are those coordinates?”.&lt;/p&gt;
&lt;p&gt;Browsing the BOLD database using &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com&quot;&gt;BOLD View&lt;/a&gt; I often come across sequences whose coordinates are labelled “Coordinates from country centroid”, so these may bear little relation to where the specimen was actually collected. But how can we assess the accuracy of other coordinates?&lt;/p&gt;
&lt;p&gt;Inspired by a 2008 Flickr blog post &lt;a href=&quot;https://code.flickr.net/2008/10/30/the-shape-of-alpha/&quot;&gt;The Shape of Alpha&lt;/a&gt; I decided to create plots of the distribution of geotagged specimens in the BOLD database, grouped by geographic level. For example, we could aggregate all points labelled as being from the country “India”, then subset those into points labelled as being from various regions within India, and so on down the geographic hierarchy implied by country, province, etc. Rather than plot all the points, I decided to summarise them using the same approach Flickr used, we enclose the points in an &lt;a href=&quot;https://en.wikipedia.org/wiki/Alpha_shape&quot;&gt;alpha shape&lt;/a&gt;. Below are examples for India.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjt_TiX10YpmYTG8qoEhL06XtwHeGpDKYZ6BEJ0yWeerrhDCAlPSYZt44MDqEXaL8kmCk1ftsNXt_NluBwxncmiGZZ6yk7z-g8LErOyQvJkDG4mzsj6jaBsUWnjOFzRs0hLNw6QGydu7RilgQu4_ul_No6qTh-pxCR7dbF1H8PWr3knGPtoifMb/s1732/Screenshot%202026-05-04%20at%2012.17.02.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1632&quot; data-original-width=&quot;1732&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjt_TiX10YpmYTG8qoEhL06XtwHeGpDKYZ6BEJ0yWeerrhDCAlPSYZt44MDqEXaL8kmCk1ftsNXt_NluBwxncmiGZZ6yk7z-g8LErOyQvJkDG4mzsj6jaBsUWnjOFzRs0hLNw6QGydu7RilgQu4_ul_No6qTh-pxCR7dbF1H8PWr3knGPtoifMb/s400/Screenshot%202026-05-04%20at%2012.17.02.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiHzsA_j7ucjKB1U9exdbi5z7EJkli3ANi6_VHkReRgLtWHlvo-wM4UQXE1NSnELxfUI4ViOizpwIvJONF2AsWI3TX7LoXPxK9pic1HQWNfoQ5hXCOs9l175jcaG_btXOaNay-9qsHPOSWabAdujbtirXVx73CPDPVEV5GYFuCRgb-_b01exx6d/s1730/Screenshot%202026-05-04%20at%2012.17.34.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1628&quot; data-original-width=&quot;1730&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiHzsA_j7ucjKB1U9exdbi5z7EJkli3ANi6_VHkReRgLtWHlvo-wM4UQXE1NSnELxfUI4ViOizpwIvJONF2AsWI3TX7LoXPxK9pic1HQWNfoQ5hXCOs9l175jcaG_btXOaNay-9qsHPOSWabAdujbtirXVx73CPDPVEV5GYFuCRgb-_b01exx6d/s400/Screenshot%202026-05-04%20at%2012.17.34.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;The two maps differ in how closely the curve fits the points, which is determined by the value of alpha (&lt;em&gt;α&lt;/em&gt;) used to compute the shape. The smaller the value the tighter the fit. The first map used &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/geo.php?country=IN&amp;amp;marker=COI-5P&amp;amp;alpha=0.3&amp;amp;limit=10000&quot;&gt;&lt;em&gt;α&lt;/em&gt;=0.3&lt;/a&gt; and is fairly coarse, with &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/geo.php?country=IN&amp;amp;marker=COI-5P&amp;amp;alpha=0.1&amp;amp;limit=10000&quot;&gt;&lt;em&gt;α&lt;/em&gt;=0.1&lt;/a&gt;  we see the alpha shape skirts around Bangladesh, and is hence a better representation of the boundary of India.&lt;/p&gt;
&lt;p&gt;The original Flickr blog post was showing how well geotagged photographs on Flickr were tracing out geographical areas. From my perspective, one reason to make these maps is to spot problematic records. For example, the map for &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/geo.php?country=AU&amp;amp;province=Tasmania&quot;&gt;Tasmania&lt;/a&gt; looks a bit strange. There are records on the Australian mainland, and Lord Howe and Macquarie Islands that clearly aren’t from “Tasmania”. Maybe the coordinates are wrong, maybe the placename is wrong? Either way, we now have some records to investigate.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgTn2SqvXmCnxtHMuv3sjistxNfSLim7dFCAO1w1ewB-dCk95p9uyPKyQWYa6tIN9IbHHAF6ImmtiJbjPSe8i2ywHo2OveEkkMSR962mIsHApytH9tbetcQcTKz8TGfzCBiEa0VPn2EVoP3rcUNx7XSxA80vnZ8VpZqRbNJUwjc_HZ4Zi1IYFPd/s1728/Screenshot%202026-05-04%20at%2012.23.38.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1618&quot; data-original-width=&quot;1728&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgTn2SqvXmCnxtHMuv3sjistxNfSLim7dFCAO1w1ewB-dCk95p9uyPKyQWYa6tIN9IbHHAF6ImmtiJbjPSe8i2ywHo2OveEkkMSR962mIsHApytH9tbetcQcTKz8TGfzCBiEa0VPn2EVoP3rcUNx7XSxA80vnZ8VpZqRbNJUwjc_HZ4Zi1IYFPd/s400/Screenshot%202026-05-04%20at%2012.23.38.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This project is live on the &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/geo.php&quot;&gt;BOLD View web site&lt;/a&gt;, it was mostly written using Claude Code, making use of the GIS features in Postgres. It is an example of how easy AI tools make it to do some quick exploration of an idea (in this case, something inspired by a blog post that is nearly twenty years old).&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2421586361991997402'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2421586361991997402'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2026/05/alpha-shapes-and-dna-barcoding.html' title='Alpha shapes and DNA barcoding'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjt_TiX10YpmYTG8qoEhL06XtwHeGpDKYZ6BEJ0yWeerrhDCAlPSYZt44MDqEXaL8kmCk1ftsNXt_NluBwxncmiGZZ6yk7z-g8LErOyQvJkDG4mzsj6jaBsUWnjOFzRs0hLNw6QGydu7RilgQu4_ul_No6qTh-pxCR7dbF1H8PWr3knGPtoifMb/s72-c/Screenshot%202026-05-04%20at%2012.17.02.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-1698997431278161663</id><published>2026-03-18T12:38:00.969+00:00</published><updated>2026-05-08T10:52:15.565+01:00</updated><title type='text'>SimpleMappr is dead, long live SimpleMappr?</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2026). SimpleMappr is dead, long live SimpleMappr? &lt;a href=&quot;https://doi.org/10.59350/20dk7-8ns92&quot;&gt;https://doi.org/10.59350/20dk7-8ns92&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;David Shorthouse, perhaps best known for his fabulous &lt;a href=&quot;https://bionomia.net&quot;&gt;Bionomia&lt;/a&gt; project is also the author of &lt;a href=&quot;https://www.simplemappr.net&quot;&gt;SimpleMappr&lt;/a&gt;, a web site for generating publication-ready species distribution maps. These maps have appeared in many publications, and also pop up in &lt;a href=&quot;https://www.inaturalist.org/posts/73525-testing-simplemappr&quot;&gt;iNaturalist&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;David has &lt;a href=&quot;https://tinyurl.com/simplemappr&quot;&gt;announced&lt;/a&gt;  that SimpleMappr will be turned off. Obviously not an easy decision for him to take, and sadly yet more evidence of the fragility of a lot of taxonomic infrastructure (as seen in the struggles of both BHL and TAXACOM).&lt;/p&gt;
&lt;p&gt;I don’t use SimpleMappr, but I know that lots of people do, and so I wondered how easy it would be to create a new version (based on David’s code) that could be hosted either on a central site or on people’s own computers. The short answer is that it is “easy”, so long as your definition of “easy” includes (a) getting Claude Code to do the bulk of the work, and (b) ignoring most of the more sophisticated features of David’s app.&lt;/p&gt;
&lt;p&gt;Long story short, I have a (somewhat) working version of SimpleMappr running on a cloud server at &lt;a href=&quot;https://simplemappr.cloud&quot;&gt;https://simplemappr.cloud&lt;/a&gt;, source code &lt;a href=&quot;https://github.com/rdmpage/simplemappr-dkr&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgLmfexsd-dA7WzPCKinxn_kHrOI1dw_kff4X7xCq181qnxIY2DZ0lvIIrgvymHU9bQxe9L6wIt7UtFENDNz-IozIZWgnXDI6hgKDR_TsDRwN5woSiIaVg5mStktzOBMY0Ep3oJ5OHTpavVVH88bUCKY6dTjAUFYnIo-hdQu9f_V3qrGaRvdmVh/s2230/Screenshot%202026-03-16%20at%2015.57.52.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;100%&quot; data-original-height=&quot;1750&quot; data-original-width=&quot;2230&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgLmfexsd-dA7WzPCKinxn_kHrOI1dw_kff4X7xCq181qnxIY2DZ0lvIIrgvymHU9bQxe9L6wIt7UtFENDNz-IozIZWgnXDI6hgKDR_TsDRwN5woSiIaVg5mStktzOBMY0Ep3oJ5OHTpavVVH88bUCKY6dTjAUFYnIo-hdQu9f_V3qrGaRvdmVh/s600/Screenshot%202026-03-16%20at%2015.57.52.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;As with &lt;a href=&quot;https://iphylo.blogspot.com/2026/03/using-ai-to-revive-macos-app-to-preview.html&quot;&gt;my previous post&lt;/a&gt;, this project involved forking &lt;a href=&quot;https://github.com/dshorthouse/SimpleMappr&quot;&gt;the original code&lt;/a&gt;, asking Claude to read it, and sketch out a way to move it to a more robust setting, in this case using Docker containers. Early days, but I am delighted how easy (for various values of “easy”) it is to breathe new life into old projects.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/1698997431278161663'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/1698997431278161663'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2026/03/simplemappr-is-dead-long-live.html' title='SimpleMappr is dead, long live SimpleMappr?'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgLmfexsd-dA7WzPCKinxn_kHrOI1dw_kff4X7xCq181qnxIY2DZ0lvIIrgvymHU9bQxe9L6wIt7UtFENDNz-IozIZWgnXDI6hgKDR_TsDRwN5woSiIaVg5mStktzOBMY0Ep3oJ5OHTpavVVH88bUCKY6dTjAUFYnIo-hdQu9f_V3qrGaRvdmVh/s72-c/Screenshot%202026-03-16%20at%2015.57.52.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-2697246193398993938</id><published>2026-03-18T12:18:09.612+00:00</published><updated>2026-05-08T10:49:09.657+01:00</updated><title type='text'>Using AI to revive a macOS app to preview GIS files</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2026). Using AI to revive a macOS app to preview GIS files. &lt;a href=&quot;https://doi.org/10.59350/rb118-6m142&quot;&gt;https://doi.org/10.59350/rb118-6m142&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;About a decade ago when I was working with &lt;a href=&quot;https://en.wikipedia.org/wiki/Geographic_information_system&quot;&gt;GIS&lt;/a&gt; files, such as &lt;a href=&quot;https://en.wikipedia.org/wiki/Shapefile&quot;&gt;shapefiles&lt;/a&gt;, there was a nice QuickLook plugin for Macs called GISLook that would show you the corresponding map as an icon.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhQTmq866-LnqGUnOcpXZ8NrxApcAjB7Z1mN1rvH2G6iRqQQV-jWZGUcd45wOf67ooAEWiOtgSFi2I6TtzC6-WfdbJH_7LEEjVtr-jYYRqpRG6ZJoZAXCC1YN56YcLlkGaNOQzxk_CpKEcOGL4xis8nd6xQP0y6SQrrFqIqno6eFe-Vd7F6dw1g/s1134/Screenshot%202026-03-18%20at%2011.54.18.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1134&quot; data-original-width=&quot;828&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhQTmq866-LnqGUnOcpXZ8NrxApcAjB7Z1mN1rvH2G6iRqQQV-jWZGUcd45wOf67ooAEWiOtgSFi2I6TtzC6-WfdbJH_7LEEjVtr-jYYRqpRG6ZJoZAXCC1YN56YcLlkGaNOQzxk_CpKEcOGL4xis8nd6xQP0y6SQrrFqIqno6eFe-Vd7F6dw1g/s400/Screenshot%202026-03-18%20at%2011.54.18.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;macOS keeps evolving, as a result apps become obsolete unless they are continually updated. For small, solo developer projects, this often means the app no longer works. If the code is open source, as it is in &lt;a href=&quot;https://github.com/berniejenny/GISLook-GISMeta&quot;&gt;this case&lt;/a&gt;, then potentially somebody can come along and revive the project. But, realistically this can be a daunting prospect. I last wrote native macOS code about two decades ago, a lot has changed.&lt;/p&gt;
&lt;p&gt;Indeed, a lot &lt;strong&gt;has&lt;/strong&gt; changed. With tools such as &lt;a href=&quot;https://claude.ai&quot;&gt;Claude Code&lt;/a&gt;, it is possible to point an AI at an old repository and, in effect, say, “build this, but for today’s Macs”.&lt;/p&gt;
&lt;p&gt;In this case, I cloned the original repo, asked Claude to take a look, and then created a new repo &lt;a href=&quot;https://github.com/rdmpage/gis-quicklook&quot;&gt;rdmpage/gis-quicklook&lt;/a&gt; and Claude got to work. Of the original code, only the core file reading functions survive, the interface code has all gone. But after about a day’s messing about, I have a new app that has even more functionality because it supports the &lt;a href=&quot;https://en.wikipedia.org/wiki/GeoJSON&quot;&gt;GeoJSON&lt;/a&gt; format as well.&lt;/p&gt;
&lt;p&gt;You can get the app from the GitHub repo. Note that because it isn’t in the App Store you will need to run a one-time command in terminal to get it to work:&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;xattr -cr GISLookApp.app
&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Here are four GIS files and their preview icons.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhoZk6a6BQcaxW1BHcHs42MHNqCjQpfveB6YLp1LQhR1_YI233pMwujWcWHBYQpxg1ZJVlpq6ckn7moVraFfFA6EkZIkL6MumkSD4N9WBTusVQVC1BNWolY8AH4IKOsez_l58P2XthsLkJYZFAJz2WaO-SwHJoRowH8e2AwM_0LMWjlUPoxLNai/s641/screenshot.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;641&quot; data-original-width=&quot;607&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhoZk6a6BQcaxW1BHcHs42MHNqCjQpfveB6YLp1LQhR1_YI233pMwujWcWHBYQpxg1ZJVlpq6ckn7moVraFfFA6EkZIkL6MumkSD4N9WBTusVQVC1BNWolY8AH4IKOsez_l58P2XthsLkJYZFAJz2WaO-SwHJoRowH8e2AwM_0LMWjlUPoxLNai/s400/screenshot.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I should thank the original author, &lt;a href=&quot;https://berniejenny.info&quot;&gt;Bernie Jenny&lt;/a&gt; at Monash University in Melbourne. If you are at all interested in maps, globes, and cartography, you should look at his web page, it has some very cool stuff. I couldn’t have done this project without his open source (GPL 3.0) code.&lt;/p&gt;
&lt;p&gt;Nor could I have done it without Claude Code. The level of debugging involved in this project was insane, there were log files flying past, Finder cache rebuilds, numerous dead ends and subtle “gotchas”, never mind the obstacle of learning how to support custom Finder icons and previews on a modern Mac.&lt;/p&gt;
&lt;p&gt;This is the larger point, AI makes it possible, at least in principle, to look at an old, abandoned project, perhaps targeting an out of date API, and have a realistic chance of reviving it. That is a real game changer, made possible by a combination of open source and state of the art AI.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2697246193398993938'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2697246193398993938'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2026/03/using-ai-to-revive-macos-app-to-preview.html' title='Using AI to revive a macOS app to preview GIS files'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhQTmq866-LnqGUnOcpXZ8NrxApcAjB7Z1mN1rvH2G6iRqQQV-jWZGUcd45wOf67ooAEWiOtgSFi2I6TtzC6-WfdbJH_7LEEjVtr-jYYRqpRG6ZJoZAXCC1YN56YcLlkGaNOQzxk_CpKEcOGL4xis8nd6xQP0y6SQrrFqIqno6eFe-Vd7F6dw1g/s72-c/Screenshot%202026-03-18%20at%2011.54.18.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-7627516672420542191</id><published>2026-03-10T14:51:34.605+00:00</published><updated>2026-05-08T10:47:55.497+01:00</updated><title type='text'>Using AI to understand a DNA barcoding mystery</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2026). Using AI to understand a DNA barcoding mystery. &lt;a href=&quot;https://doi.org/10.59350/nbsfn-91m72&quot;&gt;https://doi.org/10.59350/nbsfn-91m72&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjMS0QNwWRjO21DdgU6cbv_v2xqYKJRUDnxwRbnZumGVpufQxRAgMhPn9somQbb3CxkZ4X-xWlRJ2EvE3Zvh3xZ-jDRFAajsTdi0uERNE_uy9pw-mqMfDnIXxQ3S_h43QBLuwLty-1lL6IkOG7kyoSZhhx2xMg_ThlKeseTW-jnnj_ZaRxImETG/s640/BIOUG91536-B09+1666033702.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; clear: right; float: right;&quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;200&quot; data-original-height=&quot;480&quot; data-original-width=&quot;640&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjMS0QNwWRjO21DdgU6cbv_v2xqYKJRUDnxwRbnZumGVpufQxRAgMhPn9somQbb3CxkZ4X-xWlRJ2EvE3Zvh3xZ-jDRFAajsTdi0uERNE_uy9pw-mqMfDnIXxQ3S_h43QBLuwLty-1lL6IkOG7kyoSZhhx2xMg_ThlKeseTW-jnnj_ZaRxImETG/s200/BIOUG91536-B09+1666033702.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;As I continue to add features to &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/&quot;&gt;BOLD-View&lt;/a&gt; I keep coming across interesting cases where something seems not quite right with the data. Typically this prompts further investigation, which typically means going down a rabbit hole. As an example, take barcode &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/record/GMAEA6199-22&quot;&gt;GMAEA6199-22&lt;/a&gt; which was collected by Donald Hobern in Australia and is identified to order level as &lt;a href=&quot;https://en.wikipedia.org/wiki/Strepsiptera&quot;&gt;Strepsiptera&lt;/a&gt; (I thank Donald for this example).&lt;/p&gt;
&lt;p&gt;This barcode falls within a cluster which contains sequences labelled as either strepsipteran (in many cases identified down to species, &lt;em&gt;Elenchus varleyi&lt;/em&gt;) or hemipteran. Almost all the strepsipteran sequences are assigned to a BIN &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/bin/BOLD:ACH2898&quot;&gt;BOLD:ACH2898&lt;/a&gt;. The hemipteran sequences are not assigned to a BIN, even though they are essentially identical to the strepsipteran sequences.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEip-s41DCqeRnO_LJc6kOlZshIxpBh_rVU4Fbjt9bHikOgEQmKf_x-S_6KQgBjHYLNrLLuV-MaryUT7VbWXJvpjGIYWJ7RsDiqHuJwOin9jfbCRifNa_E9w-VAp6Eg5uKOL14HHWMkHmJiOgSrKg8RRt7rL4DrKfBOO_Q6OoFxx9kTemwgI_zJB/s1562/Screenshot%202026-03-10%20at%2013.02.15.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1562&quot; data-original-width=&quot;1516&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEip-s41DCqeRnO_LJc6kOlZshIxpBh_rVU4Fbjt9bHikOgEQmKf_x-S_6KQgBjHYLNrLLuV-MaryUT7VbWXJvpjGIYWJ7RsDiqHuJwOin9jfbCRifNa_E9w-VAp6Eg5uKOL14HHWMkHmJiOgSrKg8RRt7rL4DrKfBOO_Q6OoFxx9kTemwgI_zJB/s400/Screenshot%202026-03-10%20at%2013.02.15.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;A new feature I’ve added to BOLD View’s display for a barcode is a map of all similar sequences, grouped by BINs. For this example the Strepsiptera BIN has a wide geographic range, including central America and South Africa, as well as Australia (through GMAEA6199-22 which is a strepsipteran but not included in the BIN).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjpI8lPVT1GRlS7_pRuu35572_EC2MFVj516sizxcsVMPKk8CgLGUODNjg2fbHr-YeqBeIbVF9UjhC5E630p6JzlV2YMw3mEnRYTktBLeSL0Ra5InixzJIHPvkZqifbZbWuQ0jQ7N4R2Y6HlehzPc5DHpUtBNRB4RsnGOsB9hMd0wiv7jq700R0/s1776/Screenshot%202026-03-10%20at%2013.27.25.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;744&quot; data-original-width=&quot;1776&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjpI8lPVT1GRlS7_pRuu35572_EC2MFVj516sizxcsVMPKk8CgLGUODNjg2fbHr-YeqBeIbVF9UjhC5E630p6JzlV2YMw3mEnRYTktBLeSL0Ra5InixzJIHPvkZqifbZbWuQ0jQ7N4R2Y6HlehzPc5DHpUtBNRB4RsnGOsB9hMd0wiv7jq700R0/s400/Screenshot%202026-03-10%20at%2013.27.25.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;So, we have a widespread BIN, likely to be &lt;em&gt;Elenchus varleyi&lt;/em&gt;, which has identical sequences to those labelled as hemipteran. What is going on?&lt;/p&gt;
&lt;p&gt;Now if you know anything about Strepsiptera you’ll know that they have a pretty bonkers biology, being endoparasites of insects, and male and females have quite different life histories. But as an exercise I decided to ask &lt;a href=&quot;https://claude.ai/&quot;&gt;Claude&lt;/a&gt; to see what it made of this situation. Here is the prompt I used.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;I have an interesting DNA barcoding case I want you to look at. I am uploading part of a phylogeny for DNA barcodes for a strepsiteran Elenchus varleyi. Note that the barcodes mix sequences from Elenchus and Hemiptera. The barcodes come from samples in Australia, South Africa, and central America. I am uploading a Markdown file for a paper with the DOI 10.1071/it9890175. If you need to get information on species distributions then &lt;a href=&quot;https://gbif.org/&quot;&gt;https://gbif.org&lt;/a&gt; is a good source. Given all this information, can you suggest explanations for why we have barcode sequences from Elenchus and Hemiptera, why only some sequences are assigned to a BIN, and why are these barcodes widely distributed.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;I also uploaded the tree picture you see above, and the paper “Descriptions and biological notes of the Australian Elenchidae (Strepsiptera)” &lt;a href=&quot;https://doi.org/10.1071/IT9890175&quot;&gt;doi:10.1071/IT9890175&lt;/a&gt; which described &lt;em&gt;Elenchus varleyi&lt;/em&gt; (I got the DOI for the paper from another of my projects, &lt;a href=&quot;https://bionames.org/names/urn:lsid:organismnames.com:name:772633&quot;&gt;BioNames&lt;/a&gt;).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgpO1fQGbzMtFYGJd-x7f-nQwcMEM5SjJFIW8hObv6rWAYU97SsdsXNwctICAt_NX9X5Q7Gy0VXpAuWcWH6vaelOx4I84g7J-iJguUC-JdJskfIrzjvnJ0u-zMiv9_6VYzCUinpZtjwXKxnk7rT_1Ft0aXPic_uFKkxykUQE1b6Z9ArXtaksYD/s3033/Elenchidaepdf.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;3033&quot; data-original-width=&quot;1935&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhgpO1fQGbzMtFYGJd-x7f-nQwcMEM5SjJFIW8hObv6rWAYU97SsdsXNwctICAt_NX9X5Q7Gy0VXpAuWcWH6vaelOx4I84g7J-iJguUC-JdJskfIrzjvnJ0u-zMiv9_6VYzCUinpZtjwXKxnk7rT_1Ft0aXPic_uFKkxykUQE1b6Z9ArXtaksYD/s320/Elenchidaepdf.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;The paper is behind a paywall, and isn&#39;t on SciHub, but is on &lt;a href=&quot;https://www.researchgate.net/publication/248899086&quot;&gt;ResearchGate&lt;/a&gt;, so I downloaded the PDF and used &lt;a href=&quot;https://github.com/datalab-to/marker&quot;&gt;Marker&lt;/a&gt; to convert it to Markdown, which has become the default language of choice for sending documents to AIs.&lt;/p&gt;
&lt;p&gt;Claude thought for a bit and then came back with a summary that basically said:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;the mixture of strepsipteran and hemipteran sequences is likely due to the “hemipteran” barcodes actually being for their strepsipteran parasites rather than the hemipterans themselves.&lt;/li&gt;
&lt;li&gt;given that one of the hosts of &lt;em&gt;Elenchus varleyi&lt;/em&gt; is &lt;a href=&quot;https://en.wikipedia.org/wiki/Sogatella_kolophon&quot;&gt;&lt;em&gt;Sogatella kolophon&lt;/em&gt;&lt;/a&gt;, which is widespread, that would account for the occurrence of essentially identical DNA sequences across separate continents.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Claude noted that a single hemipteran can host multiple strepsipteran individuals, increasing the chance of mistakenly sequencing the parasite rather than the host. The picture below shows adult neotenic females ♀ of &lt;em&gt;Elenchus varleyi&lt;/em&gt; in the abdomen of the hemipteran &lt;em&gt;Sogatella kolophon&lt;/em&gt;. (Fig. 14 from &lt;a href=&quot;https://doi.org/10.1071/IT9890175&quot;&gt;doi:10.1071/IT9890175&lt;/a&gt;).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhGMdzaN-a4_3NhtIiuZYskKkq07hM9_Bl2WJdV9RJEGkpKWSBM0pCnyFXnQ-rj0LBLsdtOl66S9bX9JUj6Dy1XyTVoVzryUnM7g_PV0vc9r6XigkigUw9TWVdTyp2d0qYXbjCHdxfUDRlcksdUEh_EqJe1a-WOxvhzURvvXSVdEjEiau4Nia1A/s936/_page_7_Picture_6.jpeg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;641&quot; data-original-width=&quot;936&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhGMdzaN-a4_3NhtIiuZYskKkq07hM9_Bl2WJdV9RJEGkpKWSBM0pCnyFXnQ-rj0LBLsdtOl66S9bX9JUj6Dy1XyTVoVzryUnM7g_PV0vc9r6XigkigUw9TWVdTyp2d0qYXbjCHdxfUDRlcksdUEh_EqJe1a-WOxvhzURvvXSVdEjEiau4Nia1A/s400/_page_7_Picture_6.jpeg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;Now, you and I could figure this out with some background biology and reading the paper, but the point here is that Claude could do it more or less automatically, which means we could potentally scale this to look at all “odd” or “anomalous” barcodes and get suggestions as to what has happened. Hence we have the possibility for large-scale database curation (see also “CurateGPT: A flexible language-model assisted biocuration tool” &lt;a href=&quot;https://doi.org/10.48550/arXiv.2411.00046&quot;&gt;doi:10.48550/arXiv.2411.00046&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;That said, we need some things in place. It helps to have links to information on the species and sequences involved, e.g. what paper(s) described the species, what paper(s) published the sequences. In this case I used BioNames to find the DOI for the paper describing &lt;em&gt;Elenchus varleyi&lt;/em&gt; which was behind a paywall. That is likely to be an obstacle to this approach.&lt;/p&gt;
&lt;p&gt;Out of curiosity I explored an alternative scenario, in part inspired by &lt;a href=&quot;https://plazi.org&quot;&gt;Plazi&lt;/a&gt;. Imagine that we can’t share the full text of the paper, but we do have access to the facts or claims the paper makes. Plazi doesn’t quite do this, it extracts images and the text of &lt;a href=&quot;https://en.wikipedia.org/wiki/Taxonomic_treatment&quot;&gt;taxonomic treatments&lt;/a&gt;, but let’s imagine we could extract a series of &lt;a href=&quot;https://en.wikipedia.org/wiki/Semantic_triple&quot;&gt;triples&lt;/a&gt; from the paper, such as:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus varleyi, parasitises, Sogatella kolophon)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Given a set of these we could potentially still infer the cause of the barcoding patterns we were seeing, but without needing access to the paywalled text (we’d need access to the text to extract the triples, but Claude wouldn’t need to see the text). Of course, this is not a new idea (&lt;a href=&quot;https://www.shuttleworthfoundation.org/thinking/2019/04/01/thinking-pmr-contentmine/&quot;&gt;Peter Murray-Rust&lt;/a&gt; has been advocating for this for a long time).&lt;/p&gt;
&lt;p&gt;But what triples should we extract from a given paper? Out of interest I asked Claude what triples it would need to generate the answer it gave me to the strepsiptera/hemiptera example. Here is what it gave me:&lt;br&gt;
&lt;strong&gt;Parasite–host relationships:&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus varleyi, parasitises, Sogatella kolophon)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi, parasitises, Toya drope)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi, host_family, Delphacidae)&lt;/li&gt;
&lt;li&gt;(Sogatella kolophon, is_majority_host_of, Elenchus varleyi)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Life cycle and tissue intimacy (critical for the contamination argument):&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus varleyi female, is_obligate_endoparasite_of, adult host)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi female, life_strategy, neotenic/larviform)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi female, spends_entire_life_in, host body)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi male, develops_as, endoparasitic larva)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi female, extrudes_only_in, adult hosts)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Superparasitism (strengthens contamination argument):&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus varleyi, exhibits, superparasitism)&lt;/li&gt;
&lt;li&gt;(superparasitism, maximum_observed, six parasitoids per host)&lt;/li&gt;
&lt;li&gt;(superparasitism, frequency, common in Brisbane)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Dispersal mechanism:&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus varleyi female, dispersed_by, macropterous adult hosts)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi larvae, dispersed_by, macropterous adult hosts)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi male pupa, extrudes_in, both nymphs and macropterous adults)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Taxonomic and geographic scope:&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus, distribution, cosmopolitan)&lt;/li&gt;
&lt;li&gt;(Elenchus, only_parasitises, Delphacidae)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi, type_locality, Brisbane Queensland)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi, distribution, Queensland NSW Canberra Northern Territory)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Supporting observation (supporting wide distribution argument):&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Perkins 1907, observed, Elenchus from widely separated countries show very little difference)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Now I think that some of these triples we could dispense with. I asked Claude and it said the following would be enough:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;(Elenchus varleyi, parasitises, Sogatella kolophon)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi female, is_obligate_endoparasite_of, adult host)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi, host_family, Delphacidae)&lt;/li&gt;
&lt;li&gt;(Elenchus varleyi female, dispersed_by, macropterous adult hosts)&lt;/li&gt;
&lt;li&gt;(Elenchus, distribution, cosmopolitan)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;These are fairly simple statements that could likely be extracted from the paper. That raises issues of how we do that, and how we express those triples. I asked Claude for suggested vocabularies, it mentioned &lt;a href=&quot;https://dwc.tdwg.org/terms/&quot;&gt;Darwin Core&lt;/a&gt; and the &lt;a href=&quot;https://www.ebi.ac.uk/ols4/ontologies/ro&quot;&gt;Relations Ontology&lt;/a&gt;, among others. I am congenitally averse to big, verbose ontologies so I’d prefer something lightweight, and maybe Darwin Core would be enough?&lt;/p&gt;
&lt;p&gt;So, in summary, I’m encouraged by the way Claude suggested a plausible explanation for the pattern in the barcode tree, and that it might not always need access to full text to do so (although I suspect giving an LLM access to full text is likely to beat giving it a set of triples that might not encompass all the relevant information in the paper). This also gives me a further incentive to work on the problem of providing context for each barcode, especially the scientific papers that published the sequences, and the papers that published the taxonomic names.&lt;/p&gt;

&lt;p&gt;But one problem still remains. How do we get all this information back into &lt;a href=&quot;https://portal.boldsystems.org&quot;&gt;BOLD&lt;/a&gt; so that a user looking at these sequences knows what is going on, knows that &quot;Hemiptera&quot; doesn&#39;t mean &quot;Hemiptera&quot; in this case, and that what we are seeing is a case of a widespread insect host being infected by a widespread parasite, which was originally described from Australia? The ability to add annotations and third-party analyses will become crucial if people are to get the most out of DNA barcoding databases.&lt;/p&gt;

&lt;h2&gt;References&lt;/h2&gt;
&lt;ul&gt;
    &lt;li&gt;Caufield, H., Kroll, C., O’Neil, S. T., Reese, J. T., Joachimiak, M. P., Hegde, H., Harris, N. L., Krishnamurthy, M., McLaughlin, J. A., Smedley, D., Haendel, M. A., Robinson, P. N., &amp; Mungall, C. J. (2024). CurateGPT: A flexible language-model assisted biocuration tool (arXiv:2411.00046). arXiv. &lt;a href=&quot;https://doi.org/10.48550/arXiv.2411.00046&quot;&gt;https://doi.org/10.48550/arXiv.2411.00046&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;Kathirithamby, J. (1989). Descriptions and biological notes of the Australian Elenchidae (Strepsiptera). Invertebrate Taxonomy, 3(2), 175–195. &lt;a href=&quot;https://doi.org/10.1071/it9890175&quot;&gt;https://doi.org/10.1071/it9890175&lt;/a&gt;&lt;/li&gt;

&lt;/ul&gt;

&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7627516672420542191'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7627516672420542191'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2026/03/using-ai-to-understand-dna-barcoding.html' title='Using AI to understand a DNA barcoding mystery'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjMS0QNwWRjO21DdgU6cbv_v2xqYKJRUDnxwRbnZumGVpufQxRAgMhPn9somQbb3CxkZ4X-xWlRJ2EvE3Zvh3xZ-jDRFAajsTdi0uERNE_uy9pw-mqMfDnIXxQ3S_h43QBLuwLty-1lL6IkOG7kyoSZhhx2xMg_ThlKeseTW-jnnj_ZaRxImETG/s72-c/BIOUG91536-B09+1666033702.jpg" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-4145486131192859157</id><published>2026-02-15T12:20:16.381+00:00</published><updated>2026-03-12T12:19:05.090+00:00</updated><title type='text'>GBIF Geocoder: using GBIF to find places on a map</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2026). GBIF Geocoder: using GBIF to find places on a map &lt;a href=&quot;https://doi.org/10.59350/7g6pt-3mz06 &quot;&gt;https://doi.org/10.59350/7g6pt-3mz06 &lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;I’ve relaunched a “toy” tool that I made a while ago to help geocode localities using GBIF. &lt;a href=&quot;https://en.wikipedia.org/wiki/Geocode&quot;&gt;Geocoding&lt;/a&gt; converts a text string, such as “Cambodia: Ratanakiri Province” into latitude and longitude coordinates. For some reason, the biodiversity community typically refers to this as “georeferencing”, which is usually defined as locating an image of a map (see &lt;a href=&quot;https://en.wikipedia.org/wiki/Georeferencing&quot;&gt;Wikipedia entry for georeferencing&lt;/a&gt;, and &lt;a href=&quot;https://allmaps.org&quot;&gt;Allmaps&lt;/a&gt; for some great examples).&lt;/p&gt;
&lt;p&gt;You can try GBIF Geocoder at &lt;a href=&quot;https://rdmpage.github.io/gbif-geocoder/&quot;&gt;https://rdmpage.github.io/gbif-geocoder/&lt;/a&gt;. Code is available on GitHub at &lt;a href=&quot;https://github.com/rdmpage/gbif-geocoder&quot;&gt;https://github.com/rdmpage/gbif-geocoder&lt;/a&gt;.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi3RJpQNgbzF8_q7Xg4rBSFn89iU41-NPzsrsJRf8K9yxKLBF_8Vm9RkVo8E_Qpy1idtEhPwlPGdzo-h6pnOiEfcYXhGRgvtfL_3yZxmpf4a8iUBV5htRhQJ5UfSGjCNx15tspXQnHurJ3ZFvqBwy93PsRw2MHnWPDoR_WXqye1EWmzqM0fK5os/s880/Screenshot%202026-02-15%20at%2012.13.45.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;880&quot; data-original-width=&quot;717&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi3RJpQNgbzF8_q7Xg4rBSFn89iU41-NPzsrsJRf8K9yxKLBF_8Vm9RkVo8E_Qpy1idtEhPwlPGdzo-h6pnOiEfcYXhGRgvtfL_3yZxmpf4a8iUBV5htRhQJ5UfSGjCNx15tspXQnHurJ3ZFvqBwy93PsRw2MHnWPDoR_WXqye1EWmzqM0fK5os/s400/Screenshot%202026-02-15%20at%2012.13.45.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;The idea behind the “GBIF Geocoder” is that &lt;a href=&quot;https://www.gbif.org&quot;&gt;GBIF&lt;/a&gt; has a huge number of geocoded specimens, and hence if you are looking for coordinates for a locality there is a good chance that somebody has already found them. So, all we need to do is search GBIF for specimens with localities that match the place you are trying to geocode. I created a version of this tool in 2018, mentioning it in a blog post &lt;a href=&quot;https://doi.org/10.59350/d8dwz-3v524&quot;&gt;GBIF at 1 billion - what’s next?&lt;/a&gt;, and wrote it up in a short note in bioRxiv &lt;a href=&quot;https://doi.org/10.1101/469650&quot;&gt;Geocoding genomic databases using GBIF&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;The original version was hosted on &lt;a href=&quot;https://glitch.com&quot;&gt;Glitch&lt;/a&gt;, a wonderful platform where people could create pretty much anything using HTML and Javascript. &lt;a href=&quot;https://blog.glitch.com/post/goodbye-glitch&quot;&gt;Glitch is no more&lt;/a&gt; so I’ve finally got around to rebuilding it, inspired by this &lt;a href=&quot;https://bsky.app/profile/tapani-hopkins.bsky.social/post/3met3bu3x6s2l&quot;&gt;post on Bluesky&lt;/a&gt; by Tapani Hopkins:&lt;/p&gt;

&lt;blockquote class=&quot;bluesky-embed&quot; data-bluesky-uri=&quot;at://did:plc:xsrx3wrabk4ie3plgvqzim3a/app.bsky.feed.post/3met3bu3x6s2l&quot; data-bluesky-cid=&quot;bafyreierxmyoyosmsgpdbzsvvhdcrlmai4rx7343elko4626sljqv6tpwy&quot; data-bluesky-embed-color-mode=&quot;system&quot;&gt;&lt;p lang=&quot;en&quot;&gt;Next quest: figure out from this map where &amp;quot;La Maboke&amp;quot; was.

Searching for &amp;quot;Maboke, Central African Republic&amp;quot; turned out not to work like I&amp;#x27;d hoped for. 😅 Though perhaps I should save the recipe. 🍲🐟&lt;br&gt;&lt;br&gt;&lt;a href=&quot;https://bsky.app/profile/did:plc:xsrx3wrabk4ie3plgvqzim3a/post/3met3bu3x6s2l?ref_src=embed&quot;&gt;[image or embed]&lt;/a&gt;&lt;/p&gt;&amp;mdash; Tapani Hopkins (&lt;a href=&quot;https://bsky.app/profile/did:plc:xsrx3wrabk4ie3plgvqzim3a?ref_src=embed&quot;&gt;@tapani-hopkins.bsky.social&lt;/a&gt;) &lt;a href=&quot;https://bsky.app/profile/did:plc:xsrx3wrabk4ie3plgvqzim3a/post/3met3bu3x6s2l?ref_src=embed&quot;&gt;Feb 14, 2026 at 13:00&lt;/a&gt;&lt;/blockquote&gt;&lt;script async src=&quot;https://embed.bsky.app/static/embed.js&quot; charset=&quot;utf-8&quot;&gt;&lt;/script&gt;

&lt;p&gt;The original project used &lt;a href=&quot;https://nodejs.org/&quot;&gt;node.js&lt;/a&gt;, whereas I wanted something simple using just HTML and Javascript so it could be hosted using GitHub pages (or, indeed, on any other static hosting platform). I fired up &lt;a href=&quot;https://claude.com/product/claude-code&quot;&gt;Claude Code&lt;/a&gt; to help me with the port. I continue to be amazed at just how much fun this style of coding is, and the power of the tools. I make requests and suggestions, and Claude will fire up an instance of Google Chrome to check that the code works. I think a key feature of this style of programming is that it can reduce that initial hurdle when you know you need to make changes, and may even have made notes to yourself about what needs to be done, but there will be the initial tedium of reworking old code to work with a new platform i.e., Googling questions, re-reading GitHub docs, etc. Instead, I get to focus on what I want to do, namely revive an old tool that I think people may find useful.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/4145486131192859157'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/4145486131192859157'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2026/02/gbif-geocoder-using-gbif-to-find-places.html' title='GBIF Geocoder: using GBIF to find places on a map'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi3RJpQNgbzF8_q7Xg4rBSFn89iU41-NPzsrsJRf8K9yxKLBF_8Vm9RkVo8E_Qpy1idtEhPwlPGdzo-h6pnOiEfcYXhGRgvtfL_3yZxmpf4a8iUBV5htRhQJ5UfSGjCNx15tspXQnHurJ3ZFvqBwy93PsRw2MHnWPDoR_WXqye1EWmzqM0fK5os/s72-c/Screenshot%202026-02-15%20at%2012.13.45.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-982096422231733819</id><published>2025-11-19T12:24:21.337+00:00</published><updated>2025-11-19T12:27:51.113+00:00</updated><title type='text'>Model Context Protocol (MCP) and triple stores: natural language queries for knowledge graphs</title><content type='html'>&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjkhGZnYlSK-Q5vZuFwygni4v9giy-edX7IFbjhCWjMenejpGkwPT4R0UMQO2AucgpqFYcHRkoMcxxwv6SuxtLoWgDahVhDSV2tsiJ1WTxWBJKt0T80XxYAdk-FPJ0NQ6HGQ9sSBm8g2hX2Ovuono7vDyL944Ab3R1RemjmijfTSP9t4-CidheV/s225/Unknown.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; clear: right; float: right;&quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;128&quot; data-original-height=&quot;225&quot; 
data-original-width=&quot;225&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjkhGZnYlSK-Q5vZuFwygni4v9giy-edX7IFbjhCWjMenejpGkwPT4R0UMQO2AucgpqFYcHRkoMcxxwv6SuxtLoWgDahVhDSV2tsiJ1WTxWBJKt0T80XxYAdk-FPJ0NQ6HGQ9sSBm8g2hX2Ovuono7vDyL944Ab3R1RemjmijfTSP9t4-CidheV/s320/Unknown.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;Some quick notes based on experiments with &lt;a href=&quot;https://modelcontextprotocol.io&quot;&gt;Model Context Protocol (MCP)&lt;/a&gt; and &lt;a href=&quot;https://claude.ai&quot;&gt;Claude&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Model Context Protocol (MCP) is all the rage right now, and I’ve been slow to take a look. Kingsley Idehen recently wrote &lt;a href=&quot;https://www.linkedin.com/pulse/semantic-web-project-didnt-fail-waiting-ai-yin-its-yang-idehen-j01se/&quot;&gt;The Semantic Web Project Didn’t Fail — It Was Waiting for AI (The Yin of its Yang)&lt;/a&gt; where he argued that Large Language Models (LLMs) provide (finally) a user-friendly way to query triple stores (i.e., knowledge graphs). Instead of simply presenting users with an empty SPARQL query box, we can now formulate a query in natural language and have AI convert that into SPARQL.&lt;/p&gt;
&lt;p&gt;That eases the challenge of learning a new query language, but it gets better. MCP enables us to connect an AI with another service. It acts a bit like a broker. You tell the AI what you want to do, the AI talks to the MCP server to figure out how to do what you want, gets the results, then converts them into a natural language (or other format) result that you can use. Hence you can have a conversation with a knowledge graph!&lt;/p&gt;
&lt;p&gt;There are examples of MCP servers that speak SPARQL, such as &lt;a href=&quot;https://github.com/ekzhu/mcp-server-sparql&quot;&gt;MCP Server SPARQL&lt;/a&gt; by&lt;br&gt;
Eric Zhu. Since I mostly program in PHP (gasp) version 7 (gasp) I ended up asking ChatGPT to help write a simple MCP server. There then followed a dance between ChatGPT and Claude where ChatGPT would very confidently declare that the code was done, and Claude would get increasingly exasperated that I appeared to be trying to do something that wasn’t working. I eventually had to tell Claude to back the f**k off with its snarky comments and maybe be more helpful in its messages. Eventually I got a simple server up and running.&lt;/p&gt;
&lt;p&gt;The code &lt;a href=&quot;https://github.com/rdmpage/php-mcp-server&quot;&gt;php-mcp-server&lt;/a&gt; is very basic, but supports SPARQL queries running on an instance of &lt;a href=&quot;https://github.com/oxigraph/oxigraph&quot;&gt;Oxigraph&lt;/a&gt; that runs on my Mac.  For example, I can ask:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;What publication cites the sequence &lt;a href=&quot;https://identifiers.org/insdc/MH493846&quot;&gt;https://identifiers.org/insdc/MH493846&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;and Claude will respond:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;I’ll query for publications that cite that sequence.&lt;br&gt;
Good! I found a publication. Let me get more details about it:&lt;br&gt;
Perfect! The sequence &lt;a href=&quot;https://identifiers.org/insdc/MH493846&quot;&gt;https://identifiers.org/insdc/MH493846&lt;/a&gt; is cited by:&lt;br&gt;
Publication: “Integrative taxonomy reveals a new Gammarus species (Crustacea, Amphipoda) surviving in a previously unknown southeast European glacial refugium”&lt;br&gt;
Authors: Copilaș-Ciocianu, D., Zimta, A.-A., and Petrusek, A.&lt;br&gt;
DOI: &lt;a href=&quot;https://doi.org/10.1111/jzs.12248&quot;&gt;https://doi.org/10.1111/jzs.12248&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;It will also show me the SPARQL queries it makes to find this information.&lt;/p&gt;
&lt;p&gt;This feels like a game changer. The MCP server I’ve written is incredibly crude, but I can now start to query a knowledge graph about DNA barcodes and associated literature in plain English, and get back useful results.&lt;/p&gt;
&lt;p&gt;What I really want to do is combine this with details on the actual papers (for example, lists of specimens sequenced, whether they are type specimens, where were the samples collected from, etc.) as a way to help curate databases such as BOLD.  I recently released &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com&quot;&gt;BOLD View&lt;/a&gt; (see blog post &lt;a href=&quot;https://doi.org/10.59350/81kzw-qy18&quot;&gt;BOLD View: exploring DNA barcodes&lt;/a&gt;) to make it easier to explore DNA barcode data, and I’m fascinated by how much scope there is for curation to add taxonomic identifications, geographic location, etc.&lt;/p&gt;
&lt;p&gt;To make this curation easier I’ve started to assemble a knowledge graph linking barcodes, Genbank sequences, and taxonomic names to the associated scientific literature, with the ultimate goal of being able to ask: “given this barcode that lacks a proper scientific name, is there anything in the published literature that can tell me what it actually is?”. The idea of being able to literally ask that question using a combination of an AI and an MCP server is very exciting.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/982096422231733819'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/982096422231733819'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/11/model-context-protocol-mcp-and-triple.html' title='Model Context Protocol (MCP) and triple stores: natural language queries for knowledge graphs'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjkhGZnYlSK-Q5vZuFwygni4v9giy-edX7IFbjhCWjMenejpGkwPT4R0UMQO2AucgpqFYcHRkoMcxxwv6SuxtLoWgDahVhDSV2tsiJ1WTxWBJKt0T80XxYAdk-FPJ0NQ6HGQ9sSBm8g2hX2Ovuono7vDyL944Ab3R1RemjmijfTSP9t4-CidheV/s72-c/Unknown.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-7101109488291281703</id><published>2025-08-07T17:55:00.002+01:00</published><updated>2025-08-07T17:57:06.361+01:00</updated><title type='text'>Make Data Count Kaggle Competition</title><content type='html'>&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhW8UtSeX_CpRUr2Ypc1OREc6nQIaTx6bEKwTwf7XRbbTLIVgNKNXLyJmsBmBbCLS2goMwJilNT9av1hM46Kbog6Z99H_vYVpT_KTnq0iKdsxoET5s1zvmD3xwZpc-qYcESAWDJ-ZS_PIrHrFs2bq2iJzxs7fIlSdUTWuCtvgtOu5BdzqYJdk6u/s560/header.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; clear: right; float: right;&quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;320&quot; data-original-height=&quot;279&quot; data-original-width=&quot;560&quot; 
src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhW8UtSeX_CpRUr2Ypc1OREc6nQIaTx6bEKwTwf7XRbbTLIVgNKNXLyJmsBmBbCLS2goMwJilNT9av1hM46Kbog6Z99H_vYVpT_KTnq0iKdsxoET5s1zvmD3xwZpc-qYcESAWDJ-ZS_PIrHrFs2bq2iJzxs7fIlSdUTWuCtvgtOu5BdzqYJdk6u/s320/header.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;I’ve written several times here about the &lt;a href=&quot;https://makedatacount.org&quot;&gt;Make Data Count&lt;/a&gt; project and its major output to date, the &lt;a href=&quot;https://doi.org/10.5281/zenodo.11196858&quot;&gt;Data Citation Corpus&lt;/a&gt;, currently at version 4 (see &lt;a href=&quot;https://doi.org/10.60804/drrx-4m69&quot;&gt;The fourth release of the Data Citation Corpus incorporates data citations from Europe PMC and additions to affiliation metadata&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;In June Make Data Count launched a &lt;a href=&quot;https://www.kaggle.com/competitions/make-data-count-finding-data-references/overview&quot;&gt;Kaggle Competition&lt;/a&gt; with the goal of developing a tool that will process articles (in either PDF or XML format), extract data citations (e.g., DOIs for datasets in repositories such as &lt;a href=&quot;https://datadryad.org&quot;&gt;Dryad&lt;/a&gt;, or accession numbers such as &lt;a href=&quot;https://www.rcsb.org/structure/6TAP&quot;&gt;6TAP&lt;/a&gt; in the Protein Data Bank), and classify these citations as either “primary” (data published in that paper) or “secondary” (reuse of existing data).&lt;/p&gt;
&lt;p&gt;I think the competition is an excellent idea, and the $US100,000 is a great motivator to get people trying to solve this problem. I’m taking part in the competition, which has meant learning Python very fast. I’ve dabbled a bit before, but this was a whole new thing. ChatGPT has been indispensable, especially in explaining why something I was doing wasn’t going to work, and what an error message really meant. The whole process became horribly addictive. You can submit a solution only five times a day, and the counter resets at midnight GMT, so there were nights I was up well after midnight coding and using up the following day’s submission quota! Another interesting feature is the &lt;a href=&quot;https://www.kaggle.com/competitions/make-data-count-finding-data-references/discussion&quot;&gt;lively discussion&lt;/a&gt; between people that are rivals for substantial prize money. Participants are sharing code and ideas, often not their best scoring ideas — after all, everyone wants to win — but still giving hints and support, and sharing findings.&lt;/p&gt;
&lt;p&gt;The competition provides a small set of training data (about 500 PDFs and a similar number of XML files). The idea is that you write code to analyse those files and output a list of data citations. You then submit your entry to Kaggle, which runs your code against a “hidden” set of PDFs and XML files and tells you your score. The best score wins prizes. My place in this competition pretty accurately reflects my skills and ability :)&lt;/p&gt;
&lt;h2 id=&quot;issues-with-the-competition&quot;&gt;Issues with the competition&lt;/h2&gt;
&lt;p&gt;Unfortunately the competition itself has been — how shall I put this — poorly run. There has been virtually no engagement from DataCite in their own competition, despite repeated queries from the entrants to explain the often inexplicable reasoning for the scoring in the training data, or why some of the PDFs are wrong or incomplete.  Some PDFs are preprints, not the actual papers (and may differ in whether they cite data or not). The XML comes in a variety of formats, which we weren’t told about. Some XML was “gold standard” JATS-XML as used by PubMed Central, others were publisher specific, or the output of PDF parsers or  annotation tools.&lt;/p&gt;
&lt;p&gt;I ended up making my own training data (&lt;a href=&quot;https://doi.org/10.34740/kaggle/dsv/12667298&quot;&gt;https://doi.org/10.34740/kaggle/dsv/12667298&lt;/a&gt;) listing what I think are the actual data citations (about twice as many as are in the “official” training data).&lt;/p&gt;
&lt;p&gt;There are some high scoring entries (see the &lt;a href=&quot;https://www.kaggle.com/competitions/make-data-count-finding-data-references/leaderboard&quot;&gt;leaderboard&lt;/a&gt;) so it looks like Make Data Count will get some useful tools from this competition. My only concern is that these tools may be optimised to replicate the somewhat erratic and poorly described annotation process that DataCite used to create the training and “hidden” test data, rather than accurately retrieve the actual data citations. Perhaps my concerns will prove unfounded, or maybe the tools can be easily retrained with better data.&lt;/p&gt;
&lt;p&gt;But I am somewhat baffled that such an important project, for which Make Data Count have secured funding for serious prize money, has been essentially left unattended by the organisers.&lt;/p&gt;
&lt;p&gt;The competition runs until 3 September.&lt;/p&gt;
&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;Page, R. (2024). Problems with the DataCite Data Citation Corpus  &lt;a href=&quot;https://doi.org/10.59350/t80g1-xys37&quot;&gt;https://doi.org/10.59350/t80g1-xys37&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Page, R. (2024). The Data Citation Corpus revisited  &lt;a href=&quot;https://doi.org/10.59350/wvwva-v7125&quot;&gt;https://doi.org/10.59350/wvwva-v7125&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7101109488291281703'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7101109488291281703'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/08/make-data-count-kaggle-competition.html' title='Make Data Count Kaggle Competition'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhW8UtSeX_CpRUr2Ypc1OREc6nQIaTx6bEKwTwf7XRbbTLIVgNKNXLyJmsBmBbCLS2goMwJilNT9av1hM46Kbog6Z99H_vYVpT_KTnq0iKdsxoET5s1zvmD3xwZpc-qYcESAWDJ-ZS_PIrHrFs2bq2iJzxs7fIlSdUTWuCtvgtOu5BdzqYJdk6u/s72-c/header.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-639112564180601539</id><published>2025-07-08T12:04:00.002+01:00</published><updated>2026-05-08T10:54:05.970+01:00</updated><title type='text'>How many times are DNA barcoding datasets cited?</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2025). How many times are DNA barcoding datasets cited? &lt;a href=&quot;https://doi.org/10.59350/s0c6z-2m608&quot;&gt;https://doi.org/10.59350/s0c6z-2m608&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;This note accompanies a dataset that I uploaded to Zenodo (&lt;a href=&quot;https://doi.org/10.5281/zenodo.15824274&quot;&gt;https://doi.org/10.5281/zenodo.15824274&lt;/a&gt;). My goal in creating this dataset is to link data created on the &lt;a href=&quot;https://portal.boldsystems.org&quot;&gt;Barcode of Life Data Systems&lt;/a&gt; to the DOIs for those datasets, and then to link those data DOIs to DOIs for the papers (if any) that created those datasets, and/or cited them.&lt;/p&gt;
&lt;p&gt;For example, the paper “DNA barcodes enable higher taxonomic assignments in the Acari” (Young et al., 2021) cites three barcode datasets: DS-BINFL, DS-5FLR, and DS-10FLR. Each of these datasets has a DOI of the form: &lt;code&gt;https://doi.org/10.5883/&lt;/code&gt; plus the &lt;code&gt;DS&lt;/code&gt; number. One reason I want to make these links is so that when the dataset is displayed, say, in my &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com&quot;&gt;BOLD View&lt;/a&gt; app, I could also show the papers that created/cited the dataset, providing some context to the data (e.g., why was the data collected?). Another reason is that once we link data to papers we can do some interesting things, such as assign credit (Zeng et al. 2020), or discover what organisations funded the work. I hope to explore these topics in the future.&lt;/p&gt;
&lt;p&gt;Matching datasets to publications was a tedious process, there are more details on the &lt;a href=&quot;https://github.com/rdmpage/bold-dataset-citations&quot;&gt;GitHub repository&lt;/a&gt;. I started with a Google Scholar search, then did lots of manual filtering and cleaning. Most of the articles have DOIs, and only these articles are included in the Zenodo dataset, which is intended as a contribution to &lt;a href=&quot;https://makedatacount.org&quot;&gt;Make Data Count&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;This only scratches the surface of what could be done. There are many datasets that I could not find in the literature (they may never have been cited). I also want to retrieve links between individual DNA barcodes and the papers that published them. Apart from context and metrics, I’m also interested in whether these papers might contain more detailed information about the sequences, such as geographic localities. In this way we could potentially enrich the BOLD database, as part of the “virtuous cycle” envisioned by David Schindel (Schindel and Page, 2024).&lt;/p&gt;
&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
&lt;p&gt;Page, R. (2025). Citations of datasets published by Barcode of Life Data Systems (BOLD) [Data set]. Zenodo. &lt;a href=&quot;https://doi.org/10.5281/zenodo.15824274&quot;&gt;https://doi.org/10.5281/zenodo.15824274&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Schindel, D. E., &amp;amp; Page, R. M. P. (2024). Creating Virtuous Cycles for DNA Barcoding: A Case Study in Science Innovation, Entrepreneurship, and Diplomacy. DNA Barcoding, 7–32. &lt;a href=&quot;https://doi.org/10.1007/978-1-0716-3581-0_1&quot;&gt;https://doi.org/10.1007/978-1-0716-3581-0_1&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Young, M. R., deWaard, J. R., &amp;amp; Hebert, P. D. N. (2021). DNA barcodes enable higher taxonomic assignments in the Acari. Scientific Reports, 11(1). &lt;a href=&quot;https://doi.org/10.1038/s41598-021-95147-8&quot;&gt;https://doi.org/10.1038/s41598-021-95147-8&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Zeng, Tong, Longfeng Wu, Sarah Bratt, and Daniel E. Acuna. ‘Assigning Credit to Scientific Datasets Using Article Citation Networks’. Journal of Informetrics 14, no. 2 (1 May 2020): 101013. &lt;a href=&quot;https://doi.org/10.1016/j.joi.2020.101013&quot;&gt;https://doi.org/10.1016/j.joi.2020.101013&lt;/a&gt;.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/639112564180601539'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/639112564180601539'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/07/how-many-times-are-dna-barcoding.html' title='How many times are DNA barcoding datasets cited?'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-2928368343736074927</id><published>2025-06-05T12:52:00.004+01:00</published><updated>2025-06-12T17:02:03.623+01:00</updated><title type='text'>A metabarcoding mess and the importance of just looking at the data</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2025). A metabarcoding mess and the importance of just looking at the data. &lt;a href=&quot;https://doi.org/10.59350/q2v8n-wc488&quot;&gt;https://doi.org/10.59350/q2v8n-wc488&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;Here I summarise a few posts on Bluesky where I raised concerns about some metabarcoding datasets that were highlighted by GBIF:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&amp;gt;3.4 million insect records based on DNA metabarcoding of bulk samples from #Sweden and #Madagascar have been mobilized to GBIF thanks to collaborative efforts of research institutions led by the #NaturhistoriskaRiksmuseet &lt;a href=&quot;https://bsky.app/profile/gbif.org/post/3lqrbvruyih2g&quot;&gt;link&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Looking at these datasets it’s clear that &lt;a href=&quot;https://bsky.app/profile/rdmpage.bsky.social/post/3lqs23q7dca23&quot;&gt;something is wrong&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&quot;data&quot;&gt;Data&lt;/h2&gt;
&lt;p&gt;The datasets discussed are for CO1 Amplicon Sequence Variants from Madagascar, which are part of the &lt;a href=&quot;https://www.insectbiomeatlas.org&quot;&gt;Insect Biome Atlas&lt;/a&gt; project. The data is described in Miraldo et al. &lt;a href=&quot;https://doi.org/10.1038/s41597-025-05151-0&quot;&gt;https://doi.org/10.1038/s41597-025-05151-0&lt;/a&gt;. There are two datasets for Madagascar:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;CO1 Amplicon Sequence Variants of leaf litter arthropod communities collected at Malaise traps from the Insect Biome Atlas project in Madagascar &lt;a href=&quot;https://doi.org/10.15468/pad7pc&quot;&gt;https://doi.org/10.15468/pad7pc&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;CO1 Amplicon Sequence Variants of bulk arthropod samples (mild lysis) collected with Malaise traps from the Insect Biome Atlas project in Madagascar &lt;a href=&quot;https://doi.org/10.15468/6u5rum&quot;&gt;https://doi.org/10.15468/6u5rum&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;In case the data changes in the future I’ve made snapshots of the two datasets and uploaded them to Zenodo &lt;a href=&quot;https://doi.org/10.5281/zenodo.15599342&quot;&gt;doi:10.5281/zenodo.15599342&lt;/a&gt;. The files I downloaded (&lt;a href=&quot;https://doi.org/10.15468/dl.kwjyjt&quot;&gt;https://doi.org/10.15468/dl.kwjyjt&lt;/a&gt; and &lt;a href=&quot;https://doi.org/10.15468/dl.2p3z5q&quot;&gt;https://doi.org/10.15468/dl.2p3z5q&lt;/a&gt;) are the  GBIF annotated archives, hence they include the mapping between the taxonomic names and GBIF’s backbone taxonomy.&lt;/p&gt;
&lt;h2 id=&quot;problem&quot;&gt;Problem&lt;/h2&gt;
&lt;p&gt;In browsing the data on GBIF I noticed some striking distribution patterns: insects normally found in Europe and/or North America were also turning up in Madagascar, based solely on these metabarcoding datasets. For example, &lt;a href=&quot;https://www.gbif.org/species/5069511&quot;&gt;Helina impuncta&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;img src=&quot;https://github.com/rdmpage/insect-biome-atlas/raw/main/images/Helina_impuncta.png&quot; alt=&quot;Helina impuncta&quot; width=&quot;100%&quot;&gt;&lt;/p&gt;
&lt;p&gt;Metabarcoding data can be a complicated beast, especially if you try and navigate the multiple databases that house metadata on the sampling program and the output of sequencing machines. For example, GBIF occurrence &lt;a href=&quot;https://www.gbif.org/occurrence/5162479277&quot;&gt;5162479277&lt;/a&gt; is linked to ENA record &lt;a href=&quot;https://www.ebi.ac.uk/ena/browser/view/ERR12944764&quot;&gt;ERR12944764&lt;/a&gt; which in turn has multiple identifier links:&lt;/p&gt;

&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Study Accession&lt;/th&gt;
&lt;th&gt;Sample Accession&lt;/th&gt;
&lt;th&gt;Experiment Accession&lt;/th&gt;
&lt;th&gt;Run Accession&lt;/th&gt;
&lt;th&gt;Tax Id&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;PRJEB61109&lt;/td&gt;
&lt;td&gt;SAMEA115499645&lt;/td&gt;
&lt;td&gt;ERX12317105&lt;/td&gt;
&lt;td&gt;ERR12944764&lt;/td&gt;
&lt;td&gt;1234904&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;p&gt;What’s nice about the GBIF datasets is that they wrap all this up into a single package that we can explore. BLASTing a few sequences in these datasets suggests that the identifications of these sequences were probably correct, so the source of the problematic maps lies elsewhere.&lt;/p&gt;
&lt;h2 id=&quot;lots-of-maps&quot;&gt;Lots of maps&lt;/h2&gt;
&lt;p&gt;I wrote a simple PHP script to read the GBIF dataset, aggregate the GBIF taxon ids (i.e., the GBIF taxa that the sequences were mapped to) and draw a map for each taxon (code is on &lt;a href=&quot;https://github.com/rdmpage/insect-biome-atlas&quot;&gt;GitHub&lt;/a&gt;) . These maps use &lt;a href=&quot;https://techdocs.gbif.org/en/openapi/v2/maps&quot;&gt;GBIF’s maps API&lt;/a&gt; to retrieve a tile (256 x 256 pixels) showing the distribution of each taxon on a global map (i.e., zoom level 0 on a &lt;a href=&quot;https://en.wikipedia.org/wiki/Tiled_web_map&quot;&gt;tiled web map&lt;/a&gt;). I overlay that on a GBIF base map tile (see &lt;a href=&quot;https://tile.gbif.org/ui/&quot;&gt;Base Map Tiles&lt;/a&gt;), and dump the output as HTML.&lt;/p&gt;
&lt;p&gt;This is crude but gives a quick visual overview of the data. For the litter datasets there are a lot of these Euro-Madagascar distributions:&lt;/p&gt;
&lt;p&gt;&lt;img src=&quot;https://github.com/rdmpage/insect-biome-atlas/raw/main/images/litter.png&quot; alt=&quot;litter&quot; width=&quot;100%&quot;&gt;&lt;/p&gt;
&lt;p&gt;For the malaise trap data the results look much more like what I’d expect, lots of taxa restricted to Madagascar.&lt;/p&gt;
&lt;p&gt;&lt;img src=&quot;https://github.com/rdmpage/insect-biome-atlas/raw/main/images/malaise.png&quot; alt=&quot;malaise&quot; width=&quot;100%&quot;&gt;&lt;/p&gt;
&lt;p&gt;But there are still examples of the problematic pattern mentioned above.&lt;/p&gt;
&lt;h2 id=&quot;what-happened&quot;&gt;What happened?&lt;/h2&gt;
&lt;p&gt;In the paper describing the data there is a paragraph discussing contamination:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;As part of data clean-up, it is usually advised to remove ASVs present in negative controls, or the maximum number of reads for those, from the entire dataset71. However, after careful inspection of our negative controls, we noticed that only a few ASVs were persistently showing up in control samples. The majority of ASVs seemed to be arthropod sequences that were present in the bulk samples, and also sporadically present in negative controls in relatively small numbers. This was presumably due to DNA spreading between samples through tiny droplets during sample processing, or to low-level of “index hopping”, leading to incorrect assignment of reads during sequencing, despite the use of double-unique indexes in library preparation72. &lt;a href=&quot;https://www.nature.com/articles/s41597-025-05151-0#:~:text=As%20part%20of,in%20library%20preparation72.&quot;&gt;link&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The paper goes on to discuss possible examples of contamination. Looking at the results I suspect there has been a lot more contamination than the authors allow, especially for the litter dataset.&lt;/p&gt;
&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
&lt;p&gt;These results are preliminary, and I’ve contacted the authors of the paper to see if we can find out what happened. But for me the most obvious conclusions are:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Metabarcoding has the potential to generate a lot of spurious records that may negatively impact databases such as GBIF.&lt;/li&gt;
&lt;li&gt;One of the great features of GBIF is that it enables you to simply &lt;strong&gt;look at the data&lt;/strong&gt;. In an age of automated pipelines and big data I think visualisation is increasingly important. It’s often an easy way to discover that something is not as it should be.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
&lt;p&gt;Miraldo, A., Sundh, J., Iwaszkiewicz-Eggebrecht, E. et al. Data of the Insect Biome Atlas: a metabarcoding survey of the terrestrial arthropods of Sweden and Madagascar. Sci Data 12, 835 (2025). &lt;a href=&quot;https://doi.org/10.1038/s41597-025-05151-0&quot;&gt;https://doi.org/10.1038/s41597-025-05151-0&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Page, R. (2025). Snapshot of Insect Biome Atlas data for Madagascar from GBIF [Data set]. Zenodo. &lt;a href=&quot;https://doi.org/10.5281/zenodo.15599342&quot;&gt;https://doi.org/10.5281/zenodo.15599342&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2928368343736074927'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2928368343736074927'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/06/a-metabarcoding-mess-and-important-of.html' title='A metabarcoding mess and the importance of just looking at the data'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-8888971345141917925</id><published>2025-05-16T15:39:00.006+01:00</published><updated>2026-05-08T10:57:03.484+01:00</updated><title type='text'>Tracking changes in DNA barcode BINs</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2025). Tracking changes in DNA barcode BINs. &lt;a href=&quot;https://doi.org/10.59350/h97dq-dat02&quot;&gt;https://doi.org/10.59350/h97dq-dat02&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;Following on from releasing &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com&quot;&gt;BOLD View&lt;/a&gt; I’ve started to explore how the classification of DNA barcodes changes over time. BOLD uses the RESL algorithm described in Ratnasingham &amp;amp; Hebert (2013, 2016) to cluster barcodes into “BINs”. As the number of DNA barcodes grows over time these clusters may change. For example, some clusters may increase in size as barcodes are added, and some clusters may be merged as sequences of intermediate similarity are found that link those BINs. Within the public-facing BOLD portal there is no way to see the history of a BIN (Meier et al., 2022), so I decided to explore this. I downloaded a set of data packages from BOLD for the period 2022-2024, as well as the BARCODE 500K data for 2016. BOLD issues regular releases of its data; quarterly releases are persistent and receive a DOI. More regular releases don’t get a DOI and seem to disappear from the web site, but I have a copy of the release for 06-Sep-2024, which I used to create &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com&quot;&gt;BOLD View&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;The data packages  I’ve used to infer version history are listed below.&lt;/p&gt;

&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Dataset&lt;/th&gt;
&lt;th&gt;DOI&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;iBOLD.31-Dec-2016&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-ibold.31-dec-2016&quot;&gt;10.5883/dp-ibold.31-dec-2016&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.30-Mar-2022&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.30-mar-2022&quot;&gt;10.5883/dp-bold_public.30-mar-2022&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.06-Jul-2022&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.06-jul-2022&quot;&gt;10.5883/dp-bold_public.06-jul-2022&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.28-Sep-2022&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.28-sep-2022&quot;&gt;10.5883/dp-bold_public.28-sep-2022&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.30-Dec-2022&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.30-dec-2022&quot;&gt;10.5883/dp-bold_public.30-dec-2022&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.31-Mar-2023&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.31-mar-2023&quot;&gt;10.5883/dp-bold_public.31-mar-2023&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.30-Jun-2023&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.30-jun-2023&quot;&gt;10.5883/dp-bold_public.30-jun-2023&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.29-Sep-2023&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.29-sep-2023&quot;&gt;10.5883/dp-bold_public.29-sep-2023&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.29-Dec-2023&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.29-dec-2023&quot;&gt;10.5883/dp-bold_public.29-dec-2023&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.29-Mar-2024&lt;/td&gt;
  &lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.29-mar-2024&quot;&gt;10.5883/dp-bold_public.29-mar-2024&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.19-Jul-2024&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://doi.org/10.5883/dp-bold_public.19-jul-2024&quot;&gt;10.5883/dp-bold_public.19-jul-2024&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD_Public.06-Sep-2024&lt;/td&gt;
&lt;td&gt;no DOI&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;h2 id=&quot;versioning&quot;&gt;Versioning&lt;/h2&gt;
&lt;p&gt;I am only interested in a few of the fields in the data, namely &lt;code&gt;bin_uri&lt;/code&gt;, &lt;code&gt;identification&lt;/code&gt;, &lt;code&gt;identification_method&lt;/code&gt;, and &lt;code&gt;identified_by&lt;/code&gt;. Note that field names can change between data packages, so we may have to translate field names, or assemble a field’s value from other fields (e.g., taxonomic classification). Rather than store all the data I used &lt;a href=&quot;https://en.wikipedia.org/wiki/Tuple-versioning&quot;&gt;Tuple-versioning&lt;/a&gt;, so that we store values for &lt;code&gt;processid&lt;/code&gt; and the various data fields, together with values for &lt;code&gt;valid_from&lt;/code&gt; and &lt;code&gt;valid_to&lt;/code&gt;. The first time a combination of values is found we set &lt;code&gt;valid_from&lt;/code&gt; to the YYYY-MM-DD date of the corresponding data package, and &lt;code&gt;valid_to&lt;/code&gt; to &lt;code&gt;NULL&lt;/code&gt;. Note that we may have multiple barcodes for a given &lt;code&gt;processid&lt;/code&gt; (e.g., for different genes) so we index on both &lt;code&gt;processid&lt;/code&gt; and &lt;code&gt;marker_code&lt;/code&gt;. We also compute an MD5 hash of the data for a barcode to enable fast lookup of a particular set of values. The hash is not sufficient to identify an edit as the same set of values may have more than one period of validity. For example, a barcode may be in one BIN, then move to another, then move back again.&lt;/p&gt;
&lt;p&gt;When we load the first data package (iBOLD.31-Dec-2016) all rows in the database will have NULL values for &lt;code&gt;valid_to&lt;/code&gt;. This signals that those values for the data are currently valid. We then add the remaining data packages from oldest to most recent. For each barcode, if the data for a barcode in the current package is the same as that already in the database (i.e., for which &lt;code&gt;valid_to&lt;/code&gt; is &lt;code&gt;NULL&lt;/code&gt;) we do nothing. But if the data has changed we do the following:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;set &lt;code&gt;valid_to&lt;/code&gt; for the most recent row to the YYYY-MM-DD date of the current data package&lt;/li&gt;
&lt;li&gt;add a new row with &lt;code&gt;valid_from&lt;/code&gt; set to the same date, and &lt;code&gt;valid_to&lt;/code&gt; set to NULL.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;At the end of this process we have a list of values for the selected fields for each barcode, together with the time span that those values were valid.&lt;/p&gt;
&lt;h2 id=&quot;queries&quot;&gt;Queries&lt;/h2&gt;
&lt;p&gt;There are two kinds of queries I’ve explored so far. The first is tracking the changes for an individual barcode, the other is the history of a BIN.&lt;/p&gt;
&lt;h3 id=&quot;barcode-histories&quot;&gt;Barcode histories&lt;/h3&gt;
&lt;p&gt;Here is the history for &lt;strong&gt;XAF587-05&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;2022-03-30 - 2022-09-28&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;identification: &lt;em&gt;Poanes hobomok&lt;/em&gt;&lt;/li&gt;
&lt;li&gt;identified_by: Paul Hebert&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;2022-09-28 - 2024-07-19&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;identification: &lt;em&gt;Lon hobomok&lt;/em&gt;&lt;/li&gt;
&lt;li&gt;identified_by: Paul Hebert&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;2024-07-19 -&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;identification: &lt;em&gt;Lon hobomok&lt;/em&gt;&lt;/li&gt;
&lt;li&gt;identified_by: Paul D.N. Hebert&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;This example shows that we need to be careful when counting edits to a barcode. We could simply record these as changes in identification and identifier, but it is a little more complicated. &lt;em&gt;Poanes hobomok&lt;/em&gt; and &lt;em&gt;Lon hobomok&lt;/em&gt; are synonyms (Cong et al., 2019), so we’ve not changed the taxonomic identification, merely the name. In the absence of a single authoritative source of taxonomic names and synonyms I use TAXMATCH-like rules to “stem” the species names (Boyle et al., 2013), so that if two values of &lt;code&gt;identification&lt;/code&gt; have the same species epithet (taking into account possible change in gender of the genus name) I treat these as changes in name, not identification. The other change is from “Paul Hebert” to “Paul D.N. Hebert”, which is clearly the same person. I compute the &lt;a href=&quot;https://en.wikipedia.org/wiki/Levenshtein_distance&quot;&gt;Levenshtein distance&lt;/a&gt; between values of &lt;code&gt;identified_by&lt;/code&gt; and treat any value &amp;gt; 5 as a different name (5 was chosen so that “Paul Hebert” and “Paul D.N. Hebert” would be treated as the same name).&lt;/p&gt;
&lt;h3 id=&quot;bins&quot;&gt;BINs&lt;/h3&gt;
&lt;p&gt;For BINs I reconstruct the history by taking a BIN and finding all barcodes that have, at any point in time, been a member of that BIN. So far the best way I’ve come up with to visualise the changes in a BIN is to create a “storyline” (see Liu et al., 2013) where the composition of each BIN is shown at each timeslice.&lt;/p&gt;
&lt;p&gt;For example, here is the history of BIN &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/bin/BOLD:ABX0491&quot;&gt;BOLD:ABX0491&lt;/a&gt; which contains barcodes identified as &lt;em&gt;Rhamma&lt;/em&gt;, &lt;em&gt;Rhamma anosma&lt;/em&gt;, and &lt;em&gt;Rhamma bilix&lt;/em&gt; (Prieto et al., 2021).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj1IKfFFhMkm7q7TS4pNhOXGhNisKPxHvryqhfcrlcsPlBspXaXB8ObxZ8uQJChGS9mNMHmsXuirdKJHrxrsdfNlQqFTN59vVC4g7x6B1IV9BoxSXQNQui66efg6XTeG_Gd8joXdyOCcBDaqOgfIOpOUcFr_ZnnbZSjnPml8nmP-ATOO4teTEhq/s1688/graphviz-2.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1688&quot; data-original-width=&quot;821&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj1IKfFFhMkm7q7TS4pNhOXGhNisKPxHvryqhfcrlcsPlBspXaXB8ObxZ8uQJChGS9mNMHmsXuirdKJHrxrsdfNlQqFTN59vVC4g7x6B1IV9BoxSXQNQui66efg6XTeG_Gd8joXdyOCcBDaqOgfIOpOUcFr_ZnnbZSjnPml8nmP-ATOO4teTEhq/s400/graphviz-2.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;The vertical columns are time slices, barcodes in the same BIN are grouped together in coloured rectangles, and the history of each barcode can be traced from left to right. You can see cases where barcodes have moved between BINs (BOLD:ABX0491 gobbled up two smaller BINs). There are also barcodes that were (for one time slice) not in any BIN.&lt;/p&gt;
&lt;p&gt;This visualisation has been challenging to create; I ended up using &lt;a href=&quot;https://graphviz.org&quot;&gt;Graphviz&lt;/a&gt; as implemented in GraphvizOnline (&lt;a href=&quot;https://dreampuf.github.io/GraphvizOnline&quot;&gt;https://dreampuf.github.io/GraphvizOnline&lt;/a&gt;).&lt;/p&gt;
&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
&lt;p&gt;This is still early stages, but it looks promising. The next step would be to incorporate it into BOLD View. It might also be interesting to develop measures of stability of barcode clustering based on how often members move around.&lt;/p&gt;
&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
&lt;ul&gt;
&lt;li&gt;Boyle, B., Hopkins, N., Lu, Z., Raygoza Garay, J. A., Mozzherin, D., Rees, T., Matasci, N., Narro, M. L., Piel, W. H., Mckay, S. J., Lowry, S., Freeland, C., Peet, R. K., &amp;amp; Enquist, B. J. (2013). The taxonomic name resolution service: an online tool for automated standardization of plant names. BMC Bioinformatics, 14(1). &lt;a href=&quot;https://doi.org/10.1186/1471-2105-14-16&quot;&gt;https://doi.org/10.1186/1471-2105-14-16&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Cong, Q., Zhang, J., Shen, J., &amp;amp; Grishin, N. V. (2019). Fifty new genera of Hesperiidae (Lepidoptera).  &lt;em&gt;Insecta Mundi&lt;/em&gt;,  &lt;em&gt;2019&lt;/em&gt;, 0731.  &lt;a href=&quot;https://doi.org/10.5281/zenodo.3677235&quot;&gt;https://doi.org/10.5281/zenodo.3677235&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Hebert, P., &amp;amp; Ratnasingham, S. (2016).  &lt;em&gt;Systems, methods, and computer program products for merging a new nucleotide or amino acid sequence into operational taxonomic units&lt;/em&gt;  (United States Patent US20160103958A1).  &lt;a href=&quot;https://patents.google.com/patent/US20160103958A1&quot;&gt;https://patents.google.com/patent/US20160103958A1&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Liu, S., Wu, Y., Wei, E., Liu, M., &amp;amp; Liu, Y. (2013). StoryFlow: Tracking the Evolution of Stories.  &lt;em&gt;IEEE Transactions on Visualization and Computer Graphics&lt;/em&gt;,  &lt;em&gt;19&lt;/em&gt;(12), 2436–2445.  &lt;a href=&quot;https://doi.org/10.1109/TVCG.2013.196&quot;&gt;https://doi.org/10.1109/TVCG.2013.196&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Meier, R., Blaimer, B.B., Buenaventura, E., Hartop, E., von Rintelen, T., Srivathsan, A. and Yeo, D. (2022), A re-analysis of the data in Sharkey et al.’s (2021) minimalist revision reveals that BINs do not deserve names, but BOLD Systems needs a stronger commitment to open science. Cladistics, 38: 264-275. &lt;a href=&quot;https://doi.org/10.1111/cla.12489&quot;&gt;https://doi.org/10.1111/cla.12489&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Prieto, C., Faynel, C., Robbins, R., &amp;amp; Hausmann, A. (2021). Congruence between morphology-based species and Barcode Index Numbers (BINs) in Neotropical Eumaeini (Lycaenidae).  &lt;em&gt;PeerJ&lt;/em&gt;,  &lt;em&gt;9&lt;/em&gt;, e11843.  &lt;a href=&quot;https://doi.org/10.7717/peerj.11843&quot;&gt;https://doi.org/10.7717/peerj.11843&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Ratnasingham, S., &amp;amp; Hebert, P. D. N. (2013). A DNA-Based Registry for All Animal Species: The Barcode Index Number (BIN) System.  &lt;em&gt;PLOS ONE&lt;/em&gt;,  &lt;em&gt;8&lt;/em&gt;(7), e66213.  &lt;a href=&quot;https://doi.org/10.1371/journal.pone.0066213&quot;&gt;https://doi.org/10.1371/journal.pone.0066213&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/8888971345141917925'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/8888971345141917925'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/05/tracking-changes-in-dna-barcode-bins.html' title='Tracking changes in DNA barcode BINs'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj1IKfFFhMkm7q7TS4pNhOXGhNisKPxHvryqhfcrlcsPlBspXaXB8ObxZ8uQJChGS9mNMHmsXuirdKJHrxrsdfNlQqFTN59vVC4g7x6B1IV9BoxSXQNQui66efg6XTeG_Gd8joXdyOCcBDaqOgfIOpOUcFr_ZnnbZSjnPml8nmP-ATOO4teTEhq/s72-c/graphviz-2.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-6920828515063524066</id><published>2025-04-11T14:34:00.004+01:00</published><updated>2025-04-11T14:48:40.941+01:00</updated><title type='text'>Future interfaces for the Biodiversity Heritage Library</title><content type='html'>&lt;p&gt;On Wednesday this week (April 9th, 2025) I gave a talk entitled “Future interface(s) for BHL” (the slides are &lt;a href=&quot;https://doi.org/10.6084/m9.figshare.28777868.v1&quot;&gt;on FigShare&lt;/a&gt;) at &lt;a href=&quot;https://about.biodiversitylibrary.org/get-involved/events/bhl-day-2025/&quot;&gt;BHL Day 2025&lt;/a&gt;. My goal was to introduce “BHL-Light”, an exploration of an alternative interface to &lt;a href=&quot;https://www.biodiversitylibrary.org&quot;&gt;the Biodiversity Heritage Library (BHL)&lt;/a&gt;. 
As some readers may already know, BHL is coming to a crossroads, and so this presentation felt a bit more urgent than my usual “here’s yet another web site I made”.&lt;/p&gt;
&lt;h2 id=&quot;bhl-light&quot;&gt;BHL-Light&lt;/h2&gt;
&lt;p&gt;&lt;a href=&quot;https://bhl-light-806b8b6fa073.herokuapp.com&quot;&gt;BHL-Light&lt;/a&gt; is my attempt to explore other ways of navigating BHL. The current interface is somewhat dated, and I wanted to start from scratch and see what might be possible to create, even for someone with my somewhat limited skills. BHL-Light has only a very small subset of BHL’s content; I’m putting scalability issues to one side so that I can have some fun.&lt;/p&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEga9m_dhCY2fAHa41xVzcPqmqmXn1M2H_flYWlrbBw8SCCsrMU8RDPbNNPa6hetGPQ04VYCu7uJmzxMHtKbxi_-00Z2FpIIrYSfTou1Z-dVCcKKVZOK65rdtBV3NlVlJjPv2xn0EYfpStID5tjAgVsZkdDOjfAcNRuOB2RERCxXv3hHtVTk0H70/s1167/Screenshot%202025-04-04%20at%2011.09.11.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1167&quot; data-original-width=&quot;1076&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEga9m_dhCY2fAHa41xVzcPqmqmXn1M2H_flYWlrbBw8SCCsrMU8RDPbNNPa6hetGPQ04VYCu7uJmzxMHtKbxi_-00Z2FpIIrYSfTou1Z-dVCcKKVZOK65rdtBV3NlVlJjPv2xn0EYfpStID5tjAgVsZkdDOjfAcNRuOB2RERCxXv3hHtVTk0H70/s400/Screenshot%202025-04-04%20at%2011.09.11.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;h2 id=&quot;the-tech-tldr-bhl-was-not-harmed-in-the-making-of-this&quot;&gt;The tech (TL;DR BHL was not harmed in the making of this)&lt;/h2&gt;
&lt;p&gt;Under the hood, BHL-Light stores BHL metadata, OCR text, and layout information as JSON documents in &lt;a href=&quot;https://couchdb.apache.org&quot;&gt;CouchDB&lt;/a&gt; (one of my favourite databases for exploring new ideas).&lt;/p&gt;
&lt;p&gt;BHL serves its images from &lt;a href=&quot;https://archive.org&quot;&gt;Internet Archive&lt;/a&gt;, which is not always available. BHL recently uploaded images to &lt;a href=&quot;https://registry.opendata.aws/bhl-open-data/&quot;&gt;AWS&lt;/a&gt;, but the images there are not currently viewable on the web. So I ended up creating my own image server. I used &lt;a href=&quot;https://www.hetzner.com/storage/object-storage/&quot;&gt;Hetzner’s S3-compatible object storage&lt;/a&gt; for the image files, added &lt;a href=&quot;https://imgproxy.net&quot;&gt;imgproxy&lt;/a&gt; to resize images as needed, and finally put all this behind a &lt;a href=&quot;https://www.cloudflare.com/&quot;&gt;Cloudflare CDN&lt;/a&gt; to speed up image delivery (and reduce traffic to the S3 store, which becomes a real consideration when one is paying for all of this).&lt;/p&gt;
&lt;p&gt;To view BHL content (e.g., books, journal volumes) I wrote my own viewer, modelled loosely on Google Books. I expressly wanted to avoid &lt;a href=&quot;https://iiif.io&quot;&gt;IIIF&lt;/a&gt; because I find IIIF viewers a terrible way to view documents, and for me BHL is all about the text.&lt;/p&gt;
&lt;p&gt;The web site itself is a few PHP scripts to glue everything together, and I’ve tried to avoid using Javascript unless absolutely necessary. HTML + CSS is really powerful these days, so you can do a lot without resorting to Javascript.&lt;/p&gt;
&lt;h2 id=&quot;tour&quot;&gt;Tour&lt;/h2&gt;
&lt;p&gt;In building BHL-Light I’ve wanted a simple interface to concentrate on displaying content as much as possible. I also wanted a cleaner interface, one that is responsive (AKA &quot;mobile friendly&quot;).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;iframe allowfullscreen=&#39;allowfullscreen&#39; webkitallowfullscreen=&#39;webkitallowfullscreen&#39; mozallowfullscreen=&#39;mozallowfullscreen&#39; width=&#39;400&#39; height=&#39;322&#39; src=&#39;https://www.blogger.com/video.g?token=AD6v5dzTIGEKTa3bXeOKT26JXRcaeVXc9XN3R82CR-5FF4kmdsTvUlAlNeDvNY_hc6WfV4dB3pj9HG_jmF0&#39; class=&#39;b-hbp-video b-uploaded&#39; frameborder=&#39;0&#39;&gt;&lt;/iframe&gt;&lt;/div&gt;
  
&lt;p&gt;BHL has some extraordinary content. It has works both old and new.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjT-rB_WhH8s6IYna8OeEKyuNolj1SfY9eUKcU0tXuIX9TPfqSgw_A_XBIOGgq2R-xfJAqB1fO_-QP1qz4C5YaNw139aIx-Kc7hHd5NT38KD1lS4OB38H1oHEXtvgsHkcSZdDig-xwp389gL_-gZfVTgGHC33WVCdXUT5dpH6D2tX6Fw-IFRMHz/s1187/Screenshot%202025-04-04%20at%2011.15.32.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1187&quot; data-original-width=&quot;1073&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjT-rB_WhH8s6IYna8OeEKyuNolj1SfY9eUKcU0tXuIX9TPfqSgw_A_XBIOGgq2R-xfJAqB1fO_-QP1qz4C5YaNw139aIx-Kc7hHd5NT38KD1lS4OB38H1oHEXtvgsHkcSZdDig-xwp389gL_-gZfVTgGHC33WVCdXUT5dpH6D2tX6Fw-IFRMHz/s320/Screenshot%202025-04-04%20at%2011.15.32.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgFAL-Japu4uBncDE4cbTaeSY1Kegd-ScPTsdl6JjYE_6cfCg3PknHzuZIppA9wuni3a7JsMoCz-qu2QgkMb24yg4NGK_wVqy3J0Aou0obBO8T4apZyVbU-MO-kOO3qC23j6hUXLhl4GG6LVCbleiUZHgwy-QJtCtTh5ceeAzLx5IF8oBmGhY0F/s1192/Screenshot%202025-04-04%20at%2011.11.51.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1192&quot; data-original-width=&quot;1075&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgFAL-Japu4uBncDE4cbTaeSY1Kegd-ScPTsdl6JjYE_6cfCg3PknHzuZIppA9wuni3a7JsMoCz-qu2QgkMb24yg4NGK_wVqy3J0Aou0obBO8T4apZyVbU-MO-kOO3qC23j6hUXLhl4GG6LVCbleiUZHgwy-QJtCtTh5ceeAzLx5IF8oBmGhY0F/s320/Screenshot%202025-04-04%20at%2011.11.51.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;h3&gt;Text&lt;/h3&gt;

&lt;p&gt;The viewer I built makes it easy to scroll through an item, and also makes text selectable (something you currently can’t do in BHL). This means you can interact with text in the browser, such as using Google Chrome to translate part of the text.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;iframe allowfullscreen=&#39;allowfullscreen&#39; webkitallowfullscreen=&#39;webkitallowfullscreen&#39; mozallowfullscreen=&#39;mozallowfullscreen&#39; width=&#39;400&#39; height=&#39;322&#39; src=&#39;https://www.blogger.com/video.g?token=AD6v5dwPyyCGfFo6FZA0y0IBgEPp515puZAzP_7G0Dkv6MSvz24x8gh4P08IKoI4AT5eT0URvFI9a3GCUvA&#39; class=&#39;b-hbp-video b-uploaded&#39; frameborder=&#39;0&#39;&gt;&lt;/iframe&gt;&lt;/div&gt;

&lt;p&gt;It also opens up the possibility of annotation using &lt;a href=&quot;https://hypothes.is&quot;&gt;Hypothes.is&lt;/a&gt;.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;iframe allowfullscreen=&#39;allowfullscreen&#39; webkitallowfullscreen=&#39;webkitallowfullscreen&#39; mozallowfullscreen=&#39;mozallowfullscreen&#39; width=&#39;400&#39; height=&#39;322&#39; src=&#39;https://www.blogger.com/video.g?token=AD6v5dyHjd5mD2HC1gjRsD4w6o07KxoQZSnyM1xWU4kw2NrgMcwPI_V3kcUAPmPaBPQDo1inAUrZKrjHbPQ&#39; class=&#39;b-hbp-video b-uploaded&#39; frameborder=&#39;0&#39;&gt;&lt;/iframe&gt;&lt;/div&gt;

&lt;h3&gt;Geotagging and maps&lt;/h3&gt;

&lt;p&gt;I also demonstrated pages that had been geotagged. These tags can be extracted and used to create an interactive map.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh1hU9MMR0XRIeoFZmHXA-MiszbFVHz_p_OOGR_WDNaforuPnAavN81OogkID6QsvoZ_D-xRhZF4B_dCSe4EtT2qhijTlCo4ZprmAgmLhDqG_MDy2FoyF8sm9diYO8ltxlNSwmLWZT_uMm5zuWdiNJ0gJrQ_cbGWEoBPZuN7Qi_1AP-S2dmY-bq/s1193/Screenshot%202025-04-04%20at%2011.10.27.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1193&quot; data-original-width=&quot;1074&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh1hU9MMR0XRIeoFZmHXA-MiszbFVHz_p_OOGR_WDNaforuPnAavN81OogkID6QsvoZ_D-xRhZF4B_dCSe4EtT2qhijTlCo4ZprmAgmLhDqG_MDy2FoyF8sm9diYO8ltxlNSwmLWZT_uMm5zuWdiNJ0gJrQ_cbGWEoBPZuN7Qi_1AP-S2dmY-bq/s400/Screenshot%202025-04-04%20at%2011.10.27.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I still haven’t decided on the best way to interact with the map. For example, should we use the map to search for content geographically, or should we search for content and display those results on a map, or both? I ran out of time to resolve this, so for now if you click on the map you see an &lt;a href=&quot;https://h3geo.org&quot;&gt;H3 hexagon&lt;/a&gt; that encloses where you click. The idea is that then the page would display BHL content within that area. Other ideas include something like &lt;a href=&quot;https://iphylo.blogspot.com/2019/05/frankenplace-geospatial-search-and.html&quot;&gt;Frankenplace&lt;/a&gt; or &lt;a href=&quot;https://journalmap.org&quot;&gt;JournalMap&lt;/a&gt;.&lt;/p&gt;

&lt;h3&gt;Document layout&lt;/h3&gt;

&lt;p&gt;For me one of the most exciting areas for the future is adding document layout information to BHL content, such that not only can we identify articles, but figures, tables, references, etc. In this way BHL could finally offer something akin to what &lt;a href=&quot;https://plazi.org&quot;&gt;Plazi&lt;/a&gt; can deliver: structured text about species. This has seemed a challenging task, but recent AI developments have been a game changer. In particular, &lt;a href=&quot;https://www.datalab.to&quot;&gt;Datalab&lt;/a&gt; have released powerful and simple-to-use tools that do a very good job of retrieving document structure from scanned pages. I have started to use this on BHL content and display the results on BHL-Light. For example, Datalab makes it almost trivial to identify and extract figures from scanned pages. Below is a comparison of document layout for a page as inferred from a born-digital PDF by Plazi, and the same page in BHL where it is simply an image, but Datalab&#39;s methods have inferred which bits are text, figures, captions, etc.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiqK8f2RpRFXfb8A-lEJRgevnNE6KzXZLaaFAZveF5byW_FEuHpugcPuw-Xa6nzT4GMOWWx1PmRs-5ggAVOBjduSsOdNCJFi6GsYFbKYIas72SEJS6M7kmGJCDyqakaMeE3sPEFQaRekx4ZaleYQj55c_p2o58O9QQKxaF4pkQSMEdpqM-M_6Pw/s998/Screenshot%202025-04-04%20at%2017.44.46.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;998&quot; data-original-width=&quot;725&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiqK8f2RpRFXfb8A-lEJRgevnNE6KzXZLaaFAZveF5byW_FEuHpugcPuw-Xa6nzT4GMOWWx1PmRs-5ggAVOBjduSsOdNCJFi6GsYFbKYIas72SEJS6M7kmGJCDyqakaMeE3sPEFQaRekx4ZaleYQj55c_p2o58O9QQKxaF4pkQSMEdpqM-M_6Pw/s320/Screenshot%202025-04-04%20at%2017.44.46.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg1av9KAb2M8ncGYC2JRj6YM4rGqF07RfW0H8oZSDAP_vhxwWOE6krr_5XH-qGmSJS9zijhMWYztq4pLGG5DC1_Yn9d-QMAfXDp-JSbV2myY3sYzddjUhg0WiLTk9Kw5pIF5P5MpDktY19oIKhz-qWc3u190cELrKmwN449Y5rrgeMa1IlUuSME/s1023/Screenshot%202025-04-04%20at%2017.44.37.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1023&quot; data-original-width=&quot;809&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg1av9KAb2M8ncGYC2JRj6YM4rGqF07RfW0H8oZSDAP_vhxwWOE6krr_5XH-qGmSJS9zijhMWYztq4pLGG5DC1_Yn9d-QMAfXDp-JSbV2myY3sYzddjUhg0WiLTk9Kw5pIF5P5MpDktY19oIKhz-qWc3u190cELrKmwN449Y5rrgeMa1IlUuSME/s320/Screenshot%202025-04-04%20at%2017.44.37.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
  
  
&lt;p&gt;One unexpected consequence of building my own image server (see above) is that the task of displaying figures by cropping page images becomes almost trivial. This idea was inspired in part by &lt;a href=&quot;https://doi.org/10.5334/johd.284&quot;&gt;Smits et al.’s approach&lt;/a&gt; of cropping Internet Archive images.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhY7x6jwFW_Ckn2INN0Erw22h3trkhiwnyaobFKZ6xGTunQUcJ9Cq6IQBSU-71vdgSGy2Vi9gJSg0uw9cFNjgdhMRj7u7mT7WAHqIe7iBiADekeNFIcuqWRyrG_kcYEDywh8BUbTl5YDBNg_rnAmDKdl7tbRk2-rJKc8Lhq2bkvfQrrs_LP1INL/s1268/Screenshot%202025-04-04%20at%2017.35.11.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1268&quot; data-original-width=&quot;1170&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhY7x6jwFW_Ckn2INN0Erw22h3trkhiwnyaobFKZ6xGTunQUcJ9Cq6IQBSU-71vdgSGy2Vi9gJSg0uw9cFNjgdhMRj7u7mT7WAHqIe7iBiADekeNFIcuqWRyrG_kcYEDywh8BUbTl5YDBNg_rnAmDKdl7tbRk2-rJKc8Lhq2bkvfQrrs_LP1INL/s400/Screenshot%202025-04-04%20at%2017.35.11.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;h2 id=&quot;what’s-next&quot;&gt;What’s next?&lt;/h2&gt;
&lt;p&gt;There is much to do. BHL-Light is missing many features. It doesn’t make it easy to find content such as articles found by &lt;a href=&quot;https://biostor.org&quot;&gt;BioStor&lt;/a&gt;, the project I started over a decade ago to find articles in BHL. Search is rudimentary at best, and I haven’t tackled taxonomic names yet (but have ideas for this).&lt;/p&gt;
&lt;p&gt;For me BHL-Light is a fun way to explore BHL, and its development has made me even more aware of all the work done to create and maintain the current BHL portal. Apart from being a plaything for me, I am curious as to whether BHL-Light might be a way to have “BHL-mini” portals, rather like &lt;a href=&quot;https://www.gbif.org/hosted-portals&quot;&gt;GBIF hosted portals&lt;/a&gt;. In this way, we could have views of BHL focused on a particular taxon, institution, person, etc., or localised by language and/or country. Perhaps we could de-extinct past projects such as &lt;a href=&quot;https://en.wikipedia.org/wiki/Biodiversity_Heritage_Library_for_Europe&quot;&gt;BHL-Europe&lt;/a&gt;?&lt;/p&gt;
&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
&lt;p&gt;Page, R.D. Extracting scientific articles from a large digital archive: BioStor and the Biodiversity Heritage Library. BMC Bioinformatics 12, 187 (2011). &lt;a href=&quot;https://doi.org/10.1186/1471-2105-12-187&quot;&gt;https://doi.org/10.1186/1471-2105-12-187&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Page, Roderic (2025). Future interface(s) for BHL. figshare. Presentation. &lt;a href=&quot;https://doi.org/10.6084/m9.figshare.28777868.v1&quot;&gt;https://doi.org/10.6084/m9.figshare.28777868.v1&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Smits, T., Warner, B., Fyfe, P., &amp;amp; Lee, B. C. G. (2025). A Fully-Searchable Multimodal Dataset of the Illustrated London News, 1842–1890. Journal of Open Humanities Data, 11(1), 10. &lt;a href=&quot;https://doi.org/10.5334/johd.284&quot;&gt;https://doi.org/10.5334/johd.284&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/6920828515063524066'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/6920828515063524066'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/04/future-interfaces-for-biodiversity.html' title='Future interfaces for the Biodiversity Heritage Library'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEga9m_dhCY2fAHa41xVzcPqmqmXn1M2H_flYWlrbBw8SCCsrMU8RDPbNNPa6hetGPQ04VYCu7uJmzxMHtKbxi_-00Z2FpIIrYSfTou1Z-dVCcKKVZOK65rdtBV3NlVlJjPv2xn0EYfpStID5tjAgVsZkdDOjfAcNRuOB2RERCxXv3hHtVTk0H70/s72-c/Screenshot%202025-04-04%20at%2011.09.11.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-1333929251788849413</id><published>2025-02-26T17:31:00.005+00:00</published><updated>2025-11-19T12:19:38.522+00:00</updated><title type='text'>BOLD View: exploring DNA barcodes</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2025). BOLD View: exploring DNA barcodes. &lt;a href=&quot;https://doi.org/10.59350/81kzw-qy18&quot;&gt;https://doi.org/10.59350/81kzw-qy18&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;For a while now I’ve been exploring ways to navigate through DNA barcodes. Over the years I’ve built various “toys” to explore barcodes, such as &lt;a href=&quot;https://iphylo.blogspot.com/2014/03/displaying-dna-barcodes-on-google-maps.html&quot;&gt;Displaying a million DNA barcodes on Google Maps using CouchDB&lt;/a&gt;, built a small scale browser using Elastic search that &lt;a href=&quot;https://iphylo.blogspot.com/2020/10/gbif-challenge-success.html&quot;&gt;had some success&lt;/a&gt;, and discovered that Postgres can search for DNA sequences and it’s &lt;a href=&quot;https://iphylo.blogspot.com/2023/07/sub-second-searching-of-millions-of-dna.html&quot;&gt;really fast&lt;/a&gt;. At the same time, I’ve bemoaned the &lt;a href=&quot;https://iphylo.blogspot.com/2016/12/ibol-dna-barcodes-in-gbif.html&quot;&gt;challenges of getting barcode data into GBIF&lt;/a&gt;, and the &lt;a href=&quot;https://iphylo.blogspot.com/2024/10/exploring-bolds-dna-barcode-data.html&quot;&gt;current state of BOLD’s data exports&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Over the last few months I’ve been getting a project to the point where it’s usable, and today I’ve released a live version called &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/&quot;&gt;BOLD view&lt;/a&gt;. Why make a portal to DNA barcodes when BOLD have themselves recently released a &lt;a href=&quot;https://portal.boldsystems.org&quot;&gt;new version of their own portal&lt;/a&gt; you might ask? There are two reasons. Making my own forces me to explore the barcode data in some detail, which is eye-opening in places. The second reason is that I want to be able to explore the barcode data at various levels and in different ways.  For example, I want an interactive global map of barcodes.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj3GKg5St5S2-jThsVRtvUbKzPrnlA-PhwprUvGviv6FGCxtG01rirOJvyQs4yZ7xmGu5wcDZCD13NFDun5RepVuOH_xtQiazgkOdcP3-W2YeG6cb6naRXTiihv03ORaKM_juoipXRx748t7hUDd8YI2J7bhpOCQ52ZmTAl6SvGkoxxUj74RloB/s1724/Screenshot%202025-02-26%20at%2017.21.30.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1650&quot; data-original-width=&quot;1724&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj3GKg5St5S2-jThsVRtvUbKzPrnlA-PhwprUvGviv6FGCxtG01rirOJvyQs4yZ7xmGu5wcDZCD13NFDun5RepVuOH_xtQiazgkOdcP3-W2YeG6cb6naRXTiihv03ORaKM_juoipXRx748t7hUDd8YI2J7bhpOCQ52ZmTAl6SvGkoxxUj74RloB/s400/Screenshot%202025-02-26%20at%2017.21.30.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I want to see a DNA barcode in context, including a phylogeny that includes barcodes both within and outside the BIN the barcode belongs to.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhkMEr_AOVPJsnsRz9sxRl3z3E_hDjR-O-KQGS-0u0C7SUgzl1K9jRNrZlDznMOMwp6Bc-LCygJZJjtuYN9OvjchdJIdUe6-NBwRqrKapIhhl2T6qFpLThysill0h855kr7Q8JC2yF8B-YISnG9pmCqCXbcLP-EZEmzziDN3wCvG8Q0ApXCs1YR/s1724/Screenshot%202025-02-26%20at%2017.21.54.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1636&quot; data-original-width=&quot;1724&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhkMEr_AOVPJsnsRz9sxRl3z3E_hDjR-O-KQGS-0u0C7SUgzl1K9jRNrZlDznMOMwp6Bc-LCygJZJjtuYN9OvjchdJIdUe6-NBwRqrKapIhhl2T6qFpLThysill0h855kr7Q8JC2yF8B-YISnG9pmCqCXbcLP-EZEmzziDN3wCvG8Q0ApXCs1YR/s400/Screenshot%202025-02-26%20at%2017.21.54.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I want to make the imagery more visible.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg5JxN_99hQ6NpUFSiSMkGQG0mbx2NNI8ogVXV_JPVaCmPa3xHd7HBHy2QPD33kwZQ-fNY4dDohhIpb231Vl6NyHa3yufQQ-liu9yl4xjdS94ympJxddcs3ZtWO60bP4FieoushDbcLDmjWJSukKUBdzd0Ykj_X8mIkeIzUTN28MenniQ4F_kg3/s1716/Screenshot%202025-02-26%20at%2017.23.19.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1644&quot; data-original-width=&quot;1716&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg5JxN_99hQ6NpUFSiSMkGQG0mbx2NNI8ogVXV_JPVaCmPa3xHd7HBHy2QPD33kwZQ-fNY4dDohhIpb231Vl6NyHa3yufQQ-liu9yl4xjdS94ympJxddcs3ZtWO60bP4FieoushDbcLDmjWJSukKUBdzd0Ykj_X8mIkeIzUTN28MenniQ4F_kg3/s400/Screenshot%202025-02-26%20at%2017.23.19.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I want to be able to navigate the taxonomy underlying the barcodes using tools such as &lt;a href=&quot;https://iphylo.blogspot.com/2021/05/maximum-entropy-summary-trees-to.html&quot;&gt;summary trees&lt;/a&gt;.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgBwV5paNoOoe0VYUlTgTnq8fWjPuMrRMbGD8p9V4Uh5QzOY7myapDklfhRgSzDt0flmPNLDIX2BRI1QcKE8RCW5k7b1vE6aVkeGAiHYdZsLtn69Qnu5a0NcTf72fg_9lN15xm0hF7LWnwP5PAcKMwpCktvHh41X2NJmYir0kFoyhLZbn92TqZF/s1726/Screenshot%202025-02-26%20at%2017.23.59.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1410&quot; data-original-width=&quot;1726&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgBwV5paNoOoe0VYUlTgTnq8fWjPuMrRMbGD8p9V4Uh5QzOY7myapDklfhRgSzDt0flmPNLDIX2BRI1QcKE8RCW5k7b1vE6aVkeGAiHYdZsLtn69Qnu5a0NcTf72fg_9lN15xm0hF7LWnwP5PAcKMwpCktvHh41X2NJmYir0kFoyhLZbn92TqZF/s400/Screenshot%202025-02-26%20at%2017.23.59.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I want to be able to input a DNA sequence and quickly search for matches.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhFZpJw9m5mZyijiZATEvrO4yLG-sMqnv3HGrV__gTw9WOu2OcS9QDG6nXNvxkskJKecpbaG0m_vkPjEVYo6oaQxxqErvb84BeklzSlcf-CfMCogAqOI1nZAW8mRTa3YdChO3GIBrFrWIQJGAr5Z-rt6VrkpAN5J006cUV7-g4VgkjKpJX8xMhb/s1718/Screenshot%202025-02-26%20at%2017.24.49.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1638&quot; data-original-width=&quot;1718&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhFZpJw9m5mZyijiZATEvrO4yLG-sMqnv3HGrV__gTw9WOu2OcS9QDG6nXNvxkskJKecpbaG0m_vkPjEVYo6oaQxxqErvb84BeklzSlcf-CfMCogAqOI1nZAW8mRTa3YdChO3GIBrFrWIQJGAr5Z-rt6VrkpAN5J006cUV7-g4VgkjKpJX8xMhb/s400/Screenshot%202025-02-26%20at%2017.24.49.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I also want to be able to connect the barcodes to the science behind them (who created the barcodes and what questions were they addressing?).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTIy6hCGEl9FPazaTESuK1DG6E6V8mFuetI6sz6VnVuQ0qMY7ofEWC2zOr5EcpMsKi8iL5Ieoqtvt5EXfTHJxoLS9IQNAonhg7vCD_9sOG8dzJaIcM1ZmSBESrVRPH7YTbcq0W4_SzpJVqOPt1X2Y3UEnPnOZC-KnTliZgAXtdVBBoQYYcsfau/s1718/Screenshot%202025-02-26%20at%2017.26.46.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;732&quot; data-original-width=&quot;1718&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTIy6hCGEl9FPazaTESuK1DG6E6V8mFuetI6sz6VnVuQ0qMY7ofEWC2zOr5EcpMsKi8iL5Ieoqtvt5EXfTHJxoLS9IQNAonhg7vCD_9sOG8dzJaIcM1ZmSBESrVRPH7YTbcq0W4_SzpJVqOPt1X2Y3UEnPnOZC-KnTliZgAXtdVBBoQYYcsfau/s400/Screenshot%202025-02-26%20at%2017.26.46.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;Above all, I just want to be able to explore the data. I don’t want donut charts and dashboards. I want to be able to see the data and the connections. There is still much to be done, in particular I want to visualise sequence alignments. We can have a global map, and a global taxonomy, where is the global alignment?&lt;/p&gt;
&lt;p&gt;I hope to work on &lt;a href=&quot;https://bold-view-bf2dfe9b0db3.herokuapp.com/&quot;&gt;BOLD view&lt;/a&gt; further, but for now it is out the door and my spotlight will inevitably turn elsewhere.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/1333929251788849413'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/1333929251788849413'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2025/02/bold-view-exploring-dna-barcodes.html' title='BOLD View: exploring DNA barcodes'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj3GKg5St5S2-jThsVRtvUbKzPrnlA-PhwprUvGviv6FGCxtG01rirOJvyQs4yZ7xmGu5wcDZCD13NFDun5RepVuOH_xtQiazgkOdcP3-W2YeG6cb6naRXTiihv03ORaKM_juoipXRx748t7hUDd8YI2J7bhpOCQ52ZmTAl6SvGkoxxUj74RloB/s72-c/Screenshot%202025-02-26%20at%2017.21.30.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-6611271714880105357</id><published>2024-10-29T10:54:00.004+00:00</published><updated>2024-11-13T13:01:55.554+00:00</updated><title type='text'>Internet Archive as a single point of failure</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Internet Archive as a single point of failure &lt;a href=&quot;https://doi.org/10.59350/1r3m1-c5e22&quot;&gt;https://doi.org/10.59350/1r3m1-c5e22&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;p&gt;Just a placeholder to mark the ongoing impact of the Internet Archive being attacked (see &lt;a href=&quot;https://blog.archive.org/2024/10/18/internet-archive-services-update-2024-10-17/&quot;&gt;here&lt;/a&gt;, &lt;a href=&quot;https://blog.archive.org/2024/10/21/internet-archive-services-update-2024-10-21/&quot;&gt;here&lt;/a&gt; and &lt;a href=&quot;https://blog.archive.org/2024/10/28/internet-archive-services-update/&quot;&gt;here&lt;/a&gt; for details).&lt;/p&gt;
&lt;p&gt;The impact of this on the &lt;a href=&quot;https://www.biodiversitylibrary.org&quot;&gt;Biodiversity Heritage Library (BHL)&lt;/a&gt; has been huge, and reveals the extent to which BHL depends on the Archive. The Archive is:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;BHL’s long-term archival storage of book scans&lt;/li&gt;
&lt;li&gt;BHL’s processing pipeline for converting images to text&lt;/li&gt;
&lt;li&gt;BHL’s store for additional metadata (e.g., page numbers)&lt;/li&gt;
&lt;li&gt;BHL’s image server (i.e., all the images of scanned books on the BHL website are served from the Archive)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The attack on the Archive has crippled BHL (parts are slowly coming back). I think this is time for a fundamental rethink in how BHL manages its data, its processing pipeline, and how it serves images.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/6611271714880105357'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/6611271714880105357'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/10/internet-archive-as-single-point-of.html' title='Internet Archive as a single point of failure'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-2571875043654597956</id><published>2024-10-18T12:51:00.004+01:00</published><updated>2024-11-13T13:01:45.017+00:00</updated><title type='text'>Exploring BOLD&#39;s DNA barcode data releases: there&#39;s a fraction too much friction</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Exploring BOLD&#39;s DNA barcode data releases: there&#39;s a fraction too much friction &lt;a href=&quot;https://doi.org/10.59350/6qepn-ge510&quot;&gt;https://doi.org/10.59350/6qepn-ge510&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;p&gt;Recently I’ve been exploring data downloaded from &lt;a href=&quot;https://www.boldsystems.org/&quot;&gt;BOLD&lt;/a&gt;. Part of this was motivated by work done with David Schindel for a recent book:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Schindel, D.E., Page, R.M.P. (2024). Creating Virtuous Cycles for DNA Barcoding: A Case Study in Science Innovation, Entrepreneurship, and Diplomacy. In: DeSalle, R. (eds) DNA Barcoding. Methods in Molecular Biology, vol 2744. Humana, New York, NY. &lt;a href=&quot;https://doi.org/10.1007/978-1-0716-3581-0_1&quot;&gt;doi:10.1007/978-1-0716-3581-0_1&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;In this blog post I record some struggles I’ve had with the supposedly “&lt;a href=&quot;https://frictionlessdata.io&quot;&gt;Frictionless&lt;/a&gt;” data provided by BOLD. I list a series of issues, and make some recommendations as to how these can be fixed.&lt;/p&gt;
&lt;h2 id=&quot;previous-versions-disappear-from-site&quot;&gt;Previous versions disappear from site&lt;/h2&gt;
&lt;p&gt;The web page &lt;a href=&quot;https://www.boldsystems.org/index.php/datapackages&quot;&gt;Data Packages&lt;/a&gt; lists datasets that can be downloaded.&lt;/p&gt;
&lt;p&gt;The two most recent have the DOIs:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&quot;http://doi.org/10.5883/DP-Latest&quot;&gt;doi:10.5883/DP-Latest&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;http://doi.org/10.5883/DP-SecondLatest&quot;&gt;doi:10.5883/DP-SecondLatest&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;While this makes it easy to link to the latest version of the data, it inhibits reproducibility because the data doi:10.5883/DP-Latest points to can change, and there is (currently) no unique DOI for that particular dataset. Once a version becomes the third most recent, then it seems to get a version-specific DOI.&lt;/p&gt;
&lt;p&gt;The list of “Historical Data” releases is not exhaustive. Currently (2024-10-17) there are four older versions listed:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&quot;http://doi.org/10.5883/DP-BOLD_Public.27-Sep-2024&quot;&gt;http://doi.org/10.5883/DP-BOLD_Public.27-Sep-2024&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;http://doi.org/10.5883/DP-BOLD_Public.19-Jul-2024&quot;&gt;http://doi.org/10.5883/DP-BOLD_Public.19-Jul-2024&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;http://doi.org/10.5883/DP-BOLD_Public.29-Mar-2024&quot;&gt;http://doi.org/10.5883/DP-BOLD_Public.29-Mar-2024&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;http://doi.org/10.5883/DP-BOLD_Public.29-Dec-2023&quot;&gt;http://doi.org/10.5883/DP-BOLD_Public.29-Dec-2023&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;However, I have downloaded older versions that are no longer listed on the Data Packages web page. This makes it hard for anyone wanting to trace the history of changes in BOLD data.&lt;/p&gt;
&lt;h3 id=&quot;recommendation&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Have distinct DOIs for latest version (i.e., in addition to “10.5883/DP-Latest”), and keep list of all releases on the web site.&lt;/p&gt;
&lt;p&gt;Even better, switch to using &lt;a href=&quot;https://zenodo.org&quot;&gt;Zenodo&lt;/a&gt; to store data; they provide a nicer model of versioning, and can also provide download metrics.&lt;/p&gt;
&lt;h2 id=&quot;where-are-the-images&quot;&gt;Where are the images?&lt;/h2&gt;
&lt;p&gt;A major surprise is the lack of URLs for specimen images. The imagery in BOLD is very useful, yet not included in the data export! The only way to get a list of image URLs is to get the data from GBIF(!).&lt;/p&gt;
&lt;h3 id=&quot;recommendation-1&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Include image URLs in the data releases.&lt;/p&gt;
&lt;h2 id=&quot;column-names-change-over-time-and-aren’t-standardised&quot;&gt;Column names change over time and aren’t standardised&lt;/h2&gt;
&lt;p&gt;One major source of frustration is that the labels used for the columns of data can change. This is, pun intended, a major source of friction in supposedly “frictionless” data. Code that successfully parses one dataset may fail with the new release. One could argue that the code should rely solely on the information in the data package, but there are some columns (e.g., geographic coordinates, the raw sequences) that require special treatment, and the natural language descriptions of each column are not machine-readable.&lt;/p&gt;
&lt;p&gt;Below is a table of column names from a range of data packages from 2022-09-28 to 2024-09-06. Most column names are stable, but sometimes new ones appear, and sometimes they vanish. Core data elements, such as nucleotide sequences, can change, e.g. &lt;code&gt;nuc&lt;/code&gt; versus &lt;code&gt;nucraw&lt;/code&gt;.&lt;/p&gt;

&lt;table&gt;
&lt;tr&gt;&lt;th&gt; column name &lt;/th&gt;&lt;th&gt; 2022-09-28 &lt;/th&gt;&lt;th&gt; 2023-09-29 &lt;/th&gt;&lt;th&gt; 2023-10-27 &lt;/th&gt;&lt;th&gt; 2024-07-26 &lt;/th&gt;&lt;th&gt; 2024-08-02 &lt;/th&gt;&lt;th&gt; 2024-08-09 &lt;/th&gt;&lt;th&gt; 2024-09-06&lt;/th&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; associated_specimen &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; associated_specimens &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; associated_taxa &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; bin_created_date &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; bin_uri &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; biome &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; bold_recordset_code_arr &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; class &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_code &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_date &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_date_accuracy &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_date_end &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_date_start &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_event_id &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_note &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_notes &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collection_time &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; collectors &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; coord &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; coord_accuracy &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; coord_source &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; country &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; country/ocean &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; country_iso &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; depth &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; depth_accuracy &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; ecoregion &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; elev &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; elev_accuracy &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; extrainfo &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; family &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; fieldid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; funding_src &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; gb_acs &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; genus &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; geoid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; habitat &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; identification &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; identification_method &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; identification_rank &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; identified_by &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; identifier_email &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; insdc_acs &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; inst &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; kingdom &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; life_stage &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; marker_code &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; museumid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; notes &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; nuc &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; nuc_basecount &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; nucraw &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; order &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; phylum &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; primers_forward &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; primers_reverse &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; processid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; processid_minted_date &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; province &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; realm &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; record_id &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; recordset_code_arr &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; region &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; reproduction &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sampleid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sampling_protocol &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sector &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sequence_run_site &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sequence_upload_date &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sex &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; short_note &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; site &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; site_code &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; sovereign_inst &lt;/td&gt;&lt;td&gt;  &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; species &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; species_reference &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; specimen_linkout &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; specimenid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; subfamily &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; subspecies &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; taxid &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; taxon_name &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; taxon_rank &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; taxonomy_notes &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; tissue_type &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; tribe &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td&gt; &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td&gt; voucher_type &lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color:rgb(192,192,192);&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;

&lt;h3 id=&quot;recomendation&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Avoid changing the names of data columns between releases. Adopt standardised terms, such as &lt;a href=&quot;https://dwc.tdwg.org&quot;&gt;Darwin Core&lt;/a&gt;, wherever possible. Tell us that these are Darwin Core by using the &lt;code&gt;dwc:&lt;/code&gt; prefix.&lt;/p&gt;
&lt;h2 id=&quot;use-identifiers-for-people&quot;&gt;Use identifiers for people&lt;/h2&gt;
&lt;p&gt;People appear in several places in the data, notably as identifiers and collectors. The BOLD data uses simple text strings (i.e., names) of people, rather than external identifiers such as ORCIDs. This means we miss out on valuable information. For example, for a 2 million sequence subset of the latest release I was curious as to who identified the most specimens. For each of these names I then wanted to find out who they were. For example, are they taxonomists? If so, what is their expertise? What taxonomic papers have they published? Where are they based? If BOLD included ORCID ids it would be easier to answer these questions. Instead, I resorted to Google and produced the following table:&lt;/p&gt;
&lt;p&gt;Top 10 identifiers of BOLD specimens:&lt;/p&gt;

&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Name&lt;/th&gt;
&lt;th&gt;Affiliation&lt;/th&gt;
&lt;th&gt;ORCID&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Kate Perez&lt;/td&gt;
&lt;td&gt;U of Guelph&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0001-5233-1539&quot;&gt;0000-0001-5233-1539&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Angela Telfer&lt;/td&gt;
&lt;td&gt;U of Guelph&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0003-1846-6362&quot;&gt;0000-0003-1846-6362&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Daniel H. Janzen&lt;/td&gt;
&lt;td&gt;U of Pennsylvania&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0002-7335-5107&quot;&gt;0000-0002-7335-5107&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Valerie Levesque-Beaudin&lt;/td&gt;
&lt;td&gt;U of Guelph&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0002-6053-0949&quot;&gt;0000-0002-6053-0949&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Gergin A. Blagoev&lt;/td&gt;
&lt;td&gt;U of Guelph&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0003-1844-0779&quot;&gt;0000-0003-1844-0779&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Renee Miskie&lt;/td&gt;
&lt;td&gt;U of Guelph&lt;/td&gt;
&lt;td&gt;-&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;BOLD ID Engine&lt;/td&gt;
&lt;td&gt;-&lt;/td&gt;
&lt;td&gt;-&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Brandon MONG Guo jie&lt;/td&gt;
&lt;td&gt;Academia Sinica, Taipei&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0002-1673-8021&quot;&gt;0000-0002-1673-8021&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Paul D.N. Hebert&lt;/td&gt;
&lt;td&gt;U of Guelph&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0002-3081-6700&quot;&gt;0000-0002-3081-6700&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Brian Fisher&lt;/td&gt;
&lt;td&gt;California Academy of Sciences&lt;/td&gt;
&lt;td&gt;&lt;a href=&quot;https://orcid.org/0000-0002-4653-3270&quot;&gt;0000-0002-4653-3270&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;p&gt;Note that most of the top ten identifiers work at the University of Guelph, the home of BOLD. This tells us something about the degree to which BOLD is dependent on its own staff to identify specimens, versus the extent to which it has engaged the wider community.&lt;/p&gt;
&lt;p&gt;The flip side of this is that these people are curating an important database. Are they getting credit? Is this curation making its way into &lt;a href=&quot;https://bionomia.net&quot;&gt;Bionomia&lt;/a&gt;, which has mechanisms to give credit to this work?&lt;/p&gt;
&lt;p&gt;I have also briefly looked at collector names, and it is - as one might expect - something of a mess. The same person’s name is written different ways, text strings representing multiple people are incorrectly split into individual names, etc. The description in the data package is more wishful thinking than reality:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Comma separated list of full or abbreviated names of the individuals or teams responsible for collecting the sample in the field.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h3 id=&quot;recommendation-2&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Add ORCID identifiers for people who have identified specimens.&lt;/p&gt;
&lt;h2 id=&quot;method-of-identification-not-standardised&quot;&gt;Method of identification not standardised&lt;/h2&gt;
&lt;p&gt;The value of BOLD as a tool for identifying new sequences depends on the reliability of existing DNA barcodes. How are these identified? The field &lt;code&gt;identification_method&lt;/code&gt; is full of a mix of terms. There are all the obvious traps people fall into when not being careful with data. The same term may be spelt and/or capitalised differently. People add qualifiers to a term, such as the date of identification, making it much harder to ask simple questions such as how many sequences have been identified based on their morphology, versus based on their sequences.&lt;/p&gt;
&lt;p&gt;So far I’ve found 1889 different terms for identification method, here are the top 20:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;BIN Taxonomy Match&lt;/li&gt;
&lt;li&gt;BOLD ID Engine Manual&lt;/li&gt;
&lt;li&gt;BOLD Sequence Classifier&lt;/li&gt;
&lt;li&gt;Morphology&lt;/li&gt;
&lt;li&gt;morphology&lt;/li&gt;
&lt;li&gt;morphological&lt;/li&gt;
&lt;li&gt;BOLD ID Engine (March 2015)&lt;/li&gt;
&lt;li&gt;Tree based Identification(April 2016)&lt;/li&gt;
&lt;li&gt;Morphological&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (Mar 2023)&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (May 2019)&lt;/li&gt;
&lt;li&gt;Tree based Identification (Feb 2017)&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (May 2017)&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (Oct 2022)&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (Aug 2023)&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (Mar 2017)&lt;/li&gt;
&lt;li&gt;BOLD ID Engine&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (Apr 2017)&lt;/li&gt;
&lt;li&gt;BIN Taxonomy Match (Jun 2019)&lt;/li&gt;
&lt;li&gt;Tree Based Identification (April 2016)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Note that we have “Morphology”, “morphology”, “morphological”, and “Morphological”. How is “BOLD ID Engine Manual” different from “BOLD ID Engine”? Note the use of dates as qualifiers.&lt;/p&gt;
&lt;h3 id=&quot;recommendation-3&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Enforce a standardised vocabulary, add additional fields for date and notes on identification.&lt;/p&gt;
&lt;h2 id=&quot;voucher-type-not-standardised&quot;&gt;Voucher type not standardised&lt;/h2&gt;
&lt;p&gt;The description of the &lt;code&gt;voucher_type&lt;/code&gt; reads:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;“Status of the specimen in an accessioning process.This field uses a controlled vocabulary: ‘Museum Vouchered:Type’, ‘Museum Vouchered:Type Series’, ‘Vouchered:Registered Collection’, ‘To Be Vouchered:Holdup/Private’, ‘E-Vouchered:DNA/Tissue+Photo’, ‘Dna/Tissue Vouchered Only’, ‘No Specimen’.”&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;This is patently false. Instead of seven terms there are at least 508 for this field. Here are the top 20 terms (* indicates a term from the controlled vocabulary):&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Vouchered:Registered Collection*&lt;/li&gt;
&lt;li&gt;DNA/Tissue Vouchered Only*&lt;/li&gt;
&lt;li&gt;To Be Vouchered:Holdup/Private*&lt;/li&gt;
&lt;li&gt;museum voucher&lt;/li&gt;
&lt;li&gt;E-Vouchered:DNA/Tissue+Photo*&lt;/li&gt;
&lt;li&gt;Voucher Type: Morphological&lt;/li&gt;
&lt;li&gt;No Specimen*&lt;/li&gt;
&lt;li&gt;Museum voucher, whole specimen in ethanol&lt;/li&gt;
&lt;li&gt;Vouchered:Private Collection&lt;/li&gt;
&lt;li&gt;Museum Vouchered:Type*&lt;/li&gt;
&lt;li&gt;Museum Vouchered:Type Series*&lt;/li&gt;
&lt;li&gt;Museum voucher, Whole specimen in ethanol&lt;/li&gt;
&lt;li&gt;Museum voucher, whole specimen&lt;/li&gt;
&lt;li&gt;Museum Vouchered&lt;/li&gt;
&lt;li&gt;Museum voucher, e-voucher&lt;/li&gt;
&lt;li&gt;Museum voucher, Whole specimen&lt;/li&gt;
&lt;li&gt;Museum voucher, E-vouchered with additional representatives stored in ethanol in parent lot&lt;/li&gt;
&lt;li&gt;vouchered: not registered collection&lt;/li&gt;
&lt;li&gt;Museum voucher, E-vouchered with additional representatives stored in ethanol&lt;/li&gt;
&lt;li&gt;in alcohol (ethanol, 96%)&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&quot;recommendation-4&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Enforce the existing controlled vocabulary.&lt;/p&gt;
&lt;h2 id=&quot;institutions-lack-identifiers&quot;&gt;Institutions lack identifiers&lt;/h2&gt;
&lt;p&gt;Institutions are listed by name. As with any string, there is the potential for different spellings and formatting. Anyone interested in getting metrics for institutional engagement with BOLD, and comparing that to, say, sources of funding, would much rather have identifiers than strings.&lt;/p&gt;
&lt;p&gt;Here are the top 20 institutions:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Centre for Biodiversity Genomics&lt;/li&gt;
&lt;li&gt;University of Pennsylvania&lt;/li&gt;
&lt;li&gt;Area de Conservacion Guanacaste&lt;/li&gt;
&lt;li&gt;Mined from GenBank, NCBI&lt;/li&gt;
&lt;li&gt;Canadian National Collection of Insects, Arachnids and Nematodes&lt;/li&gt;
&lt;li&gt;SNSB, Zoologische Staatssammlung Muenchen&lt;/li&gt;
&lt;li&gt;Australian National Insect Collection&lt;/li&gt;
&lt;li&gt;University of Malaya, Museum of Zoology&lt;/li&gt;
&lt;li&gt;Instituto Nacional de Biodiversidad, Costa Rica&lt;/li&gt;
&lt;li&gt;Royal Ontario Museum&lt;/li&gt;
&lt;li&gt;California Academy of Sciences&lt;/li&gt;
&lt;li&gt;NEON Biorepository at Arizona State University&lt;/li&gt;
&lt;li&gt;Research Collection of M. Alex Smith&lt;/li&gt;
&lt;li&gt;Smithsonian Tropical Research Institute&lt;/li&gt;
&lt;li&gt;University of New Brunswick, Fredericton&lt;/li&gt;
&lt;li&gt;York University, Packer Collection&lt;/li&gt;
&lt;li&gt;Wellcome Sanger Institute&lt;/li&gt;
&lt;li&gt;Smithsonian Institution, National Museum of Natural History&lt;/li&gt;
&lt;li&gt;Natural History Museum, London&lt;/li&gt;
&lt;li&gt;University of Oulu, Zoological Museum&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&quot;recommendation-5&quot;&gt;Recommendation&lt;/h3&gt;
&lt;p&gt;Add external identifiers for institutions such as &lt;a href=&quot;https://ror.org&quot;&gt;RORs&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
&lt;p&gt;Some of the issues raised here are easy to fix, others will require a lot of curation. I suspect that part of the problem is that there’s no evidence that BOLD itself makes use of these data dumps. If you view data exports as something you are “supposed to” do, rather than something that you yourself use, then there’s no incentive to make sure the data is fit for purpose. &lt;a href=&quot;https://en.wikipedia.org/wiki/Eating_your_own_dog_food&quot;&gt;Eating your own dog food&lt;/a&gt; is a great way to avoid these problems.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2571875043654597956'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2571875043654597956'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/10/exploring-bolds-dna-barcode-data.html' title='Exploring BOLD&#39;s DNA barcode data releases: there&#39;s a fraction too much friction'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-7542793921221421602</id><published>2024-10-08T16:35:00.003+01:00</published><updated>2024-11-13T13:02:51.163+00:00</updated><title type='text'>The Data Citation Corpus revisited</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). The Data Citation Corpus revisited &lt;a href=&quot;https://doi.org/10.59350/wvwva-v7125&quot;&gt;https://doi.org/10.59350/wvwva-v7125&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;p&gt;TL;DR&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;The Data Citation Corpus is still riddled with errors, and it is unclear to what extent it measures citation (reuse of data) versus publication (are most citations between the data and the original publication)?&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;These are some brief notes on the latest version (v. 2) of the Data Citation Corpus, released shortly before the &lt;a href=&quot;https://makedatacount.org/read-our-blog/evaluation-context-and-getting-practical-on-data-metrics-the-make-data-count-summit-2024/&quot;&gt;Make Data Count Summit 2024&lt;/a&gt;, which also included a &lt;a href=&quot;https://makedatacount.org/read-our-blog/the-data-citation-corpus-in-practice-perspectives-from-a-funder-and-an-institution/&quot;&gt;discussion on the practical uses of the corpus&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;I downloaded version 2 from Zenodo &lt;a href=&quot;https://doi.org/10.5281/zenodo.13376773&quot;&gt;doi:10.5281/zenodo.13376773&lt;/a&gt;.  The data is in JSON format, which I then loaded into CouchDB to play with. Loading the data was relatively quick using CouchDB’s “bulk upload” feature, although building indexes to explore the data takes a little while.&lt;/p&gt;
&lt;p&gt;What follows is a series of charts constructed using &lt;a href=&quot;https://vega.github.io/vega-lite/&quot;&gt;Vega-Lite&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&quot;the-top-20-repositories&quot;&gt;The top 20 repositories&lt;/h3&gt;
&lt;p&gt;&lt;img src=&quot;https://raw.githubusercontent.com/rdmpage/data-citation-corpus/refs/heads/main/charts/repositories.png&quot; alt=&quot;Top 20 repositories in Data Citation Corpus&quot;&gt;&lt;/p&gt;
&lt;p&gt;The chart above shows the top 20 repositories by number of citations. The biggest repository by some distance is the &lt;a href=&quot;https://www.ebi.ac.uk/ena/browser/home&quot;&gt;European Nucleotide Archive&lt;/a&gt;, so most of the data being cited are DNA sequences. Those working on biodiversity might be pleased to see that &lt;a href=&quot;https://www.gbif.org&quot;&gt;GBIF&lt;/a&gt; is 16th out of 20.&lt;/p&gt;
&lt;p&gt;Note that &lt;a href=&quot;https://figshare.com&quot;&gt;Figshare&lt;/a&gt; appears twice, as “Figshare” and “figshare”, so there are problems with data cleaning. It’s actually worse than this, because the repository “Taylor &amp;amp; Francis” is a branded instance of Figshare: &lt;a href=&quot;https://tandf.figshare.com&quot;&gt;https://tandf.figshare.com&lt;/a&gt;. So if we want to measure the impact of the Figshare repository we will need to cluster the different spellings, as well as check the DOIs for each repository (T&amp;amp;F data DOIs still carry Figshare branding, e.g., &lt;a href=&quot;https://doi.org/10.6084/m9.figshare.14103264&quot;&gt;doi:10.6084/m9.figshare.14103264&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;The vast majority of data in the “Taylor &amp;amp; Francis” repository is published by &lt;a href=&quot;https://en.wikipedia.org/wiki/Informa&quot;&gt;Informa UK Ltd&lt;/a&gt; which is the parent company of Taylor &amp;amp; Francis. If you visit an article in an Informa journal, such as &lt;a href=&quot;https://doi.org/10.1080/00032719.2021.1888966&quot;&gt;Enhanced Selectivity of Ultraviolet-Visible Absorption Spectroscopy with Trilinear Decomposition on Spectral pH Measurements for the Interference-Free Determination of Rutin and Isorhamnetin in Chinese Herbal Medicine&lt;/a&gt; the supplementary information is stored in Figshare. These links between the publication and the supplementary information are treated as “citations” in the corpus. Is this what we mean by citation? If so, we are not measuring the use of data, but rather its publication.&lt;/p&gt;
&lt;h3 id=&quot;top-20-publishers&quot;&gt;Top 20 publishers&lt;/h3&gt;
&lt;p&gt;&lt;img src=&quot;https://raw.githubusercontent.com/rdmpage/data-citation-corpus/refs/heads/main/charts/publishers.png&quot; alt=&quot;Top 20 publishers in Data Citation Corpus&quot;&gt;&lt;/p&gt;
&lt;p&gt;The top 20 publishers of articles that cite data in the corpus are shown above. It would be interesting to know how much this reflects publication policies of these publishers (e.g., open versus closed access, indexing in Pubmed, availability of XML, etc.) versus actual citation of data. Biodiversity people might be pleased to see &lt;a href=&quot;https://pensoft.net&quot;&gt;Pensoft&lt;/a&gt; appearing in the top 20.&lt;/p&gt;
&lt;h3 id=&quot;gbif&quot;&gt;GBIF&lt;/h3&gt;
&lt;p&gt;You can also explore the data by individual repository. For example, the top 20 publishers of articles citing data in GBIF shows Pensoft at number one. This reflects the subject matter of Pensoft journals, and Pensoft’s focus on best practices for publishing data. From GBIF’s perspective, perhaps that organisation would want to extend their reach beyond core biodiversity journals.&lt;/p&gt;
&lt;p&gt;&lt;img src=&quot;https://raw.githubusercontent.com/rdmpage/data-citation-corpus/refs/heads/main/charts/gbif-publishers.png&quot; alt=&quot;Top 20 publishers citing data from GBIF&quot;&gt;&lt;/p&gt;
&lt;h3 id=&quot;citation&quot;&gt;Citation&lt;/h3&gt;
&lt;p&gt;The vast majority of data in the Data Citation Corpus is cited only once.&lt;/p&gt;
&lt;p&gt;&lt;img src=&quot;https://raw.githubusercontent.com/rdmpage/data-citation-corpus/refs/heads/main/charts/cited.png&quot; alt=&quot;Frequency of citation&quot;&gt;&lt;/p&gt;
&lt;p&gt;Given that many of these “citations” may be by the publication that makes the data available, it’s not clear to me that the corpus is actually measuring citation (i.e., reuse of the data). Instead it may just be measuring publication (e.g., the link between a paper and its supplementary data). To answer this we’d need to drill down into the data more.&lt;/p&gt;
&lt;p&gt;Note that some data items have large numbers of citations, the highest is “LY294002” with 9983 citations, with the next being “A549” with 5883 citations. &lt;a href=&quot;https://en.wikipedia.org/wiki/LY294002&quot;&gt;LY294002&lt;/a&gt; is a chemical compound that acts as an inhibitor, and &lt;a href=&quot;https://en.wikipedia.org/wiki/A549_cell&quot;&gt;A549&lt;/a&gt; is a cell type. The citation corpus regards both as accession numbers for sequences(!). Hence it’s likely that the most cited data records are not data at all, but false matches to other entities, such as chemicals and cells. These false matches are still a major problem for the corpus.&lt;/p&gt;
&lt;h3 id=&quot;summary&quot;&gt;Summary&lt;/h3&gt;
&lt;p&gt;I think there is so much potential here, but there are significant data quality issues. Anyone basing metrics upon this corpus would need to proceed very carefully.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7542793921221421602'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7542793921221421602'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/10/the-data-citation-corpus-revisited.html' title='The Data Citation Corpus revisited'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-3473989744329234796</id><published>2024-08-13T09:14:00.005+01:00</published><updated>2024-11-13T13:04:21.357+00:00</updated><title type='text'>Why do museum and gallery displays ignore the web?</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Why do museum and gallery displays ignore the web? &lt;a href=&quot;https://doi.org/10.59350/a83tn-c6t14&quot;&gt;https://doi.org/10.59350/a83tn-c6t14&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhcQEc1wJ_Gq2zUJDN5sJ17yZ7jVUFxnY0_zhYefYobQbJyuWgcE3x8z92HSRARz8POClM6hJ7rYPzMoBQ5EeSdDmVBSfWVy8g_C9kDhZ1LeUlImIw8Jz0A89cHuFRk5CCxAtyR1CJYHEU-5f4lBlPi_XYhnitrd_ojGGn0cJ6GMFclVFXEwz4L/s500/ngv.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; clear: right; float: right;&quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;128&quot; data-original-height=&quot;337&quot; data-original-width=&quot;500&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhcQEc1wJ_Gq2zUJDN5sJ17yZ7jVUFxnY0_zhYefYobQbJyuWgcE3x8z92HSRARz8POClM6hJ7rYPzMoBQ5EeSdDmVBSfWVy8g_C9kDhZ1LeUlImIw8Jz0A89cHuFRk5CCxAtyR1CJYHEU-5f4lBlPi_XYhnitrd_ojGGn0cJ6GMFclVFXEwz4L/s200/ngv.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;This post is inspired by the &lt;a href=&quot;https://www.ngv.vic.gov.au/exhibition/pharaoh&quot;&gt;Pharaoh exhibition&lt;/a&gt; at the NGV in Melbourne, Australia. This is a beautifully displayed exhibition of objects from the &lt;a href=&quot;https://www.britishmuseum.org/&quot;&gt;British Museum&lt;/a&gt;, London. It has all the trappings of a modern exhibition, beautiful lighting, a custom sound track, and lots of social media coverage. But I found it immensely frustrating to visit.&lt;/p&gt;
&lt;p&gt;The reason for my frustration is the missed opportunity to provide visitors with the means to learn more from each object than a few cursory sentences on a display card. Take, for example, the “Lintel of King Amenemhat III”, for which we learn:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;This lintel was originally placed in a temple erected by Amenemhat III. The carving reflects the harmonious symmetry followed inside an Egyptian temple. At the centre is a cartouche enclosing the king’s birth name. This is surrounded by inscriptions that radiate from the centre to the sides of the lintel. Names of the king face references to Sobek, the god of the temple, who is depicted as a crocodile seated on a shrine.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh19cgCrffwBRrFs7yhRIRv_MVaTR2Y1dewDInlte6uBKNIw1aE_el207wEZmtzRE4rmunZrvgE0xUN0TpEpUy38k156chlakK_SKG3D1ORt7OhJz8eYzycZDcNUAFvOVQ8ot42n43eGBHugMZvwqiZgvjkP2hX0X5G7CFBSKouR-biAcM6jU3y/s5712/IMG_6088.HEIC&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;5712&quot; data-original-width=&quot;4284&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh19cgCrffwBRrFs7yhRIRv_MVaTR2Y1dewDInlte6uBKNIw1aE_el207wEZmtzRE4rmunZrvgE0xUN0TpEpUy38k156chlakK_SKG3D1ORt7OhJz8eYzycZDcNUAFvOVQ8ot42n43eGBHugMZvwqiZgvjkP2hX0X5G7CFBSKouR-biAcM6jU3y/s320/IMG_6088.HEIC&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEggTg2U3j7VzhTwrS5NP5aNijFnqKfYX25u-9ZCB41ASPk3p5p1xSo95dq-FAm28vVfBOL2VSPGZszsYjjSKMWmSH-XrYM5d9k9ZAv7-FgggO0EKoY2lkUyC5SvY0ro5-cIz8HIg2IxbV7ySCnSlsuuavzywx_Uaa5dtJ5pmZDIsVUABC4msBXr/s5712/IMG_6087.HEIC&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;320&quot; data-original-height=&quot;4284&quot; data-original-width=&quot;5712&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEggTg2U3j7VzhTwrS5NP5aNijFnqKfYX25u-9ZCB41ASPk3p5p1xSo95dq-FAm28vVfBOL2VSPGZszsYjjSKMWmSH-XrYM5d9k9ZAv7-FgggO0EKoY2lkUyC5SvY0ro5-cIz8HIg2IxbV7ySCnSlsuuavzywx_Uaa5dtJ5pmZDIsVUABC4msBXr/s320/IMG_6087.HEIC&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;That is all that we are told. Yet on this display card is a cryptic code “EA1072”, which to most visitors is likely to be no less obscure than the hieroglyphs on the object itself. EA1072 is the number of this object in the British Museum collection. Each code can be converted into a URL by appending it to &lt;code&gt;https://www.britishmuseum.org/collection/object/Y_&lt;/code&gt;, i.e., &lt;a href=&quot;https://www.britishmuseum.org/collection/object/Y_EA1072&quot;&gt;https://www.britishmuseum.org/collection/object/Y_EA1072&lt;/a&gt;. If we click on that URL we get a wealth of additional information, including a more detailed description, a bibliography, even an explanatory &lt;a href=&quot;https://youtu.be/LwZB0MsXCjQ&quot;&gt;YouTube video&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;So, for each object it would have been trivial for the NGV to include a QR code that would take the visitor to the British Museum’s web site to discover more information about that object, to put the object in context, and learn more about both the object and those who discovered and interpreted it. I’m guessing that most, if not all visitors, had a mobile phone that could read the code and access the internet.&lt;/p&gt;
&lt;p&gt;If taking the visitor to the British Museum rather than the NGV’s web site is a problem, why not do the smart thing and reuse the BM’s codes as “slugs” on the end of an NGV URL (much as the BBC did with Wikipedia, see &lt;a href=&quot;https://iphylo.blogspot.com/2010/02/eol-bbc-and-wikipedia.html&quot;&gt;EOL, the BBC, and Wikipedia&lt;/a&gt;)? Even better, get the underlying data (does the BM have an API, or a machine-readable version of their web pages) and provide the same information in multiple languages. Melbourne is a modern multicultural city, here was a chance to engage with visitors in languages other than English.&lt;/p&gt;
&lt;p&gt;This exhibition seems like an ideal case for the use of persistent identifiers for museums and other collections, something projects such as &lt;a href=&quot;https://tanc-ahrc.github.io/HeritagePIDs/&quot;&gt;Towards a National Collection - HeritagePIDs&lt;/a&gt; were working towards (see also &lt;a href=&quot;https://iphylo.blogspot.com/2020/07/persistent-identifiers-demo-and-rant.html&quot;&gt;Persistent Identifiers: A demo and a rant&lt;/a&gt;). If we have persistent identifiers, especially if they resolve to machine-readable data, it becomes easy to convert static text into entry points to a much larger digital world of knowledge. Instead we seem happy to give simple snippets of information in one language, and hope the viewer’s interest hasn’t faded away by the time they exit via the gift shop.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/3473989744329234796'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/3473989744329234796'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/08/why-do-museuym-and-gallery-displays.html' title='Why do museum and gallery displays ignore the web?'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhcQEc1wJ_Gq2zUJDN5sJ17yZ7jVUFxnY0_zhYefYobQbJyuWgcE3x8z92HSRARz8POClM6hJ7rYPzMoBQ5EeSdDmVBSfWVy8g_C9kDhZ1LeUlImIw8Jz0A89cHuFRk5CCxAtyR1CJYHEU-5f4lBlPi_XYhnitrd_ojGGn0cJ6GMFclVFXEwz4L/s72-c/ngv.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-3247607667651377254</id><published>2024-07-02T11:30:00.004+01:00</published><updated>2024-11-13T13:04:52.843+00:00</updated><title type='text'>A future for the Biodiversity Heritage Library</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). A future for the Biodiversity Heritage Library &lt;a href=&quot;https://doi.org/10.59350/n3dkt-6xd05&quot;&gt;https://doi.org/10.59350/n3dkt-6xd05&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;p&gt;Following the &lt;a href=&quot;https://blog.biodiversitylibrary.org/2024/06/2024-bhl-annual-meeting-securing-our-future-while-celebrating-our-past.html&quot;&gt;2024 BHL meeting&lt;/a&gt;, and the &lt;a href=&quot;https://blog.biodiversitylibrary.org/2024/06/farewell-from-bhl-program-director-martin-r-kalfatovic.html&quot;&gt;departure of Martin Kalfatovic&lt;/a&gt; and the uncertainty the departure of such a pivotal person brings, perhaps it’s time to think about the future of BHL. Below I sketch some thoughts, which are hazy at best. I should say at the outset that I think BHL is an extraordinary project. My goal is to think about ways to enhance its utility and impact.&lt;/p&gt;
&lt;h2 id=&quot;three-facets&quot;&gt;Three facets&lt;/h2&gt;
&lt;p&gt;I think BHL, in common with other projects such as GBIF, has three main facets: providers, users, and developers. These communities have different needs, and what works for one community need not work for the others.&lt;/p&gt;
&lt;h3 id=&quot;providers&quot;&gt;Providers&lt;/h3&gt;
&lt;p&gt;Any project that mobilises data depends on people and organisations that have that data being willing to share it. That community needs a rationale for sharing, tools to share, and a means to demonstrate the value of sharing. The few BHL meetings I’ve been to have been dominated by libraries (it is a library project, after all). BHL meetings typically feature a tour of physical libraries where we gaze at ancient books, many of which are now accessible via the BHL website. There is value in being a member of a club that shares similar goals (making biodiversity literature accessible to a wider audience). From my perspective, a lot of BHL effort and infrastructure is focussed on libraries and library-related tasks. This is natural given its origins, but this means other aspects have been neglected.&lt;/p&gt;
&lt;h3 id=&quot;users-readers-and-more&quot;&gt;Users (readers and more)&lt;/h3&gt;
&lt;p&gt;BHL users are likely diverse, and range from people like me who want the “hard core” technical literature (e.g., species descriptions) to people who revel in the wealth of imagery available in BHL (AKA “the pretty”) (see the &lt;a href=&quot;https://www.flickr.com/photos/61021753@N02/&quot;&gt;BHL Flickr pages&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;The current BHL portal provides a way for people to browse the scanned content, but feels designed primarily for librarians. It is organised by title and scanned volumes, hence it is driven by bibliographic metadata. For a long time, it didn’t support the notion of an “article”, which is why I ended up building &lt;a href=&quot;https://biostor.org&quot;&gt;BioStor&lt;/a&gt; to extract and display individual articles (the unit most academics work with). BHL is now actively adding articles and minting DOIs for articles, which helps embed its content in the wider scholarly landscape. To date these new DOIs have been cited &lt;a href=&quot;https://github.com/rdmpage/bhl-piwg-stats&quot;&gt;56,000 times&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;But the current BHL interface is not ideal for viewing articles. We need something simpler and cleaner, and more like the experience offered by modern journal websites.&lt;/p&gt;
&lt;h3 id=&quot;developers-and-data-wranglers&quot;&gt;Developers and data wranglers&lt;/h3&gt;
&lt;p&gt;I’m lumping developers and data wranglers together: even though these people may have different goals, they share the desire to get past the web interface to the underlying data. BHL has some great APIs that I and others make extensive use of. But this is different from providing a clean interface to the data. BHL has a wealth of information linked to taxonomic names, people, places, and more. Taxonomic indexing by &lt;a href=&quot;http://globalnames.org&quot;&gt;Global Names&lt;/a&gt; has made BHL content much more findable, but there is huge scope for indexing on other features. For example, BioStor extracts latitude and longitude pairs from BHL text. These are shown on the map below, indicating the scope for geographic search in BHL.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTTT2V2QTlGpF4BZb-VBRCTsY87E9r6zdjToTjeyZHkcDVKB56zDA5aKEw2VQ5_LD2w97HvkRCBlhTHVEPVV0EQop0LjgSYs3rauO9FoZrIP0cnDJECSVhbniEoNAyHCFPqxLdpKyNpPdxy0LC3H8MZo9R8iITtdBM1tQGNdkYbg4mV5_lZLKz/s1954/map.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;722&quot; data-original-width=&quot;1954&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTTT2V2QTlGpF4BZb-VBRCTsY87E9r6zdjToTjeyZHkcDVKB56zDA5aKEw2VQ5_LD2w97HvkRCBlhTHVEPVV0EQop0LjgSYs3rauO9FoZrIP0cnDJECSVhbniEoNAyHCFPqxLdpKyNpPdxy0LC3H8MZo9R8iITtdBM1tQGNdkYbg4mV5_lZLKz/s400/map.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;h2 id=&quot;whats-next&quot;&gt;What’s next?&lt;/h2&gt;
&lt;p&gt;I think there’s a case to be made to provide three separate interfaces to BHL.&lt;/p&gt;
&lt;p&gt;The first would be for the providers (e.g., libraries), which includes all the behind the scenes infrastructure to do with cataloging, etc., and would also include the current portal. The existing BHL interface is important both to show the complete corpus, and also as a place for serendipitous discovery.&lt;/p&gt;
&lt;p&gt;The second interface would be for readers. The obvious candidate here is &lt;a href=&quot;https://pkp.sfu.ca/software/ojs/&quot;&gt;Open Journal Systems (OJS)&lt;/a&gt; which powers many journal sites, including &lt;em&gt;Zootaxa&lt;/em&gt;, by far the largest taxonomic journal. Indeed I would argue that BHL should adopt OJS and offer it as a service to existing biodiversity journals that may be struggling to manage their existing publishing. Taxonomic publishing has a very long tail of small journals, as the figure below shows (taken from &lt;a href=&quot;https://doi.org/10.1098/rstb.2015.0334&quot;&gt;DNA barcoding and taxonomy: dark taxa and dark texts&lt;/a&gt;).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjzBlZV1UrygjoERDQKVH5hF9cwK08cveH119ZjVySC55WOXFnfxO2R-TOrCC-XCgzdIJXxNfh8B__QjzKapXkohls5F2q1QNPnR9ubTpocEZlRAZfbRyYyF9MI2hfO7ZGupgV7yLPMEFC4oIgIeJ_QtfXnirL8RYUc19T6gt3anxTwKiohLZRc/s1280/rstb20150334f03.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1055&quot; data-original-width=&quot;1280&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjzBlZV1UrygjoERDQKVH5hF9cwK08cveH119ZjVySC55WOXFnfxO2R-TOrCC-XCgzdIJXxNfh8B__QjzKapXkohls5F2q1QNPnR9ubTpocEZlRAZfbRyYyF9MI2hfO7ZGupgV7yLPMEFC4oIgIeJ_QtfXnirL8RYUc19T6gt3anxTwKiohLZRc/s400/rstb20150334f03.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This long tail is often hosted on all manner of custom web sites including WordPress blogs, none of which are ideal. There is an opportunity here for BHL to offer hosting as, for example, an affordable service, using the same OJS infrastructure it would use to display BHL articles.&lt;/p&gt;
&lt;p&gt;The final interface would be a data portal. The goal here is to enable people to retrieve data in ways that they find useful, for example by taxon, geographic location, etc. In an ideal world this might be a knowledge graph, but the gap between what knowledge graphs promise and what they deliver is still significant. As a first pass, probably the way forward is to define a series of simple data objects in JSON, load these into Elasticsearch and provide an API on top. This is essentially what GBIF does, where the data is in Darwin Core and the queries are searches over that data. This same infrastructure could also power searches over the articles in OJS, so that users could easily find the content they want.&lt;/p&gt;
&lt;p&gt;This is all pretty arm-wavy at this point, but I think BHL needs to be more outwards facing than it currently is, and needs to think how best to serve the biodiversity community (many of whom are already huge fans of BHL), as well as think of ways to enhance its long term sustainability.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/3247607667651377254'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/3247607667651377254'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/07/a-future-for-biodiversity-heritage.html' title='A future for the Biodiversity Heritage Library'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTTT2V2QTlGpF4BZb-VBRCTsY87E9r6zdjToTjeyZHkcDVKB56zDA5aKEw2VQ5_LD2w97HvkRCBlhTHVEPVV0EQop0LjgSYs3rauO9FoZrIP0cnDJECSVhbniEoNAyHCFPqxLdpKyNpPdxy0LC3H8MZo9R8iITtdBM1tQGNdkYbg4mV5_lZLKz/s72-c/map.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-6924627449352415649</id><published>2024-06-19T11:15:00.005+01:00</published><updated>2024-11-13T13:07:26.791+00:00</updated><title type='text'>Visualising big trees: a talk at the Systematics Association 2024</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Visualising big trees: a talk at the Systematics Association 2024 &lt;a href=&quot;https://doi.org/10.59350/cf6n4-ch767&quot;&gt;https://doi.org/10.59350/cf6n4-ch767&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjFCWQ3VQTgHkksEesFaQAXUz8C7cotaO9xTo2gl0-F-WcbxV9hd2C-qOWlA1jfCdNXo4mplw9vQ-ikKIvu655oZrmmfFOzCaPWToi1rurRw0VM80HwjWKRmrt1Y4fx1oWoDNrM74cv420EaplsGxVKGwBJrC5A5YxI4QUTbEkmfSel-LwcKWJ7/s1384/Screenshot%202024-06-19%20at%2011.22.38.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; clear: right; float: right;&quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;128&quot; data-original-height=&quot;1298&quot; data-original-width=&quot;1384&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjFCWQ3VQTgHkksEesFaQAXUz8C7cotaO9xTo2gl0-F-WcbxV9hd2C-qOWlA1jfCdNXo4mplw9vQ-ikKIvu655oZrmmfFOzCaPWToi1rurRw0VM80HwjWKRmrt1Y4fx1oWoDNrM74cv420EaplsGxVKGwBJrC5A5YxI4QUTbEkmfSel-LwcKWJ7/s320/Screenshot%202024-06-19%20at%2011.22.38.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;This blog post has some notes in support of a talk given to the Systematics Association meeting in Reading on June 20th, 2024.&lt;/p&gt;
&lt;h2&gt;Slides&lt;/h2&gt;
&lt;p&gt;&lt;strike&gt;I will post a link to the slides here once I have given the talk.&lt;/strike&gt;&lt;/p&gt;
&lt;p&gt;Page, Roderic (2024). Visualising big trees. figshare. Presentation. &lt;a href=&quot;https://doi.org/10.6084/m9.figshare.26068693.v1&quot;&gt;https://doi.org/10.6084/m9.figshare.26068693.v1&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&quot;example-web-sites&quot;&gt;Example web sites&lt;/h2&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&quot;https://www.legumedata.org/phylogeny/explore/&quot;&gt;GBIF Phylogeny Explorer&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://treevis.net&quot;&gt;TreeVis&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://lifemap-ncbi.univ-lyon1.fr&quot;&gt;LifeMap&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://treeoflife.kew.org/tree-of-life&quot;&gt;Kew Tree of Life Explorer&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://taxonium.org/&quot;&gt;Taxonium&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://web.archive.org/web/20220408194615/https://www.uniprot.org/taxonomy/314295&quot;&gt;old Uniprot tree&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://www.uniprot.org/taxonomy/314295&quot;&gt;new Uniprot&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Tree&amp;amp;id=92557&amp;amp;lvl=3&quot;&gt;NCBI taxonomy&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&quot;demos&quot;&gt;Demos&lt;/h2&gt;
&lt;h3&gt;Kew phylogeny&lt;/h3&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;iframe allowfullscreen=&#39;allowfullscreen&#39; webkitallowfullscreen=&#39;webkitallowfullscreen&#39; mozallowfullscreen=&#39;mozallowfullscreen&#39; width=&#39;400&#39; height=&#39;322&#39; src=&#39;https://www.blogger.com/video.g?token=AD6v5dxrMX5-SIqFM0U1B84eLyTzvFRffKEoc4B5LxsUBwUDFeXN3zmOEr3DHmJPr9RXcnk0D0Kdmz19fhc&#39; class=&#39;b-hbp-video b-uploaded&#39; frameborder=&#39;0&#39;&gt;&lt;/iframe&gt;&lt;/div&gt;
&lt;h3&gt;NCBI&lt;/h3&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;iframe allowfullscreen=&#39;allowfullscreen&#39; webkitallowfullscreen=&#39;webkitallowfullscreen&#39; mozallowfullscreen=&#39;mozallowfullscreen&#39; width=&#39;400&#39; height=&#39;322&#39; src=&#39;https://www.blogger.com/video.g?token=AD6v5dzUL2YIDqMldqbrBKGYf_93EQZxgHkwl3OL96rWkLZ3lJ_GLnWMscMLnruWG2K1SuZskE8ZMWLIQeo&#39; class=&#39;b-hbp-video b-uploaded&#39; frameborder=&#39;0&#39;&gt;&lt;/iframe&gt;&lt;/div&gt;
&lt;h3&gt;Catalogue of Life&lt;/h3&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;iframe allowfullscreen=&#39;allowfullscreen&#39; webkitallowfullscreen=&#39;webkitallowfullscreen&#39; mozallowfullscreen=&#39;mozallowfullscreen&#39; width=&#39;400&#39; height=&#39;322&#39; src=&#39;https://www.blogger.com/video.g?token=AD6v5dwKbDj4Jg7jtgPqrlkSJ8TlC_QaoAd41bMd2JjwwmoXG5T4b_zLFVdb-FxgfmY7mkL6BiGjRXLYu04&#39; class=&#39;b-hbp-video b-uploaded&#39; frameborder=&#39;0&#39;&gt;&lt;/iframe&gt;&lt;/div&gt;
&lt;h2 id=&quot;background-reading&quot;&gt;Background reading&lt;/h2&gt;
&lt;ul&gt;
&lt;li&gt;Page, R. (2012) Space, time, form: viewing the Tree of Life &lt;a href=&quot;https://doi.org/10.1016/j.tree.2011.12.002&quot;&gt;https://doi.org/10.1016/j.tree.2011.12.002&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Page, R. (2023). The problem with GBIF’s Phylogeny Explorer. &lt;a href=&quot;https://doi.org/10.59350/v0bt3-zp114&quot;&gt;https://doi.org/10.59350/v0bt3-zp114&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;PhyloGeo Tool: interactively exploring large phylogenies in an epidemiological context &lt;a href=&quot;https://doi.org/10.1093/bioinformatics/btx535&quot;&gt;https://doi.org/10.1093/bioinformatics/btx535&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;An Adaptive Resolution Tree Visualization of Large Influenza Virus Sequence Datasets &lt;a href=&quot;https://doi.org/10.1007/978-3-540-72031-7_18&quot;&gt;https://doi.org/10.1007/978-3-540-72031-7_18&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Constructing Overview + Detail Dendrogram-Matrix Views &lt;a href=&quot;https://doi.org/10.1109/TVCG.2009.130&quot;&gt;https://doi.org/10.1109/TVCG.2009.130&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Expand-Ahead: A Space-Filling Strategy for Browsing Trees &lt;a href=&quot;https://doi.org/10.1109/INFOVIS.2004.21&quot;&gt;https://doi.org/10.1109/INFOVIS.2004.21&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Maximum Entropy Summary Trees &lt;a href=&quot;https://doi.org/10.1111/cgf.12094&quot;&gt;https://doi.org/10.1111/cgf.12094&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/6924627449352415649'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/6924627449352415649'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/06/visualising-big-trees-talk-at.html' title='Visualising big trees: a talk at the Systematics Association 2024'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjFCWQ3VQTgHkksEesFaQAXUz8C7cotaO9xTo2gl0-F-WcbxV9hd2C-qOWlA1jfCdNXo4mplw9vQ-ikKIvu655oZrmmfFOzCaPWToi1rurRw0VM80HwjWKRmrt1Y4fx1oWoDNrM74cv420EaplsGxVKGwBJrC5A5YxI4QUTbEkmfSel-LwcKWJ7/s72-c/Screenshot%202024-06-19%20at%2011.22.38.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-7343967609515363669</id><published>2024-06-18T18:49:00.008+01:00</published><updated>2024-11-13T13:06:31.347+00:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="FAIR"/><category scheme="http://www.blogger.com/atom/ns#" term="identifiers"/><category scheme="http://www.blogger.com/atom/ns#" term="nanopublication"/><category scheme="http://www.blogger.com/atom/ns#" term="Pensoft"/><category scheme="http://www.blogger.com/atom/ns#" term="RDF"/><category scheme="http://www.blogger.com/atom/ns#" term="synonyms"/><title type='text'>Nanopubs, a way to create even more silos</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). 
Nanopubs, a way to create even more silos &lt;a href=&quot;https://doi.org/10.59350/6nj85-7te92&quot;&gt;https://doi.org/10.59350/6nj85-7te92&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;Pensoft have recently introduced “nanopubs”, small structured publications that can be thought of as containing the minimum possible statement that could be published.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Nanopublications are the smallest units of publishable information: a scientifically meaningful assertion about anything that can be uniquely identified and attributed to its author and serve to communicate a single statement, its original source (provenance) and citation record (publication info). Nanopublications are fully expressed in a way that is both human-readable and machine-interpretable. For more, see &lt;a href=&quot;https://nanopub.net&quot;&gt;https://nanopub.net&lt;/a&gt;, Pensoft blog, this video and on our website. &lt;a href=&quot;https://bdj.pensoft.net/nanopublications&quot;&gt;Nanopublications&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Nanopubs are promoted as &lt;a href=&quot;https://en.wikipedia.org/wiki/FAIR_data&quot;&gt;FAIR&lt;/a&gt;, that is findable, accessible, interoperable, and reusable. I like the idea of nanopubs, but the examples I have seen so far are problematic. As an aside, there are reasons not to be optimistic about nanopubs (or text-mining in general), see &lt;a href=&quot;https://iphylo.blogspot.com/2021/12/the-business-of-extracting-knowledge.html&quot;&gt;The Business of Extracting Knowledge from Academic Publications&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;I’m going to focus on one nanopub &lt;a href=&quot;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&quot;&gt;RAXCvEZfCc&lt;/a&gt;, which comes from the paper &lt;a href=&quot;https://doi.org/10.3897/BDJ.12.e120304&quot;&gt;Towards computable taxonomic knowledge: Leveraging nanopublications for sharing new synonyms in the Madagascan genus Helictopleurus (Coleoptera, Scarabaeinae)&lt;/a&gt;. This nanopub says that &lt;em&gt;Helictopleurus dorbignyi&lt;/em&gt; Montreuil, 2005 is a subjective synonym of &lt;em&gt;Helictopleurus halffteri&lt;/em&gt; Balthasar, 1964.&lt;/p&gt;
&lt;p&gt;In other words,&lt;/p&gt;
&lt;!--
&lt;p&gt;graph&lt;br&gt;
N1(Helictopleurus dorbignyi) --&amp;gt; isSynonymOf --&amp;gt; N2(Helictopleurus halffteri)&lt;/p&gt;
--&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjc9ARdUyAnTXGumntk-B43H4KFK2uW1THsSEYCywNMk9a4q-3ReyAWAyJFr1NS99d_Fq8MI7bcxzWfmUBFgQM9lZLtU6Te-EsLtiI0FQXsfHTjpPUHn79U95JL7PFeOnWTzVYmf3AAzpCtBTg7dCrdwEgBWHOT60Y_JjhPILqgkZTgPVDMy0Mq/s1188/g3.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;320&quot; data-original-height=&quot;100&quot; data-original-width=&quot;1188&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjc9ARdUyAnTXGumntk-B43H4KFK2uW1THsSEYCywNMk9a4q-3ReyAWAyJFr1NS99d_Fq8MI7bcxzWfmUBFgQM9lZLtU6Te-EsLtiI0FQXsfHTjpPUHn79U95JL7PFeOnWTzVYmf3AAzpCtBTg7dCrdwEgBWHOT60Y_JjhPILqgkZTgPVDMy0Mq/s320/g3.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This seems a fairly simple thing to say, indeed we could say it with a single triple, but the corresponding nanopub requires 33 RDF triples to say this.&lt;/p&gt;
&lt;div style=&quot;font-family:monospace;white-space:pre-wrap;height:200px;overflow-y:auto;&quot;&gt;&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://www.nanopub.org/nschema#hasAssertion&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#Head&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://www.nanopub.org/nschema#hasProvenance&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#provenance&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#Head&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://www.nanopub.org/nschema#hasPublicationInfo&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#Head&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#type&amp;gt; &amp;lt;http://www.nanopub.org/nschema#Nanopublication&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#Head&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#association&amp;gt; &amp;lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#type&amp;gt; &amp;lt;https://w3id.org/biolink/vocab/OrganismTaxonToOrganismTaxonAssociation&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#association&amp;gt; &amp;lt;http://www.w3.org/2000/01/rdf-schema#comment&amp;gt; &quot;Subjective synonymy based on morphological comparison of the type specimens of the two species names&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#association&amp;gt; &amp;lt;https://w3id.org/biolink/vocab/object&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#objtaxon&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#association&amp;gt; &amp;lt;https://w3id.org/biolink/vocab/predicate&amp;gt; &amp;lt;http://purl.obolibrary.org/obo/NOMEN_0000285&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#association&amp;gt; &amp;lt;https://w3id.org/biolink/vocab/subject&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#subjtaxon&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#objtaxon&amp;gt; &amp;lt;https://w3id.org/kpxl/biodiv/terms/hasTaxonName&amp;gt; &amp;lt;https://www.checklistbank.org/dataset/9880/taxon/3K9T4&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#subjtaxon&amp;gt; &amp;lt;https://w3id.org/kpxl/biodiv/terms/hasTaxonName&amp;gt; &amp;lt;https://www.checklistbank.org/dataset/9880/taxon/3K9ST&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; &amp;lt;http://rs.tdwg.org/dwc/terms/basisOfRecord&amp;gt; &amp;lt;http://rs.tdwg.org/dwc/terms/PreservedSpecimen&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#provenance&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; &amp;lt;http://www.w3.org/ns/prov#wasAttributedTo&amp;gt; &amp;lt;https://orcid.org/0000-0002-1938-6105&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#provenance&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#assertion&amp;gt; &amp;lt;http://www.w3.org/ns/prov#wasDerivedFrom&amp;gt; &amp;lt;https://arpha.pensoft.net/preview.php?document_id=22521&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#provenance&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#sig&amp;gt; &amp;lt;http://purl.org/nanopub/x/hasAlgorithm&amp;gt; &quot;RSA&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#sig&amp;gt; &amp;lt;http://purl.org/nanopub/x/hasPublicKey&amp;gt; &quot;MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCnFtZQdjMpPH4duOBwDybRdPo93QCanFGN8cnpyHqZRQ+FINXypUYCNRSx3VBaWZoLVB/CYCoMY0or/oxBQwl5N7Y/8Ebj+G9ZSNsSkM9uo2DL91f26Y1y2UDE7bnajG909kXQnJS1G59cqIaKyLInjMFD5vWnptysj/ljBv3NTwIDAQAB&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#sig&amp;gt; &amp;lt;http://purl.org/nanopub/x/hasSignature&amp;gt; &quot;YzTUmwGRmqHiJVyU1A6rPI1bHbAJPS+Zw6hnDPWzZ9a/7TP+yM/HAf5E9BTS3HNKaCgLAHSnsRg5Q0lPauYQyJd9tbLzR6VU/WJv399Z7/qrn4EhgCULkIhrCAkuWzRtSyHMEbuzyu51ZSQCCPgMZ3HwpVtRa+gVDgqu3nsi5x4=&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#sig&amp;gt; &amp;lt;http://purl.org/nanopub/x/hasSignatureTarget&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://purl.org/dc/terms/created&amp;gt; &quot;2023-12-24T06:24:14.480Z&quot;^^&amp;lt;http://www.w3.org/2001/XMLSchema#dateTime&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://purl.org/dc/terms/creator&amp;gt; &amp;lt;https://orcid.org/0000-0002-1938-6105&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://purl.org/dc/terms/license&amp;gt; &amp;lt;https://creativecommons.org/licenses/by/4.0/&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://purl.org/nanopub/x/hasNanopubType&amp;gt; &amp;lt;http://purl.obolibrary.org/obo/NOMEN_0000017&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://purl.org/nanopub/x/hasNanopubType&amp;gt; &amp;lt;https://w3id.org/kpxl/biodiv/terms/BiodivNanopub&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://purl.org/nanopub/x/introduces&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#association&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#type&amp;gt; &amp;lt;https://w3id.org/kpxl/biodiv/terms/BiodivNanopub&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;http://www.w3.org/2000/01/rdf-schema#label&amp;gt; &quot;Helictopleurus dorbignyi Montreuil, 2005 (species) - ICZN subjective synonym - Helictopleurus halffteri Balthasar, 1964 (species)&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/wasCreatedFromProvenanceTemplate&amp;gt; &amp;lt;http://purl.org/np/RAYfEAP8KAu9qhBkCtyq_hshOvTAJOcdfIvGhiGwUqB-M&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/wasCreatedFromPubinfoTemplate&amp;gt; &amp;lt;http://purl.org/np/RAA2MfqdBCzmz9yVWjKLXNbyfBNcwsMmOqcNUxkk1maIM&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/wasCreatedFromPubinfoTemplate&amp;gt; &amp;lt;http://purl.org/np/RAR40PzxS9rmUC2lH2ct7IlYhyEib-3GXY5DkuR8wgHRw&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/wasCreatedFromPubinfoTemplate&amp;gt; &amp;lt;http://purl.org/np/RAh1gm83JiG5M6kDxXhaYT1l49nCzyrckMvTzcPn-iv90&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/wasCreatedFromTemplate&amp;gt; &amp;lt;http://purl.org/np/RAf9CyiP5zzCWN-J0Ts5k7IrZY52CagaIwM-zRSBmhrC8&amp;gt; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://www.checklistbank.org/dataset/9880/taxon/3K9ST&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/hasLabelFromApi&amp;gt; &quot;Helictopleurus dorbignyi Montreuil, 2005 (species)&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&amp;lt;https://www.checklistbank.org/dataset/9880/taxon/3K9T4&amp;gt; &amp;lt;https://w3id.org/np/o/ntemplate/hasLabelFromApi&amp;gt; &quot;Helictopleurus halffteri Balthasar, 1964 (species)&quot; &amp;lt;https://w3id.org/np/RAXCvEZfCcjYuH5DWOIujBehGQt61y_nRHWssw9u6aYig#pubinfo&amp;gt; .
&lt;/div&gt;
&lt;p&gt;In part this is because it includes cryptographic signing, presumably to ensure that the statement is what you think it is. There is also a plethora of information about how the nanopublication was derived. Presumably, this is to satisfy reproducibility concerns. But none of this matters if you are producing data that people can’t easily use.&lt;/p&gt;
&lt;p&gt;The core statement looks like this:&lt;/p&gt;
&lt;!--
&lt;p&gt;graph LR&lt;br&gt;
A(#association) --&amp;gt; PS(biolink:subject) --&amp;gt; NS(#subjtaxon)&lt;br&gt;
A(#association) --&amp;gt; PP(biolink:predicate) --&amp;gt; NN(obo:NOMEN_0000285)&lt;br&gt;
A(#association) --&amp;gt; PO(biolink:object) --&amp;gt; NO(#objtaxon)&lt;br&gt;
NS(#subjtaxon) --&amp;gt; PN1(biodiv:hasTaxonName) --&amp;gt; 3K9ST&lt;br&gt;
NO(#objtaxon) --&amp;gt; PN2(biodiv:hasTaxonName) --&amp;gt; 3K9T4&lt;/p&gt;
--&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjy3iM9XD_mlaoqsKSmsvsLbDJ74BjcmsMLpDQSY5vInosFUcws3zmoxcIs_Ofar7lJPVIiuQ8m-DbhIPxRzyd3JEhLhMUUe6hm_oedQCrVxZ7Il30w0sW9TEql9uDj_50W_7rKuWYGF3LGMDf11Fv4mqffpI3bdmV-58idLWy2QoVWYYyVJdYo/s1702/g2.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;320&quot; data-original-height=&quot;436&quot; data-original-width=&quot;1702&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjy3iM9XD_mlaoqsKSmsvsLbDJ74BjcmsMLpDQSY5vInosFUcws3zmoxcIs_Ofar7lJPVIiuQ8m-DbhIPxRzyd3JEhLhMUUe6hm_oedQCrVxZ7Il30w0sW9TEql9uDj_50W_7rKuWYGF3LGMDf11Fv4mqffpI3bdmV-58idLWy2QoVWYYyVJdYo/s320/g2.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;This graph is saying that there is a triple&lt;/p&gt;
&lt;!--
&lt;p&gt;graph LR&lt;br&gt;
NS(#subjtaxon) --&amp;gt; NN(obo:NOMEN_0000285) --&amp;gt; NO(#objtaxon)&lt;/p&gt;
--&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhXYslhOcuMDl77QY6bMc_o9OhzvjLr3E7ap0TscESxeqveNx0Ji-A08BQDRAs2n_C19XpQzRHyhc5bJXktGAY-DswpS2JCg3JwKz3FMO3TxyAtTQBO9rGm2w4XZphTahJVoTqSOHm3s7xeFFt_2_IhBe_L1ZrqIZg8zN9VQQl34cH5J5p_pLE_/s922/g1.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;320&quot; data-original-height=&quot;100&quot; data-original-width=&quot;922&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhXYslhOcuMDl77QY6bMc_o9OhzvjLr3E7ap0TscESxeqveNx0Ji-A08BQDRAs2n_C19XpQzRHyhc5bJXktGAY-DswpS2JCg3JwKz3FMO3TxyAtTQBO9rGm2w4XZphTahJVoTqSOHm3s7xeFFt_2_IhBe_L1ZrqIZg8zN9VQQl34cH5J5p_pLE_/s320/g1.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;By itself this isn’t terribly useful because neither of the two taxa are “things” that have identifiers, they are blank nodes. So, what is the statement about? If we follow the &lt;code&gt;biodiv:hasTaxonName&lt;/code&gt; links, we see that there are names associated with these taxa (&lt;em&gt;Helictopleurus dorbignyi&lt;/em&gt;, and &lt;em&gt;Helictopleurus halffteri&lt;/em&gt;), and these are linked to records in a database in ChecklistBank. This seems complicated, but I assume it is equivalent to saying “in this publication we regard taxa with the names Helictopleurus dorbignyi, and Helictopleurus halffteri to be the same thing”.&lt;/p&gt;
&lt;h2 id=&quot;interoperablity&quot;&gt;Interoperability&lt;/h2&gt;
&lt;p&gt;I feel that I have been banging this drum for years now, but you &lt;strong&gt;cannot have interoperability unless you use the same identifiers for the same things&lt;/strong&gt;. That means persistent identifiers, identifiers that you have some confidence will be around in ten, 20, or 50 years (at least).&lt;/p&gt;
&lt;p&gt;Leaving aside the question of how persistent the nanopubs themselves are, I find it alarming that the link to the source of the statement that these two names are synonyms is &lt;strong&gt;not&lt;/strong&gt; the DOI for the paper &lt;a href=&quot;https://doi.org/10.3897/BDJ.12.e120304&quot;&gt;10.3897/BDJ.12.e120304&lt;/a&gt;, but a link to the publishing platform ARPHA: &lt;a href=&quot;https://arpha.pensoft.net/preview.php?document_id=22521&quot;&gt;https://arpha.pensoft.net/preview.php?document_id=22521&lt;/a&gt;. This link takes me to a login page, not the actual publication, so I can’t retrieve the source of the statement made in the nanopublication using the nanopublication itself.&lt;/p&gt;
&lt;p&gt;The taxon names have as their identifiers &lt;a href=&quot;https://www.checklistbank.org/dataset/9880/taxon/3K9T4&quot;&gt;https://www.checklistbank.org/dataset/9880/taxon/3K9T4&lt;/a&gt; and &lt;a href=&quot;https://www.checklistbank.org/dataset/9880/taxon/3K9ST&quot;&gt;https://www.checklistbank.org/dataset/9880/taxon/3K9ST&lt;/a&gt;. These identifiers are also local to a particular dataset. Why not use identifiers such as the Catalogue of Life entries for these names (e.g., &lt;a href=&quot;https://www.catalogueoflife.org/data/taxon/3K9T4&quot;&gt;https://www.catalogueoflife.org/data/taxon/3K9T4&lt;/a&gt;, which supports RDF via embedded JSON-LD) or even LSIDs? We have &lt;a href=&quot;https://lsid.io/urn:lsid:organismnames.com:name:2521540+&quot;&gt;urn:lsid:organismnames.com:name:2521540&lt;/a&gt; for &lt;em&gt;Helictopleurus halffteri&lt;/em&gt; and &lt;a href=&quot;https://lsid.io/urn:lsid:organismnames.com:name:1770738&quot;&gt;urn:lsid:organismnames.com:name:1770738&lt;/a&gt; for &lt;em&gt;Helictopleurus dorbignyi&lt;/em&gt;.&lt;/p&gt;
&lt;p&gt;Interestingly, the one well-known external identifier linked to is the ORCID for the author of the nanopub, &lt;a href=&quot;https://orcid.org/0000-0002-1938-6105&quot;&gt;0000-0002-1938-6105&lt;/a&gt;. I can’t help thinking that this suggests that authorship of the nanopublication is more important than the fact it publishes.&lt;/p&gt;
&lt;p&gt;One can imagine that nanopublications will be registered with authors’ ORCID profiles, which helps flesh out their online CV. This is nice, but where is the equivalent for linking the publication to the nanopub via its DOI, or the taxon names to the nanopub? How do we know whether these nanopubs contradict other nanopubs, or support them, or add new information? For example, there seems to be no way to go from the DOI for the paper to the nanopub.&lt;/p&gt;
&lt;h2 id=&quot;vocabulary&quot;&gt;Vocabulary&lt;/h2&gt;
&lt;p&gt;Another aspect of interoperability is using the same terms to describe relationships. I’m struck by how many different vocabularies the nanopub requires. Some of these are specific to the administrivia of the nanopub, but others are biological.&lt;/p&gt;
&lt;p&gt;For example, &lt;a href=&quot;http://purl.obolibrary.org/obo/NOMEN_0000285&quot;&gt;http://purl.obolibrary.org/obo/NOMEN_0000285&lt;/a&gt; is used to define the relation between the two taxa. I confess it’s unclear to me why &lt;code&gt;NOMEN_0000285&lt;/code&gt; isn’t used to directly link the two ChecklistBank records, rather than the indirection via &lt;code&gt;#subjtaxon&lt;/code&gt; and &lt;code&gt;#objtaxon&lt;/code&gt;, given that it is a relationship between names (isn’t it?).&lt;/p&gt;
&lt;p&gt;Other ontologies include &lt;a href=&quot;https://biolink.github.io/biolink-model/&quot;&gt;Biolink-Model&lt;/a&gt; and &lt;a href=&quot;https://w3id.org/kpxl/biodiv/terms/&quot;&gt;biodiv&lt;/a&gt; which I can’t seem to find a description of (the URL resolves to queries on the nanodash site). It amazes me how readily people create new ontologies, especially as in the wider world there is a trend towards one vocabulary to rule them all (&lt;a href=&quot;http://schema.org&quot;&gt;schema.org&lt;/a&gt;).&lt;/p&gt;
&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
&lt;p&gt;I find it disheartening that the bulk of the information in a nanopub is administrivia about that nanopub. I understand the desire to establish provenance and to cryptographically sign the information, but all this is of limited use if the actual scientific information is poorly expressed.&lt;/p&gt;
&lt;p&gt;If nanopubs are to be useful I think they need to:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;Use persistent identifiers for every entity being referred to, ideally using existing, well-known identifiers. If you are referring to a publication that has a DOI, use that DOI. If you are referring to a taxon or a taxon name, use an appropriate identifier (e.g., an LSID for the name, a URL to a classification).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Use simple, existing vocabularies wherever possible. Can you model the data using &lt;a href=&quot;https://schema.org&quot;&gt;schema.org&lt;/a&gt; (and extensions such as &lt;a href=&quot;https://bioschemas.org&quot;&gt;Bioschemas&lt;/a&gt;)? If not, are you sure you can’t?&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Unless more care is taken, nanopubs will go the way of much of the RDF world, creating new, even more verbose, even more arcane silos of data. This is partly a consequence of the primary incentive, which is to publish minimal units of information. Given that we now have persistent identifiers for people (ORCIDs) and those identifiers are linked to an infrastructure that can automatically register publications linked to ORCIDs, can we expect to see a flood of nanopubs? What value will these have if we can’t make ready use of the “facts” they assert? How will people build tools on top of nanopubs if the only thing that reliably links to the external world is the ORCID of the person who created it?&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7343967609515363669'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7343967609515363669'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/06/nanopubs-way-to-create-even-more-silos.html' title='Nanopubs, a way to create even more silos'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjc9ARdUyAnTXGumntk-B43H4KFK2uW1THsSEYCywNMk9a4q-3ReyAWAyJFr1NS99d_Fq8MI7bcxzWfmUBFgQM9lZLtU6Te-EsLtiI0FQXsfHTjpPUHn79U95JL7PFeOnWTzVYmf3AAzpCtBTg7dCrdwEgBWHOT60Y_JjhPILqgkZTgPVDMy0Mq/s72-c/g3.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-473359528456433637</id><published>2024-04-19T14:09:00.004+01:00</published><updated>2024-04-24T08:38:39.098+01:00</updated><title type='text'>Notes on transforming BHL images</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Notes on transforming BHL images &lt;a href=&quot;https://doi.org/10.59350/2gpbb-98a53 &quot;&gt;https://doi.org/10.59350/2gpbb-98a53&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;I’ve been down this road before, e.g. &lt;a href=&quot;https://iphylo.blogspot.com/2011/04/bhl-djvu-and-reading-fcking-manual.html&quot;&gt;BHL, DjVu, and reading the f*cking manual&lt;/a&gt; and &lt;a href=&quot;https://iphylo.blogspot.com/2015/08/demo-of-full-text-indexing-of-bhl-using.html&quot;&gt;Demo of full-text indexing of BHL using CouchDB hosted by Cloudant&lt;/a&gt;, but I’m revisiting converting BHL page scans to black and white images, partly to clean them up, to make them closer to what a modern reader might expect, and partly to reduce the size of the image. The latter means faster loading times and smaller PDFs for articles.&lt;/p&gt;
&lt;p&gt;The links above explored using foreground image layers from DjVu (less useful now that DjVu is almost dead as a format), and using CSS in web browsers to convert a colour image to gray scale. I’ve also experimented with the approach taken by Google Books (see &lt;a href=&quot;https://github.com/rdmpage/google-book-images&quot;&gt;https://github.com/rdmpage/google-book-images&lt;/a&gt;), which uses &lt;a href=&quot;https://github.com/agl/jbig2enc&quot;&gt;jbig2enc&lt;/a&gt; to compress images and reduce the number of colours.&lt;/p&gt;
&lt;p&gt;In my latest experiments, I use jbig2enc to transform BHL page images into black and white images where each pixel is either black or white (i.e., image depth = 1), then use ImageMagick to resize the image to the Google Books width of 685 pixels and a depth of 2. Typically this gives an image around 25Kb - 30Kb in size. It looks clean and readable.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjqnVGiNhzdranxPXYrymHC3TPL4mtnk7kUJwnZ7Q7JSPmjSwJ3IhdmqdEWonEIcyrluZWZhESRx1mAEujqagP0miCTGkjyeckWZwOFxPGSzHrb6l6biGpSWVpEuP2UNASRKve7UyhHemXT0U2kQDVvAnR3KyeocBKpsH-3Xokg7iK6lbItzLIf/s1112/62994245.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1112&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjqnVGiNhzdranxPXYrymHC3TPL4mtnk7kUJwnZ7Q7JSPmjSwJ3IhdmqdEWonEIcyrluZWZhESRx1mAEujqagP0miCTGkjyeckWZwOFxPGSzHrb6l6biGpSWVpEuP2UNASRKve7UyhHemXT0U2kQDVvAnR3KyeocBKpsH-3Xokg7iK6lbItzLIf/s320/62994245.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhOh6WkijCeij3JFY__k6iya6cBvRxDs7K5yC8Vm_J6Otlrff9RERZjOEPH6yBWxPhgBKJuDHIkq00Qlqhe6CyxReBPRB-VlVjxH0Hem0nJD2biffRxNGMnk8utlabnEV4jDgKTPXn1ZEFAsQmh8IWqgIxHoMiqC3uwtQCDN-Cr8kxtFmDU2Tub/s1112/62994245.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1112&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhOh6WkijCeij3JFY__k6iya6cBvRxDs7K5yC8Vm_J6Otlrff9RERZjOEPH6yBWxPhgBKJuDHIkq00Qlqhe6CyxReBPRB-VlVjxH0Hem0nJD2biffRxNGMnk8utlabnEV4jDgKTPXn1ZEFAsQmh8IWqgIxHoMiqC3uwtQCDN-Cr8kxtFmDU2Tub/s320/62994245.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This approach breaks down for photographs and especially colour plates. For example, this image looks horrible:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiRz_o747fvRqtJy0TEenXJYvrNbxtIyoqMgIWrEM-P_n8QL8WjstAZz4uLwIvEOC_yvw_MFZatTau_NkBEz6_HgSmx-NDA9Ls-sIpZr4jxwHhsdE7uiM1k_FfhuuPE4qtn49ttDV0BNbdQBqxOR0KAmFEpWCWDBmRZUtCIrPp7DxCLntgK99u2/s1184/16281585.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1184&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiRz_o747fvRqtJy0TEenXJYvrNbxtIyoqMgIWrEM-P_n8QL8WjstAZz4uLwIvEOC_yvw_MFZatTau_NkBEz6_HgSmx-NDA9Ls-sIpZr4jxwHhsdE7uiM1k_FfhuuPE4qtn49ttDV0BNbdQBqxOR0KAmFEpWCWDBmRZUtCIrPp7DxCLntgK99u2/s320/16281585.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgre2fYuy1NNqmpVxYdVdjayoOV9V0SgdaWZg7k2V0UPiAdy7F-JL_nCqo_7m2h-vVHqxrlyld89aE2xZbOo_lY9X5nZOMNyvW75z-8QMHmaxD4f-gcjq93B-3D1wgus6uZ9ki3GKRGEs9HIuXM4hHTFEi8qhKe053jxIWF09f29FmiAhf6UMNh/s1184/16281585-bw.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1184&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgre2fYuy1NNqmpVxYdVdjayoOV9V0SgdaWZg7k2V0UPiAdy7F-JL_nCqo_7m2h-vVHqxrlyld89aE2xZbOo_lY9X5nZOMNyvW75z-8QMHmaxD4f-gcjq93B-3D1wgus6uZ9ki3GKRGEs9HIuXM4hHTFEi8qhKe053jxIWF09f29FmiAhf6UMNh/s320/16281585-bw.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;When compressing images that have photos or illustrations jbig2enc can extract the part of the image that includes the illustration, for example:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjxBT0I2vddsFPCopzL-4VsTIfh7gN30lcA9qI5i_h7yU8SJNmy5QxpKcVA4ntD4FyL5IRL2gR0flH3zwt0gDXG21l5ZghNRHf8FkblxrM-q_a20RZIapyi9zFhAk60Fq8TZ3N3gYsuuZv99W4YrZp9Z8uhDDIm8xKz1x3xA3meH-Aw4ZJvIxAI/s3184/output.0000.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;3184&quot; data-original-width=&quot;1840&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjxBT0I2vddsFPCopzL-4VsTIfh7gN30lcA9qI5i_h7yU8SJNmy5QxpKcVA4ntD4FyL5IRL2gR0flH3zwt0gDXG21l5ZghNRHf8FkblxrM-q_a20RZIapyi9zFhAk60Fq8TZ3N3gYsuuZv99W4YrZp9Z8uhDDIm8xKz1x3xA3meH-Aw4ZJvIxAI/s320/output.0000.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This isn’t perfect, but it raises the possibility that we can convert text and line drawings to black and white, and then add back photographs and plates (whether black and white, or colour). After some experimentation using tools such as &lt;a href=&quot;https://imagemagick.org/script/composite.php&quot;&gt;ImageMagick composite&lt;/a&gt; I have a simple workflow:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;compress page image using jbig2enc&lt;/li&gt;
&lt;li&gt;take the extracted illustration and set all white pixels to be transparent&lt;/li&gt;
&lt;li&gt;convert the black and white image output by jbig2enc to colour (required for the next step)&lt;/li&gt;
&lt;li&gt;create a composite image by overlaying the extracted illustration (now on a transparent background) on top of the black-and-white page image&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The result looks passable:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhciZA-JXbBBB2xuusLFx2ndMhFxb2BrQ_1LmG7kfqboWoknsNw_AEy_vvD_Gklbfu9wLpfvwd18370wIKV24C6rYVTFEXklm1xPGi5MZWE2G5V5sR43_BBpo2YgL-rOA717fuCklvfXZuAO3-tiDC5yi153qOjr8AF-e3bqiZzeEWaaf1H7HUv/s1184/16281585-c.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1184&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhciZA-JXbBBB2xuusLFx2ndMhFxb2BrQ_1LmG7kfqboWoknsNw_AEy_vvD_Gklbfu9wLpfvwd18370wIKV24C6rYVTFEXklm1xPGi5MZWE2G5V5sR43_BBpo2YgL-rOA717fuCklvfXZuAO3-tiDC5yi153qOjr8AF-e3bqiZzeEWaaf1H7HUv/s320/16281585-c.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;In this case, we still have a lot of the sepia-toned background, the illustration hasn’t been cleanly separated, but we do at least get some colour.&lt;/p&gt;
&lt;p&gt;Still work to do, but it looks promising and suggests a way to make dramatically smaller PDFs of BHL content. There are crude code and example files in &lt;a href=&quot;https://github.com/rdmpage/bhl-image-transforming&quot;&gt;GitHub&lt;/a&gt;.&lt;/p&gt;

&lt;h2&gt;Update&lt;/h2&gt;

&lt;p&gt;Some Googling turned up &lt;a href=&quot;https://www.imagemagick.org/discourse-server/viewtopic.php?t=14081&quot;&gt;Removing orange tint-mask from color-negatives&lt;/a&gt;, which gives us the following command:&lt;/p&gt;

&lt;p&gt;&lt;code&gt;
convert 16281585.jpg -negate -channel all -normalize -negate -channel all 16281585-rgb.jpg
&lt;/code&gt;
&lt;/p&gt;

&lt;p&gt;Applying this to our image results in:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiRz_o747fvRqtJy0TEenXJYvrNbxtIyoqMgIWrEM-P_n8QL8WjstAZz4uLwIvEOC_yvw_MFZatTau_NkBEz6_HgSmx-NDA9Ls-sIpZr4jxwHhsdE7uiM1k_FfhuuPE4qtn49ttDV0BNbdQBqxOR0KAmFEpWCWDBmRZUtCIrPp7DxCLntgK99u2/s1184/16281585.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1184&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiRz_o747fvRqtJy0TEenXJYvrNbxtIyoqMgIWrEM-P_n8QL8WjstAZz4uLwIvEOC_yvw_MFZatTau_NkBEz6_HgSmx-NDA9Ls-sIpZr4jxwHhsdE7uiM1k_FfhuuPE4qtn49ttDV0BNbdQBqxOR0KAmFEpWCWDBmRZUtCIrPp7DxCLntgK99u2/s320/16281585.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjjE_JKD-JFqZfQg2Da9V2186VuhY8pQx6owid6t6d3mhxHUAlniwm0aKOQsCFDdHNoS38fzIq_h9xosw-0xfAbiZVxXZZMfvZWwjM1nZerekHEX4bagPYWVoeB9FwX-y_qANi7MpO_eMPB64Wn_lqH943t5S_YoOajwD0BamGeTUUREi1cHmBG/s1184/16281585-rgb.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;1184&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjjE_JKD-JFqZfQg2Da9V2186VuhY8pQx6owid6t6d3mhxHUAlniwm0aKOQsCFDdHNoS38fzIq_h9xosw-0xfAbiZVxXZZMfvZWwjM1nZerekHEX4bagPYWVoeB9FwX-y_qANi7MpO_eMPB64Wn_lqH943t5S_YoOajwD0BamGeTUUREi1cHmBG/s320/16281585-rgb.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This looks a lot better. Results will vary depending on the evenness of the page scan (i.e., is there a shadow on the image), but I think this gives us a way to display the plates with a higher degree of contrast.&lt;/p&gt;

&lt;h2 id=&quot;reading&quot;&gt;Reading&lt;/h2&gt;
&lt;p&gt;Adam Langley, Dan S. Bloomberg, “Google Books: making the public domain universally accessible”, Proc. SPIE 6500, Document Recognition and Retrieval XIV, 65000H (2007/01/29); &lt;a href=&quot;http://dx.doi.org/10.1117/12.710609&quot;&gt;doi:10.1117/12.710609&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/473359528456433637'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/473359528456433637'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/04/notes-on-transforming-bhl-images.html' title='Notes on transforming BHL images'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjqnVGiNhzdranxPXYrymHC3TPL4mtnk7kUJwnZ7Q7JSPmjSwJ3IhdmqdEWonEIcyrluZWZhESRx1mAEujqagP0miCTGkjyeckWZwOFxPGSzHrb6l6biGpSWVpEuP2UNASRKve7UyhHemXT0U2kQDVvAnR3KyeocBKpsH-3Xokg7iK6lbItzLIf/s72-c/62994245.jpg" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-7837499708434790131</id><published>2024-03-27T12:35:00.005+00:00</published><updated>2024-03-27T15:50:33.450+00:00</updated><title type='text'>Hugging Face Autotrain</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Hugging Face Autotrain &lt;a href=&quot;https://doi.org/10.59350/7p1n4-wdv84&quot;&gt;https://doi.org/10.59350/7p1n4-wdv84&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjTIwCiqC0jxCWDl-4YVDXIjsuIBmJ0RGVpdlTY7UY3uu5gNGHD2jTecrimuZMbrHTuIz2zTLfWCjqq6Oi-GeXUg5h9Yc2EWN3ZdeqpFf51AAgHMdaeKgNmI2owslQFWqI7GPD9I9-l_tmv7_HIr75-yWwlhEN9Ybx-mv9Tcz0syjztUTen_Iw9/s1024/hf-logo.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; clear: right; float: right;&quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;128&quot; data-original-height=&quot;1024&quot; data-original-width=&quot;1024&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjTIwCiqC0jxCWDl-4YVDXIjsuIBmJ0RGVpdlTY7UY3uu5gNGHD2jTecrimuZMbrHTuIz2zTLfWCjqq6Oi-GeXUg5h9Yc2EWN3ZdeqpFf51AAgHMdaeKgNmI2owslQFWqI7GPD9I9-l_tmv7_HIr75-yWwlhEN9Ybx-mv9Tcz0syjztUTen_Iw9/s200/hf-logo.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;These are notes to myself on using &lt;a href=&quot;https://huggingface.co/docs/autotrain/en/index&quot;&gt;Hugging Face AutoTrain&lt;/a&gt;. The first version of this had a very nice interface where you could simply upload a folder of images and train a model. It was limited in the range of tasks and models, but made up for that in ease of use. Now AutoTrain has been replaced by &lt;a href=&quot;https://huggingface.co/new-space?template=autotrain-projects/autotrain-advanced&quot;&gt;AutoTrain Advanced&lt;/a&gt;, which &lt;a href=&quot;https://discuss.huggingface.co/t/easy-autotrain/70027&quot;&gt;not everyone is happy about&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&quot;training-a-model&quot;&gt;Training a model&lt;/h2&gt;
&lt;p&gt;After a bit of fussing about (and paying attention to the log messages) I’ve managed to train a model to classify images in much the same way as before. The steps are as follows:&lt;/p&gt;
&lt;p&gt;Go to &lt;a href=&quot;https://huggingface.co/new-space?template=autotrain-projects/autotrain-advanced&quot;&gt;AutoTrain Advanced&lt;/a&gt;. You should see a screen like this:&lt;/p&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiYnQo_BGjsOoXav1rr4vvoQYFvqr4xuEshhGcSH4UQRAePILU4FenuqLCO3KFZdqnwSD8-3S2aIDEaqA8_ALT72EWpDEJ9B2HFTqyV81b6mRgYMV9kKdFbdeRT6y-0FSciqyiYOYuZeZPiruduiGvo8ncsP6_tHAUPvwvPUcoWp3x4BcAGmrOZ/s1162/Screenshot%202024-03-27%20at%2011.57.26.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;600&quot; data-original-height=&quot;1162&quot; data-original-width=&quot;685&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiYnQo_BGjsOoXav1rr4vvoQYFvqr4xuEshhGcSH4UQRAePILU4FenuqLCO3KFZdqnwSD8-3S2aIDEaqA8_ALT72EWpDEJ9B2HFTqyV81b6mRgYMV9kKdFbdeRT6y-0FSciqyiYOYuZeZPiruduiGvo8ncsP6_tHAUPvwvPUcoWp3x4BcAGmrOZ/s600/Screenshot%202024-03-27%20at%2011.57.26.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;By default &lt;code&gt;Docker&lt;/code&gt; and &lt;code&gt;AutoTrain&lt;/code&gt; are selected. It will also show the free hardware spec (&lt;code&gt;CPU basic • 2 vCPU • 16GB&lt;/code&gt;). I found that for image classification this hardware choice would cause AutoTrain to fail, so I selected &lt;code&gt;Nvidia T4 small • 4 vCPU • 15GB&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;Give your space a name and click on &lt;code&gt;Create Space&lt;/code&gt; to create the space.  You will now see something like this:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiUW8W2HR0Ximz4lW9h4Qz3fUaUs8d-6Ass9WjYy6U04lw3cVgGTY7miQ6j6NFrsjdFHNWuNas4Ucx5uIHRc42IyqI7xx8aUQAiJ_V02JfAtju51sG1732AZvrcg00IHPjY_8Gr7VXrvBtqL4zTdbCsH71Ho-MJsWb3zENu6S3vWvqtaM2ZvztL/s1131/Screenshot%202024-03-27%20at%2012.02.56.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;499&quot; data-original-width=&quot;1131&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiUW8W2HR0Ximz4lW9h4Qz3fUaUs8d-6Ass9WjYy6U04lw3cVgGTY7miQ6j6NFrsjdFHNWuNas4Ucx5uIHRc42IyqI7xx8aUQAiJ_V02JfAtju51sG1732AZvrcg00IHPjY_8Gr7VXrvBtqL4zTdbCsH71Ho-MJsWb3zENu6S3vWvqtaM2ZvztL/s400/Screenshot%202024-03-27%20at%2012.02.56.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;It took 3-4 minutes to build the space. Once the space is built you will then be asked to log in to Hugging Face (seems odd, but that’s what it asks you to do). You are then asked to give your space permissions to connect to your account.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjqUbo_WFw2TUPMNGUocm9Z-QpYcKLeFxcD_K3G7O-hiDKlPp56IXrxVctd-b2y8TRrho6bsTeucI9UczZ62eAGHtjRV028_STHKss5NQ7OmWonxBAFszh-dusRcvwmTzH1lnMYexZIBQ2AIvcRsB6NnAsQDu2nU16WBg7opLaTmTMdN7yoERoe/s919/Screenshot%202024-03-27%20at%2012.07.38.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;919&quot; data-original-width=&quot;652&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjqUbo_WFw2TUPMNGUocm9Z-QpYcKLeFxcD_K3G7O-hiDKlPp56IXrxVctd-b2y8TRrho6bsTeucI9UczZ62eAGHtjRV028_STHKss5NQ7OmWonxBAFszh-dusRcvwmTzH1lnMYexZIBQ2AIvcRsB6NnAsQDu2nU16WBg7opLaTmTMdN7yoERoe/s400/Screenshot%202024-03-27%20at%2012.07.38.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;Now you will see a slightly scary looking interface (this is one reason why people miss the old “easy” AutoTrain). &lt;/p&gt; 
  
&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiYBJqJUYpVWZqr6_ehNc9wFSiuHDziC2AmO0_FA9GUvb8ekMw2mFdczyx2XInQKZd8JgUcyf2zQxkwaMx6rAb88_22sbb3n8Pp0hJvoBEyJXpDXu3KiZtmXrKRbjw79TUR1TjifQZRHSy0xnTe8s6HLkl9jRjUZ8wW-xyX-oSn0qccx5aaYcYf/s1074/Screenshot%202024-03-27%20at%2012.08.24.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;500&quot; data-original-height=&quot;851&quot; data-original-width=&quot;1074&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiYBJqJUYpVWZqr6_ehNc9wFSiuHDziC2AmO0_FA9GUvb8ekMw2mFdczyx2XInQKZd8JgUcyf2zQxkwaMx6rAb88_22sbb3n8Pp0hJvoBEyJXpDXu3KiZtmXrKRbjw79TUR1TjifQZRHSy0xnTe8s6HLkl9jRjUZ8wW-xyX-oSn0qccx5aaYcYf/s600/Screenshot%202024-03-27%20at%2012.08.24.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
  
&lt;p&gt;For &lt;code&gt;Task&lt;/code&gt; I selected &lt;code&gt;Image Classification&lt;/code&gt; and the default base model (&lt;code&gt;google/vit-base-patch16-224&lt;/code&gt;). I ignored every other setting, and simply uploaded the training data. This was a zip file containing separate folders for each category of image, so that images, say of cats, would be in a folder called &lt;code&gt;cats&lt;/code&gt;, pictures of dogs would be in &lt;code&gt;dogs&lt;/code&gt;, etc.&lt;/p&gt;
&lt;p&gt;I then clicked &lt;code&gt;Start&lt;/code&gt; and after a warning that this would cost money (I subscribe to Hugging Face) saw this:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgvMPG6Vm8oS3V8Z1XGXhUroa5JWjdwV_BVoCzKbUegSYsi-_C_K-ukZj5rBf59TzL52p0VXDMRAwt9BzDbv7GMGIe-ddvUbS-SVU97-ahLZQAv8DQswrWDKHYnYLLHXWAdO-wX24GdyA9PKKC9ek9iW3zIwMKYDcNB_U5yhsRCau97qyXVvDDV/s469/Screenshot%202024-03-27%20at%2012.13.03.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;155&quot; data-original-width=&quot;469&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgvMPG6Vm8oS3V8Z1XGXhUroa5JWjdwV_BVoCzKbUegSYsi-_C_K-ukZj5rBf59TzL52p0VXDMRAwt9BzDbv7GMGIe-ddvUbS-SVU97-ahLZQAv8DQswrWDKHYnYLLHXWAdO-wX24GdyA9PKKC9ek9iW3zIwMKYDcNB_U5yhsRCau97qyXVvDDV/s400/Screenshot%202024-03-27%20at%2012.13.03.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;You can track progress in the logs, which you can see using the middle of the buttons below.&lt;/p&gt;


&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgebBjavFH_GNUI_6X5P3UhrOsnCZU-BDY97emnT9eaVbqC4T_tGaPkfn_mhNxqqP4nrI4ExwTPUWuFkB8TGsxjE9WOaUxDYh5BMAwtKUCKBsnZ6BBQDiUH-kHhfCUq7xC3YggcXL-0xxkkMxqRRSgSxDuvazHrq4kcwz5Cq5XoUYunS6UE0I0w/s177/Screenshot%202024-03-27%20at%2012.13.48.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;42&quot; data-original-width=&quot;177&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgebBjavFH_GNUI_6X5P3UhrOsnCZU-BDY97emnT9eaVbqC4T_tGaPkfn_mhNxqqP4nrI4ExwTPUWuFkB8TGsxjE9WOaUxDYh5BMAwtKUCKBsnZ6BBQDiUH-kHhfCUq7xC3YggcXL-0xxkkMxqRRSgSxDuvazHrq4kcwz5Cq5XoUYunS6UE0I0w/s400/Screenshot%202024-03-27%20at%2012.13.48.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;Once completed, the space pauses, which is a little alarming but simply means that it has finished training. Yay, you now have a trained model!&lt;/p&gt;
&lt;p&gt;When I first tried this, I got errors because I didn’t upload the data in the proper format (my zip file had a folder that contained the training data folders; it needs the folders to be in the root of the zip archive). It also failed to train on the base (free) hardware; I only discovered this by looking at the logs and seeing error messages regarding the lack of a GPU.&lt;/p&gt;
&lt;h2 id=&quot;what-now&quot;&gt;What now?&lt;/h2&gt;
&lt;p&gt;The other thing about the original AutoTrain was that it gave you an app to explore how your model worked on other data. The new AutoTrain simply pauses after training and you are left with “um, what do I do now?”&lt;/p&gt;
&lt;p&gt;After some fussing I discovered that in my profile I now had a brand new &lt;code&gt;Model&lt;/code&gt; appearing in my list of models.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi_QJrYR1bLBqrtIN5XEGUBZKwMLJRBNqhgDC-GhcNl87KCpeegtGeulmOOz1NbpMZy3hZLqNmYco49MaNtM15Ww2zlcQnS95vr0B4UqDXNHsm3kHh7q7jIzDtwNRhjvpuu9s_4tTXA_5qDCNLlnp6AP_jZH2Fr2cWs1_KiJ3Xvg5v-jIa7dcbF/s679/Screenshot%202024-03-27%20at%2012.19.23.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;320&quot; data-original-height=&quot;85&quot; data-original-width=&quot;679&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi_QJrYR1bLBqrtIN5XEGUBZKwMLJRBNqhgDC-GhcNl87KCpeegtGeulmOOz1NbpMZy3hZLqNmYco49MaNtM15Ww2zlcQnS95vr0B4UqDXNHsm3kHh7q7jIzDtwNRhjvpuu9s_4tTXA_5qDCNLlnp6AP_jZH2Fr2cWs1_KiJ3Xvg5v-jIa7dcbF/s320/Screenshot%202024-03-27%20at%2012.19.23.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;If I click on the model I go to the model page, where there is a &lt;code&gt;Deploy&lt;/code&gt; button, this is how you get an app. First though, make sure your model is publicly visible (by default it is private). Click on &lt;code&gt;Settings&lt;/code&gt; and go to the &lt;code&gt;Change model visibility&lt;/code&gt; to make it public. If you now click on the &lt;code&gt;Deploy&lt;/code&gt; button you will see a list of options:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgvjXgwVYloK4pCrnqR7NI5LUxQTWwXflMNzkZhh8GVYoR0UqLQcd158Md0z1W6qDD05j2mzDIbGBSVRGjVYWZtWoOHsNWDbKI3nMGNWgDajjqE17A_vTm5ASuYPRNTeH4up73gULK0BWAcflQWVXamQ2SvdoaSREzzbKg3OIb0_l0JkGsG3Ykb/s357/Screenshot%202024-03-27%20at%2012.20.14.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;357&quot; data-original-width=&quot;322&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgvjXgwVYloK4pCrnqR7NI5LUxQTWwXflMNzkZhh8GVYoR0UqLQcd158Md0z1W6qDD05j2mzDIbGBSVRGjVYWZtWoOHsNWDbKI3nMGNWgDajjqE17A_vTm5ASuYPRNTeH4up73gULK0BWAcflQWVXamQ2SvdoaSREzzbKg3OIb0_l0JkGsG3Ykb/s400/Screenshot%202024-03-27%20at%2012.20.14.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;I picked &lt;code&gt;Spaces&lt;/code&gt;. This enables you to create a simple online app. I accepted all the defaults (including the base, free hardware with no GPU) and in a couple of minutes you get an app that looks like this:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi7DLf8HIjipRh_oIh015LbNWcmx5BnpOay5SP9IMvBeJ9hY4mHEyO4skivLD3pke0KYsohR77sGb4uuzZHsKSVGc7oSO4BuaWncwLOc9r5gw9jwZBh-XDOqmynf4TDAXzx9mVttHT4I-LKfxQGmX3Amy-aoi2VEViisjrs0GXStSiGeBAS_cca/s1012/Screenshot%202024-03-27%20at%2012.27.15.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;567&quot; data-original-width=&quot;1012&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi7DLf8HIjipRh_oIh015LbNWcmx5BnpOay5SP9IMvBeJ9hY4mHEyO4skivLD3pke0KYsohR77sGb4uuzZHsKSVGc7oSO4BuaWncwLOc9r5gw9jwZBh-XDOqmynf4TDAXzx9mVttHT4I-LKfxQGmX3Amy-aoi2VEViisjrs0GXStSiGeBAS_cca/s400/Screenshot%202024-03-27%20at%2012.27.15.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;Upload an image, press &lt;code&gt;Submit&lt;/code&gt; and you will get a classification of that image:&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEixyPAnizQ39QYwbNRg8TmbmjF7BjXAXVXMaQmDCkbZdzurTLE7Sqyv4pRESC68cQUGmHbnBPYClJ7eU8eTC-EdthUxGRhpvqyKjAOFLKcjiY1tflKH9cGCO76SlFgsSkGbPu3f0qgFOGyuEaSWmppCxUz_GHkNrAtN3zlxEQ7m7zjzmxp7TaI3/s1005/Screenshot%202024-03-27%20at%2012.27.40.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;751&quot; data-original-width=&quot;1005&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEixyPAnizQ39QYwbNRg8TmbmjF7BjXAXVXMaQmDCkbZdzurTLE7Sqyv4pRESC68cQUGmHbnBPYClJ7eU8eTC-EdthUxGRhpvqyKjAOFLKcjiY1tflKH9cGCO76SlFgsSkGbPu3f0qgFOGyuEaSWmppCxUz_GHkNrAtN3zlxEQ7m7zjzmxp7TaI3/s400/Screenshot%202024-03-27%20at%2012.27.40.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;Apps tend to sleep, so it may be that you come back to an app, load an image, and get an error message that the model is still loading. Wait a moment, try again, and it should work.&lt;/p&gt;
&lt;h2 id=&quot;api&quot;&gt;API&lt;/h2&gt;
&lt;p&gt;Using the app is fun, but if you want to use the model to classify lots of images then you want to use the API. The &lt;code&gt;Deploy&lt;/code&gt; button lists &lt;code&gt;Inference API (serverless)&lt;/code&gt; as an option. Clicking on that gives you the URL you can POST images to; it will return the results in JSON. As with the app, if the model is sleeping then your first call may throw an error; typically you wait a moment and try again, and then you can classify images in bulk.&lt;/p&gt;

&lt;h2&gt;Summary&lt;/h2&gt;
&lt;p&gt;Hugging Face is quite an extraordinary tool, and it is a way to try and make sense of the explosion of AI techniques available. But it is clearly written by developers for developers, and that can make it intimidating, even for someone like me who writes code, uses GitHub, etc. The original AutoTrain was a joy to use in comparison, and this feels like a missed opportunity where Hugging Face could have kept both the old &quot;easy&quot; version alongside the new, more powerful, but rather clunkier &quot;advanced&quot; version. Still, this is easier than dealing directly with the &lt;a href=&quot;https://xkcd.com/1987/&quot;&gt;hellscape that is Python.&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7837499708434790131'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/7837499708434790131'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/03/hugging-face-autotrain.html' title='Hugging Face Autotrain'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjTIwCiqC0jxCWDl-4YVDXIjsuIBmJ0RGVpdlTY7UY3uu5gNGHD2jTecrimuZMbrHTuIz2zTLfWCjqq6Oi-GeXUg5h9Yc2EWN3ZdeqpFf51AAgHMdaeKgNmI2owslQFWqI7GPD9I9-l_tmv7_HIr75-yWwlhEN9Ybx-mv9Tcz0syjztUTen_Iw9/s72-c/hf-logo.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-1332902500471093017</id><published>2024-02-20T15:32:00.008+00:00</published><updated>2024-03-27T15:49:24.966+00:00</updated><title type='text'>Problems with the DataCite Data Citation Corpus</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2024). Problems with the DataCite Data Citation Corpus &lt;a href=&quot;https://doi.org/10.59350/t80g1-xys37&quot;&gt;https://doi.org/10.59350/t80g1-xys37&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;DataCite have released the &lt;a href=&quot;https://makedatacount.org/data-citation&quot;&gt;Data Citation Corpus&lt;/a&gt;, together with a &lt;a href=&quot;http://corpus.datacite.org/dashboard&quot;&gt;dashboard&lt;/a&gt; that summarises the corpus. This is billed as:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;A trusted central aggregate of all data citations to further our  &lt;strong&gt;understanding of data usage&lt;/strong&gt;  and advance meaningful data metrics&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;The goal is to build a citation database between scholarly articles and data, such as datasets in repositories, sequences in GenBank, protein structures in PDB, etc. Access to the corpus can be obtained by submitting a form, then having a (very pleasant) conversation with DataCite about the nature of the corpus. This process feels clunky because it introduces friction. If you want people to explore this, why not make it a simple download?&lt;/p&gt;
&lt;p&gt;I downloaded the corpus, which is nearly 7 Gb of JSON, formatted as an array(!), thankfully with one citation per line so it is reasonably easy to parse. (&lt;a href=&quot;https://jsonlines.org&quot;&gt;JSON Lines&lt;/a&gt; would be more convenient).&lt;/p&gt;
&lt;p&gt;I loaded this into a SQLite database to make it easier to query, and I have some thoughts. Before outlining why I think the corpus has serious problems, I should emphasise that I’m a big fan of what DataCite are trying to do. Being able to track data usage to give credit to researchers and repositories (citations to data as well as papers), to track provenance of data (e.g., when a GenBank sequence turns out to be wrong, being able to find all the studies that used it), and to find additional links between papers beyond bibliographic links (e.g., when data is cited but not the original publication) are all good things. Obviously, lots of people have talked about this, but this is my blog so I’ll cite myself as an example 😉.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Page, R. Visualising a scientific article. &lt;em&gt;Nat Prec&lt;/em&gt; (2008). &lt;a href=&quot;https://doi.org/10.1038/npre.2008.2579.1&quot;&gt;https://doi.org/10.1038/npre.2008.2579.1&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;My main interest in the corpus is tracking citations of DNA sequences, which are often not linked to even the original publication in GenBank. I was hopeful the corpus could help in this work.&lt;/p&gt;
&lt;p&gt;Ok, let’s now look at the actual corpus.&lt;/p&gt;
&lt;h2 id=&quot;data-structure&quot;&gt;Data structure&lt;/h2&gt;
&lt;p&gt;Each citation comprises a JSON object, with a mix of external identifiers such as DOIs, and internal identifiers as UUIDs. The latter are numerous, and make the data file much bigger than it needs to be. For example, there are two sources of citation data, DataCite, and the Chan Zuckerberg Initiative. These have &lt;code&gt;sourceId&lt;/code&gt; values of &lt;code&gt;3644e65a-1696-4cdf-9868-64e7539598d2&lt;/code&gt; and &lt;code&gt;c66aafc0-cfd6-4bce-9235-661a4a7c6126&lt;/code&gt;, respectively. There are a little over 10 million citations in the corpus, so that’s a lot of bytes that could simply have been &lt;code&gt;1&lt;/code&gt; or &lt;code&gt;2&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;More frustrating than the wasted space is the lack of any list of what each UUID means. I figured out that &lt;code&gt;3644e65a-1696-4cdf-9868-64e7539598d2&lt;/code&gt; is DataCite only by looking at the data, knowing that CZI had contributed more records than DataCite. For other entities such as repositories and publishers, one has to go spelunking in the data to make reasonable guesses as to what the repositories are. Given that most citations seem to be to biomedical entities, why not use something such as the compact identifiers from &lt;a href=&quot;https://identifiers.org&quot;&gt;Identifiers.org&lt;/a&gt; for each repository?&lt;/p&gt;
&lt;h2 id=&quot;dashboard&quot;&gt;Dashboard&lt;/h2&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhWpilOoLPSxgTnxeWyCXx39ef-LwSoQ-rkl7dowe8SLIGcAVD0gcNi8Z0DxSg6hU7qDJJmI8EaLK0atYZF3UTRQsTQIGh0G9MULOC48OTe3g2HOqzh9RvouDTSlqvUiEiMAhE_4K4KPgEnchi6DvdwobW8tJKIMOUnwlxv5Dk4GcJCIsqkGMyM/s1788/Screenshot%202024-02-20%20at%2011.58.57.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;1788&quot; data-original-width=&quot;1534&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhWpilOoLPSxgTnxeWyCXx39ef-LwSoQ-rkl7dowe8SLIGcAVD0gcNi8Z0DxSg6hU7qDJJmI8EaLK0atYZF3UTRQsTQIGh0G9MULOC48OTe3g2HOqzh9RvouDTSlqvUiEiMAhE_4K4KPgEnchi6DvdwobW8tJKIMOUnwlxv5Dk4GcJCIsqkGMyM/s400/Screenshot%202024-02-20%20at%2011.58.57.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;
&lt;p&gt;DataCite provides a  &lt;a href=&quot;http://corpus.datacite.org/dashboard&quot;&gt;dashboard&lt;/a&gt; to summarise key features of the corpus. There are a couple of aspects of the dashboard that I find frustrating.&lt;/p&gt;
&lt;p&gt;Firstly, the “citation counts by subject” is misleading. A quick glance suggests that law and sociology are the subjects that most actively cite data. This would be surprising, especially given that much of the data generated by CZI comes from PubMed Central. Only 50,000 citations out of 10 million comprise articles with subject tags, so this chart is showing results for approximately 0.5% of the corpus. The chart includes the caveat “The visualization includes the top 20 subjects where metadata is available.” but omits to tell us that as a result the chart is irrelevant for &amp;gt;99% of the data.&lt;/p&gt;
&lt;p&gt;The dashboard is  interesting in what it says about the stakeholders of this project. We see counts of citations broken down by source (CZI or DataCite), and publisher, but not by repository. This suggests that repositories are second class citizens. Surely they deserve a panel on the dashboard? I suspect researchers are going to be more interested in what kinds of data are being cited than what academic publishers are in the corpus. For instance, 3.75 million (37.5%) citations are to sequences in GenBank, 1.7 million (17.5%) are to the &lt;a href=&quot;https://www.rcsb.org&quot;&gt;Protein Data Bank (PDB)&lt;/a&gt;, and 0.89 million (8.9%) are to SNPs.&lt;/p&gt;
&lt;h2 id=&quot;chan-zuckerberg-initiative-and-ai&quot;&gt;Chan Zuckerberg Initiative and AI&lt;/h2&gt;
&lt;p&gt;The corpus is a collaboration between DataCite and the Chan Zuckerberg Initiative (CZI) and CZI are responsible for the bulk of the data. Unfortunately there is no description of how those citations were extracted from the source papers. Perhaps CZI used something like SciBERT which they employed in earlier work to extract citations to scientific software &lt;a href=&quot;https://arxiv.org/abs/2209.00693&quot;&gt;https://arxiv.org/abs/2209.00693&lt;/a&gt;? We don’t know. One reason this matters is that there are lots of cases where the citations are incorrect, and if we are going to figure out why, we need to know how they were obtained. At present it is simply a black box.&lt;/p&gt;
&lt;p&gt;These are just a few examples of incorrect citations:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;The mouse line &lt;em&gt;Prdm&lt;/em&gt;&lt;sup&gt;&lt;em&gt;11tm1.1ahl&lt;/em&gt;&lt;/sup&gt; is conflated with the PDB identifier &lt;a href=&quot;https://www.rcsb.org/structure/1AHL&quot;&gt;1ahl&lt;/a&gt;, see &lt;a href=&quot;https://hyp.is/c2Xras_KEe6zEGcm97yBRw/journals.plos.org/plosone/article?id=10.1371/journal.pone.0134503&quot;&gt;https://hyp.is/c2Xras_KEe6zEGcm97yBRw/journals.plos.org/plosone/article?id=10.1371/journal.pone.0134503&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;A museum specimen CR00240699 is mistakenly interpreted as a GenBank accession number, see &lt;a href=&quot;https://hyp.is/CGTJcM_kEe674TfyvGLC0A/zookeys.pensoft.net/article/21580/download/pdf/287887&quot;&gt;https://hyp.is/CGTJcM_kEe674TfyvGLC0A/zookeys.pensoft.net/article/21580/download/pdf/287887&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;A grant number Y21026 is mistakenly interpreted as a GenBank accession number, see &lt;a href=&quot;https://hyp.is/HpVXhs9PEe6D2UMxrIdqJw/bmjopen.bmj.com/content/12/9/e054887&quot;&gt;https://hyp.is/HpVXhs9PEe6D2UMxrIdqJw/bmjopen.bmj.com/content/12/9/e054887&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;The time period 24 hours (24hr) is conflated with a PDB record 24hr that doesn’t exist &lt;a href=&quot;https://hyp.is/dNfqZs9SEe6U2nMOKHb-Pw/journal.waocp.org/article_89819_8835738205ecaaad36eebfa826a17779.pdf&quot;&gt;https://hyp.is/dNfqZs9SEe6U2nMOKHb-Pw/journal.waocp.org/article_89819_8835738205ecaaad36eebfa826a17779.pdf&lt;/a&gt;. There are a lot of these, such as 17&lt;sup&gt;th&lt;/sup&gt;, 2016, etc.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;These are just a few examples I came across while pottering around with the corpus. I’ve not done any large-scale analysis, but one ZooKeys article I came across &lt;a href=&quot;https://doi.org/10.3897/zookeys.739.21580&quot;&gt;https://doi.org/10.3897/zookeys.739.21580&lt;/a&gt; cites 32 entities, only four of which are correct.&lt;/p&gt;
&lt;p&gt;I get that text mining is hard, but I would expect AI would do better than what we could achieve by simply matching dumb regular expressions. For example, surely a tool that claims any measure of intelligence would be able to recognise that this sentence lists grant numbers, not a GenBank accession number?&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Funding  This study was supported by Longhua Hospital Shanghai University of Traditional Chinese Medicine (grant number:  Y21026), and Longhua Hospital Shanghai University of Traditional Chinese Medicine (YW.006.035)&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;As a fallback, we could also check that a given identifier is valid. For example, there is no sequence with the accession number &lt;a href=&quot;https://www.ncbi.nlm.nih.gov/nuccore?term=Y21026&quot;&gt;Y21026&lt;/a&gt;. The set of possible identifiers is finite (if large), why didn’t the corpus check whether each identifier extracted actually existed?&lt;/p&gt;

&lt;h2&gt;Update: major errors found&lt;/h2&gt;

&lt;p&gt;I&#39;ve created a &lt;a href=&quot;https://github.com/rdmpage/data-citation-corpus&quot;&gt;GitHub repo&lt;/a&gt; to keep track of the errors I&#39;m finding.&lt;/p&gt;

&lt;h3&gt;Protein Data Bank&lt;/h3&gt;

&lt;p&gt;The &lt;a href=&quot;https://www.wwpdb.org&quot;&gt;Protein Data Bank&lt;/a&gt; (PDB) is the second largest repository in the corpus with 1,729,783 citations. There are 177,220 distinct PDB identifiers cited. These identifiers should match the pattern &lt;code&gt;/^[0-9][A-Za-z0-9]{3}$/&lt;/code&gt;, that is, a number 0-9 followed by three alphanumeric characters.  However 31,612 (18%) do not. Examples include &quot;//osf.io/6bvcq&quot; and &quot;//evs.nci.nih.gov/ftp1/CTCAE/CTCAE_4.03/Archive/CTCAE_4.0_2009-05-29_QuickReference_8.5x11.pdf&quot;. So the tools for finding PDB citations do not understand what a PDB identifier should look like.&lt;/p&gt;

&lt;p&gt;Out of curiosity I downloaded all the existing PDB identifiers from &lt;a href=&quot;https://files.wwpdb.org/pub/pdb/holdings/current_file_holdings.json.gz&quot;&gt;https://files.wwpdb.org/pub/pdb/holdings/current_file_holdings.json.gz&lt;/a&gt;, which gave me 216,225 distinct PDB identifiers. Comparing actual PDB identifiers with ones included in the corpus I got 1,233,993 hits, which is 71% of the total in the corpus. Hence almost half a million (a little under a third of the PDB citations) appear to be made up.&lt;/p&gt;

&lt;h3&gt;Individual articles&lt;/h3&gt;

&lt;h4&gt;Taxonomic revision of Stigmatomma Roger (Hymenoptera: Formicidae) in the Malagasy region&lt;/h4&gt;

&lt;p&gt;The paper &lt;a href=&quot;https://doi.org/10.3897/BDJ.4.e8032&quot;&gt;https://doi.org/10.3897/BDJ.4.e8032&lt;/a&gt; is credited with citing 126 entities, including 108 sequences and 14 PDB records. None of this is true. The supposed PDB records are figure numbers, e.g. “Fig. 116d” becomes PDB 116d, and the sequence accession numbers are specimen codes or field numbers.&lt;/p&gt;

&lt;h3&gt;Nucleotide sequences&lt;/h3&gt;

&lt;p&gt;Sequence data is the single largest data type cited in the corpus, with 3.8 million citations. I ran a sample of the first 1000 sequence accession numbers in the corpus against GenBank and in 486 cases GenBank didn&#39;t recognise the accession number as valid. So potentially half the sequence citations are wrong.&lt;/p&gt;

&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
&lt;p&gt;I think the Data Citation Corpus is potentially a great resource, but if it is going to be “[a] trusted central aggregate of all data citations” then I think there are a few things it needs to do:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Make the data more easily accessible so that people can scrutinise it without having to jump through hoops&lt;/li&gt;
&lt;li&gt;Tell us how the Chan Zuckerberg Initiative did the entity matching&lt;/li&gt;
&lt;li&gt;Improve the entity matching&lt;/li&gt;
&lt;li&gt;Add a quality control step that validates extracted identifiers&lt;/li&gt;
&lt;li&gt;Expand the dashboard to give users a better sense of what data is being cited&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/1332902500471093017'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/1332902500471093017'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2024/02/problems-with-datacite-data-citation.html' title='Problems with the DataCite Data Citation Corpus'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhWpilOoLPSxgTnxeWyCXx39ef-LwSoQ-rkl7dowe8SLIGcAVD0gcNi8Z0DxSg6hU7qDJJmI8EaLK0atYZF3UTRQsTQIGh0G9MULOC48OTe3g2HOqzh9RvouDTSlqvUiEiMAhE_4K4KPgEnchi6DvdwobW8tJKIMOUnwlxv5Dk4GcJCIsqkGMyM/s72-c/Screenshot%202024-02-20%20at%2011.58.57.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-2555847968974825235</id><published>2023-11-29T11:30:00.005+00:00</published><updated>2024-02-07T10:27:07.133+00:00</updated><title type='text'>It&#39;s 2023 - why are we still not sharing phylogenies?</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2023). It’s 2023 - why are we still not sharing phylogenies? &lt;a href=&quot;https://doi.org/10.59350/n681n-syx67&quot;&gt;https://doi.org/10.59350/n681n-syx67&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;A quick note to support a recent Twitter thread &lt;a href=&quot;https://twitter.com/rdmpage/status/1729816558866718796?s=61&amp;amp;t=nM4XCRsGtE7RLYW3MyIpMA&quot;&gt;https://twitter.com/rdmpage/status/1729816558866718796?s=61&amp;amp;t=nM4XCRsGtE7RLYW3MyIpMA&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;The  article “Diversification of flowering plants in space and time” by Dimitrov et al. describes a genus-level phylogeny for 14,244 flowering plant genera. This is a major achievement, and yet neither the tree nor the data supporting that tree are readily available. There is lots of supplementary information (as PDF files), but no machine readable tree or alignment data.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Dimitrov, D., Xu, X., Su, X. &lt;em&gt;et al.&lt;/em&gt; Diversification of flowering plants in space and time. &lt;em&gt;Nat Commun&lt;/em&gt;  &lt;strong&gt;14&lt;/strong&gt;, 7609 (2023). &lt;a href=&quot;https://doi.org/10.1038/s41467-023-43396-8&quot;&gt;https://doi.org/10.1038/s41467-023-43396-8&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;What we have is a link to a &lt;a href=&quot;https://en.geodata.pku.edu.cn/index.php?c=content&amp;amp;a=list&amp;amp;catid=200&quot;&gt;web site&lt;/a&gt; which in turn has a link to a &lt;a href=&quot;https://geodata.pku.edu.cn/OneZoomSeedPlants/seed_plants.htm&quot;&gt;OneZoom visualisation&lt;/a&gt;. If you look at the source code for the web site you can see the phylogeny in Newick format as a Javascript file.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhwn95hB-WqA10yK-CFU120lsc_SiDYHyd1PyqXzUxmWGC7-PdUykllNS1Y3bhEAJzIzXW73nEHhyhDl_wcHekAeUEN9rZIRQl9eoNX91gDzi1aKnORzvIS9pf4m9aElI4lxZ8gNtpbxGyzqm1fYVj7Gh4VrxsoIo9Z1Ii2WarmHgUgrEzSQPzb/s1072/Screenshot%202023-11-29%20at%2011.00.47.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;706&quot; data-original-width=&quot;1072&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhwn95hB-WqA10yK-CFU120lsc_SiDYHyd1PyqXzUxmWGC7-PdUykllNS1Y3bhEAJzIzXW73nEHhyhDl_wcHekAeUEN9rZIRQl9eoNX91gDzi1aKnORzvIS9pf4m9aElI4lxZ8gNtpbxGyzqm1fYVj7Gh4VrxsoIo9Z1Ii2WarmHgUgrEzSQPzb/s400/Screenshot%202023-11-29%20at%2011.00.47.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;This is a far from ideal way to share data. Readers can’t easily get the tree, explore it, evaluate it, or use it in their own analyses. I grabbed the tree and put it online as a GitHub &lt;a href=&quot;https://gist.github.com/rdmpage/dd9ffd3f67b80b1d560cc9448867d92a&quot;&gt;GIST&lt;/a&gt;. Once you have the tree you can do things such as try a different tree viewer, such as &lt;a href=&quot;https://phylocloud.cgmlab.org/tree_page/65670f1aac7dc72a041052c9/&quot;&gt;PhyloCloud&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiLHyspB-1kD3CoCkNUAM470pEF-W3g7XEi_8yrVe_rHnqbLrxk8RcymK6QSbjzN4PkFGzyNv4AxQYY6nHlWRH1cfu4ztgsICTrsUsfk6ZPp1Bssxla_JxkT9oKokScdARUX42kyCsIqRP5K1k1rQGygi9vbvn5BJ-4ab_XR_Cvl9Qi2qGLqAYi/s1341/Screenshot%202023-11-29%20at%2011.03.37.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;1180&quot; data-original-width=&quot;1341&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiLHyspB-1kD3CoCkNUAM470pEF-W3g7XEi_8yrVe_rHnqbLrxk8RcymK6QSbjzN4PkFGzyNv4AxQYY6nHlWRH1cfu4ztgsICTrsUsfk6ZPp1Bssxla_JxkT9oKokScdARUX42kyCsIqRP5K1k1rQGygi9vbvn5BJ-4ab_XR_Cvl9Qi2qGLqAYi/s400/Screenshot%202023-11-29%20at%2011.03.37.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;That is a start, but it’s clearly not ideal. Why didn’t the authors put the tree (and the data) into a proper repository, such as &lt;a href=&quot;https://zenodo.org&quot;&gt;Zenodo&lt;/a&gt; where it would be persistent and citable, and also linked to the authors’ ORCID profile?  That way everybody wins, readers get a tree to explore, the authors have an additional citable output.&lt;/p&gt;
&lt;p&gt;The state of sharing of phylogenetic data is dire, not helped by the slow and painful demise of &lt;a href=&quot;https://treebase.org/&quot;&gt;TreeBASE&lt;/a&gt;. Sharing machine readable trees and datasets still does not seem to be the norm in phylogenetics.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2555847968974825235'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/2555847968974825235'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2023/11/its-2023-why-are-we-still-not-sharing.html' title='It&#39;s 2023 - why are we still not sharing phylogenies?'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhwn95hB-WqA10yK-CFU120lsc_SiDYHyd1PyqXzUxmWGC7-PdUykllNS1Y3bhEAJzIzXW73nEHhyhDl_wcHekAeUEN9rZIRQl9eoNX91gDzi1aKnORzvIS9pf4m9aElI4lxZ8gNtpbxGyzqm1fYVj7Gh4VrxsoIo9Z1Ii2WarmHgUgrEzSQPzb/s72-c/Screenshot%202023-11-29%20at%2011.00.47.png" height="72" width="72"/></entry><entry><id>tag:blogger.com,1999:blog-16081779.post-903668539491051583</id><published>2023-10-26T16:11:00.006+01:00</published><updated>2024-02-07T10:30:02.113+00:00</updated><title type='text'>Where are the plant type specimens? Mapping JSTOR Global Plants to GBIF</title><content type='html'>&lt;div style=&quot;padding:1em;border:1px solid #DDD;border-radius:0.2em;&quot;&gt;How to cite: &lt;strong&gt;Page, R. (2023). Where are the plant type specimens? Mapping JSTOR Global Plants to GBIF. &lt;a href=&quot;https://doi.org/10.59350/m59qn-22v52&quot;&gt;https://doi.org/10.59350/m59qn-22v52&lt;/a&gt;&lt;/strong&gt;&lt;/div&gt;

&lt;p&gt;This blog post documents my attempts to create links between two major resources for plant taxonomy: JSTOR’s &lt;a href=&quot;https://plants.jstor.org&quot;&gt;Global Plants&lt;/a&gt; and &lt;a href=&quot;https://www.gbif.org&quot;&gt;GBIF&lt;/a&gt;, specifically between type specimens in JSTOR and the corresponding occurrence in GBIF. The TL;DR is that I have tried to map 1,354,861 records for type specimens from JSTOR to the equivalent record in GBIF, and managed to find 903,945 (67%) matches.&lt;/p&gt;
&lt;h2 id=&quot;why-do-this&quot;&gt;Why do this?&lt;/h2&gt;
&lt;p&gt;Why do this? Partly because a collaborator asked me, but I’ve long been interested in JSTOR’s Global Plants. This was a massive project to digitise plant type specimens all around the world, generating millions of images of herbarium sheets. It also resulted in a standardised way to refer to a specimen, namely its barcode, which comprises the herbarium code and a number (typically padded to eight digits). These barcodes are converted into JSTOR URLs, so that E00279162 becomes &lt;a href=&quot;https://plants.jstor.org/stable/10.5555/al.ap.specimen.e00279162&quot;&gt;https://plants.jstor.org/stable/10.5555/al.ap.specimen.e00279162&lt;/a&gt;. These same barcodes have become the basis of efforts to create stable identifiers for plant specimens, for example &lt;a href=&quot;https://data.rbge.org.uk/herb/E00279162&quot;&gt;https://data.rbge.org.uk/herb/E00279162&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;JSTOR created an elegant interface to these specimens, complete with links to literature on JSTOR, BHL, and links to taxon pages on GBIF and elsewhere. It also added the ability to comment on individual specimens using &lt;a href=&quot;https://disqus.com&quot;&gt;Disqus&lt;/a&gt;.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjaJx_5psYTAPfRLaomxACMxZ5Ant1peCHJixf0PeYP82k2uw9vOq4P2Q-HzSakV8q5aSmfqs0X_J_OJmnnhttknlwVYeEguQaxhk4fWi0lKvh_KzugoZWd_MjHmXcImkuMVbJPxXmwln6-Ah7iy4elXNGhfhmofrZbHrgcc3VC7E7lrVd05Utw/s1032/Screenshot%202023-10-26%20at%2013.30.11.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;666&quot; data-original-width=&quot;1032&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjaJx_5psYTAPfRLaomxACMxZ5Ant1peCHJixf0PeYP82k2uw9vOq4P2Q-HzSakV8q5aSmfqs0X_J_OJmnnhttknlwVYeEguQaxhk4fWi0lKvh_KzugoZWd_MjHmXcImkuMVbJPxXmwln6-Ah7iy4elXNGhfhmofrZbHrgcc3VC7E7lrVd05Utw/s400/Screenshot%202023-10-26%20at%2013.30.11.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;


&lt;p&gt;However, JSTOR Global Plants is not open. If you click on a thumbnail image of a herbarium sheet you hit a paywall.&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgJg1Ek6wzEKStEwOAgm3wgI8sC49qq__akV0290AXG5yo1051G1SyR0wX7Psfba2E8ulEK4qixgvefRfXCxqZt5xilCbqgemLQJqwYsBPUQRhUxqWHaL2dtSO2FIplcujmGFXJMg0GwCEJ4qn5HqUHq93rOBvlVnLX6tJ9iOlwrPf8aeQ64bko/s1041/Screenshot%202023-10-26%20at%2013.30.29.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; width=&quot;400&quot; data-original-height=&quot;701&quot; data-original-width=&quot;1041&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgJg1Ek6wzEKStEwOAgm3wgI8sC49qq__akV0290AXG5yo1051G1SyR0wX7Psfba2E8ulEK4qixgvefRfXCxqZt5xilCbqgemLQJqwYsBPUQRhUxqWHaL2dtSO2FIplcujmGFXJMg0GwCEJ4qn5HqUHq93rOBvlVnLX6tJ9iOlwrPf8aeQ64bko/s400/Screenshot%202023-10-26%20at%2013.30.29.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;In contrast data in GBIF is open. The table below is a simplified comparison of JSTOR and GBIF.&lt;/p&gt;

&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Feature&lt;/th&gt;
&lt;th&gt;JSTOR&lt;/th&gt;
&lt;th&gt;GBIF&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Open or paywall&lt;/td&gt;
&lt;td&gt;Paywall&lt;/td&gt;
&lt;td&gt;Open&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Consistent identifier&lt;/td&gt;
&lt;td&gt;Yes&lt;/td&gt;
&lt;td&gt;No&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Images&lt;/td&gt;
&lt;td&gt;All specimens&lt;/td&gt;
&lt;td&gt;Some specimens&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Types linked to original name&lt;/td&gt;
&lt;td&gt;Yes&lt;/td&gt;
&lt;td&gt;Sometimes&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Community annotation&lt;/td&gt;
&lt;td&gt;Yes&lt;/td&gt;
&lt;td&gt;No&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Can download the data&lt;/td&gt;
&lt;td&gt;No&lt;/td&gt;
&lt;td&gt;Yes&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;API&lt;/td&gt;
&lt;td&gt;No&lt;/td&gt;
&lt;td&gt;Yes&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;p&gt;JSTOR offers a consistent identifier (the barcode), an image, has the type linked to the original name, and community annotation. But there is a paywall, and no way to download data. GBIF is open, enables both bulk download and API access, but often lacks images, and as we shall see below, the identifiers for specimens are a hot mess.&lt;/p&gt;
&lt;p&gt;The “Types linked to original name” feature concerns whether the type specimen is connected to the appropriate name. A type is (usually) the type specimen for a single taxonomic name. For example, E00279162 is the type for &lt;em&gt;Achasma subterraneum&lt;/em&gt; Holttum. This name is now regarded as a synonym of &lt;em&gt;Etlingera subterranea&lt;/em&gt; (Holttum) R. M. Sm. following the transfer to the genus &lt;em&gt;Etlingera&lt;/em&gt;. But E00279162 is not a type for the name &lt;em&gt;Etlingera subterranea&lt;/em&gt;. JSTOR makes this clear by stating that the type is stored under &lt;em&gt;Etlingera subterranea&lt;/em&gt; but is the type for &lt;em&gt;Achasma subterraneum&lt;/em&gt;. However, this information does not make it to GBIF, which tells us that E00279162 is a type for &lt;a href=&quot;https://www.gbif.org/species/2760154&quot;&gt;&lt;em&gt;Etlingera subterranea&lt;/em&gt;&lt;/a&gt; and that it knows of no type specimens for &lt;a href=&quot;https://www.gbif.org/species/2760155&quot;&gt;&lt;em&gt;Achasma subterraneum&lt;/em&gt;&lt;/a&gt;. Hence querying GBIF for type specimens is potentially fraught with error.&lt;/p&gt;
&lt;p&gt;Hence JSTOR often has cleaner and more accurate data. But it is behind a paywall. Hence I set about to get a list of all the type specimens that JSTOR has, and try and match those to GBIF. This would give me a sense of how much content behind JSTOR’s paywall was freely available in GBIF, as well as how much content JSTOR had that was absent from GBIF. I also wanted to use JSTOR’s reference to the original plant name to get around GBIF’s tendency to link types to the wrong name.&lt;/p&gt;
&lt;h2 id=&quot;challenges&quot;&gt;Challenges&lt;/h2&gt;
&lt;p&gt;Mapping JSTOR barcodes to records in GBIF proved challenging. In an ideal world specimens would have a single identifier that everyone would use when citing or otherwise referring to that specimen. Of course this is not the case. There are all manner of identifiers, ranging from barcodes, collector names and numbers, local database keys (integers, UUIDs, and anything in between). Some identifiers include version codes. All of this greatly complicates linking barcodes to GBIF records. I made extensive use of my &lt;a href=&quot;https://material-examined.herokuapp.com&quot;&gt;Material examined&lt;/a&gt; tool that attempts to translate specimen codes into GBIF records. Under the hood this means lots of regular expressions, and I spent a lot of time adding code to handle all the different ways herbaria manage to mangle barcodes.&lt;/p&gt;
&lt;p&gt;In some cases JSTOR barcodes are absent from the specimen information in the GBIF occurrence record itself but are hidden in metadata for the image (such as the URL to the image). My “Material examined” tool uses the GBIF API, and that doesn’t enable searches for parts of image URLs. Hence for some herbaria I had to download the archive, extract media URLs and look for barcodes. In the process I encountered a subtle bug in Safari that truncated downloads, see &lt;a href=&quot;https://discourse.gbif.org/t/downloads-failing-to-include-all-files-in-the-archive/4159&quot;&gt;Downloads failing to include all files in the archive&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Some herbaria have data in both JSTOR and GBIF, but no identifiers in common (other than collector names and numbers, which would require approximate string matching). But in some cases the herbaria have their own web sites which mention the JSTOR barcodes, as well as the identifiers those herbaria do share with GBIF. In these cases I would attempt to scrape the herbaria web sites, extract the barcode and original identifier, then find the original identifier in GBIF.&lt;/p&gt;
&lt;p&gt;Another observation is that in some cases the imagery in JSTOR is not the same as GBIF. For example &lt;a href=&quot;https://plants.jstor.org/stable/10.5555/al.ap.specimen.lisc002383&quot;&gt;LISC002383&lt;/a&gt; and &lt;a href=&quot;https://www.gbif.org/occurrence/813346859&quot;&gt;813346859&lt;/a&gt; are the same specimens but the images are different. Why are the images provided to JSTOR not being provided to GBIF?&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj_rl3EqL5DF98LVrbGz4qMDFIM7nCr5EA4QUgzjO2cyRjcUlqvopB3RPSJ8J5Xw6XCXirAc8R8nRUZM_WtcY5xbtAX1Op7ZI3Va7JH2oH90cMniQHapcnMWPBiMb7QqAEIwoyZGUiJZ78g-Akei1c6LvN0Sa2cfahoNtz_EiF0L5s6K_bJyIkp/s200/lisc002383.jpg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;200&quot; data-original-width=&quot;152&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj_rl3EqL5DF98LVrbGz4qMDFIM7nCr5EA4QUgzjO2cyRjcUlqvopB3RPSJ8J5Xw6XCXirAc8R8nRUZM_WtcY5xbtAX1Op7ZI3Va7JH2oH90cMniQHapcnMWPBiMb7QqAEIwoyZGUiJZ78g-Akei1c6LvN0Sa2cfahoNtz_EiF0L5s6K_bJyIkp/s320/lisc002383.jpg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhx2z-YmO1MqVdO0qvMaVbpd4fka-jXcl6g_r8mN6QuLgje8KCVuWJWh2gw_fKE9BJ1kFZ9tewx5fbdHkh4Jm1rd8agdl1JFMYAQ9Ku-3fu3OW8emjKm39CV2KTASNNZZHdm9fhrlbAB9S75CxxYmgGElZXVD4M0q8OhJ46elIPFaRYj2SBSznL/s200/b6f10ba1bb2460a11fa9b0d36f1ce0a6.jpeg&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;320&quot; data-original-height=&quot;200&quot; data-original-width=&quot;133&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhx2z-YmO1MqVdO0qvMaVbpd4fka-jXcl6g_r8mN6QuLgje8KCVuWJWh2gw_fKE9BJ1kFZ9tewx5fbdHkh4Jm1rd8agdl1JFMYAQ9Ku-3fu3OW8emjKm39CV2KTASNNZZHdm9fhrlbAB9S75CxxYmgGElZXVD4M0q8OhJ46elIPFaRYj2SBSznL/s320/b6f10ba1bb2460a11fa9b0d36f1ce0a6.jpeg&quot;/&gt;&lt;/a&gt;&lt;/div&gt;


&lt;p&gt;In the process of making this mapping it became clear that there are herbaria that aren’t in GBIF, for example Singapore (SING) is not in GBIF but instead is hosted at Oxford University (!) at &lt;a href=&quot;https://herbaria.plants.ox.ac.uk/bol/sing&quot;&gt;https://herbaria.plants.ox.ac.uk/bol/sing&lt;/a&gt;. There seem to be a number of herbaria that have content in JSTOR but not GBIF, hence GBIF has gaps in its coverage of type specimens.&lt;/p&gt;
&lt;p&gt;Interestingly JSTOR rarely seems to be a destination for links. An exception is the Paris museum, for example specimen &lt;a href=&quot;https://science.mnhn.fr/institution/um/collection/mpu/item/mpu015018&quot;&gt;MPU015018&lt;/a&gt; has a link to the JSTOR record for the same specimen &lt;a href=&quot;http://plants.jstor.org/specimen/MPU015018&quot;&gt;MPU015018&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&quot;matching-taxonomic-names&quot;&gt;Matching taxonomic names&lt;/h2&gt;
&lt;p&gt;As a check on matching JSTOR to GBIF I would also check that the taxonomic names associated with the two records are the same. The challenge here is that the names may have changed. Ideally both JSTOR and GBIF would have either a history of name changes, or at least the original name the specimen was associated with (i.e., the name for which the specimen is the type). And of course, this isn’t the case. So I relied on a series of name comparisons, such as “are the names the same?”, “if names are different are the specific epithets the same?”, and “if the specific epithets are different are the generic names the same?”. Because the spelling of species names can change depending on the gender of the genus, I also used some stemming rules to catch names that were the same even if their ending was different.&lt;/p&gt;
&lt;p&gt;This approach will still miss some matches, such as hybrid names, and cases where a specimen is stored under a completely different name (e.g., the original name is a heterotypic synonym of a different name).&lt;/p&gt;
&lt;h2 id=&quot;mapping&quot;&gt;Mapping&lt;/h2&gt;
&lt;p&gt;The mapping made so far is available on GitHub &lt;a href=&quot;https://github.com/rdmpage/jstor-plant-specimens&quot;&gt;https://github.com/rdmpage/jstor-plant-specimens&lt;/a&gt; and Zenodo &lt;a href=&quot;https://doi.org/10.5281/zenodo.10044359&quot;&gt;https://doi.org/10.5281/zenodo.10044359&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;At the time of writing I have retrieved 1,354,861 records for type specimens from JSTOR, of which 903,945 (67%) have been matched to GBIF.&lt;/p&gt;
&lt;p&gt;This has been a sobering lesson in just how far we are from being able to treat specimens as citable things: we simply don’t have decent identifiers for them. JSTOR made a lot of progress, but that has been hampered by being behind a paywall, and the fact that many of these identifiers are being lost or mangled by the time they make their way into GBIF, which is arguably where most people get information on specimens.&lt;/p&gt;
&lt;p&gt;There’s an argument that it would be great to get JSTOR Global Plants into GBIF. It would certainly add a lot of extra images, and also provide a presence for a number of smaller herbaria that aren’t in GBIF. I think there’s also a case to be made for having a &lt;a href=&quot;https://www.gbif.org/hosted-portals&quot;&gt;GBIF hosted portal&lt;/a&gt; for plant type specimens, to help make these valuable objects more visible and discoverable.&lt;/p&gt;

&lt;p&gt;Below is a barchart of the top 50 herbaria ranked by number of type specimens in JSTOR, showing the numbers of specimens mapped to GBIF (red) and those not found (blue).&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh6JvmpGTkkKnfPAaaPaLwfyffVImyk_pN6QYeDHFGKMlC7RgWnxoK6oBtMkvOQF0DxO5f9eoUph3nlKwT0_BoJ-LG089ChLfI_1xWIoBgYFYR0CIAalTgArQLH1rBEecbnWwkDbpv4QKS-5vQL3dRRLqKmcL2TEZNga7yM-cPwdRXSC68MrKGY/s948/JSTOR%20type%20specimens%20in%20GBIF.png&quot; style=&quot;display: block; padding: 1em 0; text-align: center; &quot;&gt;&lt;img alt=&quot;&quot; border=&quot;0&quot; height=&quot;400&quot; data-original-height=&quot;948&quot; data-original-width=&quot;754&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh6JvmpGTkkKnfPAaaPaLwfyffVImyk_pN6QYeDHFGKMlC7RgWnxoK6oBtMkvOQF0DxO5f9eoUph3nlKwT0_BoJ-LG089ChLfI_1xWIoBgYFYR0CIAalTgArQLH1rBEecbnWwkDbpv4QKS-5vQL3dRRLqKmcL2TEZNga7yM-cPwdRXSC68MrKGY/s400/JSTOR%20type%20specimens%20in%20GBIF.png&quot;/&gt;&lt;/a&gt;&lt;/div&gt;

&lt;h2 id=&quot;reading&quot;&gt;Reading&lt;/h2&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;Boyle, B., Hopkins, N., Lu, Z. et al. The taxonomic name resolution service: an online tool for automated standardization of plant names. BMC Bioinformatics 14, 16 (2013). &lt;a href=&quot;https://doi.org/10.1186/1471-2105-14-16&quot;&gt;https://doi.org/10.1186/1471-2105-14-16&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;a href=&quot;https://cetaf.org/best-practices/cetaf-stable-identifiers-csi-2/&quot;&gt;CETAF Stable Identifiers (CSI)&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;a href=&quot;http://herbal.rbge.info/md.php?q=implementers&quot;&gt;CETAF Specimen URI Tester&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Holttum, R. E. (1950). The Zingiberaceae of the Malay Peninsula. Gardens’ Bulletin, Singapore, 13(1), 1-249. &lt;a href=&quot;https://biostor.org/reference/163926&quot;&gt;https://biostor.org/reference/163926&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Hyam, R.D., Drinkwater, R.E. &amp;amp; Harris, D.J. Stable citations for herbarium specimens on the internet: an illustration from a taxonomic revision of Duboscia (Malvaceae) Phytotaxa 73: 17–30 (2012). &lt;a href=&quot;https://doi.org/10.11646/phytotaxa.73.1.4&quot;&gt;https://doi.org/10.11646/phytotaxa.73.1.4&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Rees T (2014) Taxamatch, an Algorithm for Near (‘Fuzzy’) Matching of Scientific Names in Taxonomic Databases. PLoS ONE 9(9): e107510. &lt;a href=&quot;https://doi.org/10.1371/journal.pone.0107510&quot;&gt;https://doi.org/10.1371/journal.pone.0107510&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Ryan D (2018) Global Plants: A Model of International Collaboration. Biodiversity Information Science and Standards 2: e28233. &lt;a href=&quot;https://doi.org/10.3897/biss.2.28233&quot;&gt;https://doi.org/10.3897/biss.2.28233&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Ryan, D. (2013), THE GLOBAL PLANTS INITIATIVE CELEBRATES ITS ACHIEVEMENTS AND PLANS FOR THE FUTURE. Taxon, 62: 417-418. &lt;a href=&quot;https://doi.org/10.12705/622.26&quot;&gt;https://doi.org/10.12705/622.26&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;(2016), Global Plants Sustainability: The Past, The Present and The Future. Taxon, 65: 1465-1466. &lt;a href=&quot;https://doi.org/10.12705/656.38&quot;&gt;https://doi.org/10.12705/656.38&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Smith, G.F. and Figueiredo, E. (2013), Type specimens online: What is available, what is not, and how to proceed; Reflections based on an analysis of the images of type specimens of southern African Polygala (Polygalaceae) accessible on the worldwide web. Taxon, 62: 801-806. &lt;a href=&quot;https://doi.org/10.12705/624.5&quot;&gt;https://doi.org/10.12705/624.5&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Smith, R. M. (1986). New combinations in Etlingera Giseke (Zingiberaceae). Notes from the Royal Botanic Garden Edinburgh, 43(2), 243-254.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Anna Svensson; Global Plants and Digital Letters: Epistemological Implications of Digitising the Directors’ Correspondence at the Royal Botanic Gardens, Kew. Environmental Humanities 1 May 2015; 6 (1): 73–102. doi: &lt;a href=&quot;https://doi.org/10.1215/22011919-3615907&quot;&gt;https://doi.org/10.1215/22011919-3615907&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;Written with &lt;a href=&quot;https://stackedit.io/&quot;&gt;StackEdit&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

</content><link rel='edit' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/903668539491051583'/><link rel='self' type='application/atom+xml' href='https://www.blogger.com/feeds/16081779/posts/default/903668539491051583'/><link rel='alternate' type='text/html' href='https://iphylo.blogspot.com/2023/10/where-are-plant-type-specimens-mapping.html' title='Where are the plant type specimens? Mapping JSTOR Global Plants to GBIF'/><author><name>Roderic Page</name><uri>http://www.blogger.com/profile/00269598293846172649</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjaJx_5psYTAPfRLaomxACMxZ5Ant1peCHJixf0PeYP82k2uw9vOq4P2Q-HzSakV8q5aSmfqs0X_J_OJmnnhttknlwVYeEguQaxhk4fWi0lKvh_KzugoZWd_MjHmXcImkuMVbJPxXmwln6-Ah7iy4elXNGhfhmofrZbHrgcc3VC7E7lrVd05Utw/s72-c/Screenshot%202023-10-26%20at%2013.30.11.png" height="72" width="72"/></entry></feed>