<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/rss2full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" version="2.0">

<channel>
	<title>Institute for Modern Intelligence</title>
	
	<link>http://imintel.org/blog</link>
	<description>Developing the Science, Practice, and Governance of Modern Intelligence</description>
	<lastBuildDate>Fri, 18 Jun 2010 16:59:43 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.0</generator>
		<atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.feedburner.com/imintel" /><feedburner:info uri="imintel" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><item>
		<title>Definitions in Information Management</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/hEBMfaoeEHI/</link>
		<comments>http://imintel.org/blog/2010/06/18/definitions-in-information-management/#comments</comments>
		<pubDate>Fri, 18 Jun 2010 16:59:43 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://imintel.org/blog/?p=270</guid>
		<description><![CDATA[Dr M. Chisholm talks about his wonderful new book &#8220;Definitions in Information Management&#8221;]]></description>
			<content:encoded><![CDATA[<p><a href="http://www.askget.com/index.cfm">Dr M. Chisholm</a> talks about his wonderful new book &#8220;<a href="http://data-definition.com/index.cfm">Definitions in Information Management</a>&#8221;</p>
<p><object width="500" height="400"><param name="movie" value="http://www.youtube.com/v/FoigbjwEfNg&#038;fs=1"></param><param name="allowFullScreen" value="true"></param><param name="allowscriptaccess" value="always"></param><embed src="http://www.youtube.com/v/FoigbjwEfNg&#038;fs=1" type="application/x-shockwave-flash" width="500" height="400" allowscriptaccess="always" allowfullscreen="true"></embed></object></p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/hEBMfaoeEHI" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2010/06/18/definitions-in-information-management/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2010/06/18/definitions-in-information-management/</feedburner:origLink></item>
		<item>
		<title>Dr Yoakum-Stover to speak at 2010 National Association of Broadcasters Show</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/h-G6DvWdID4/</link>
		<comments>http://imintel.org/blog/2010/04/12/dr-yoakum-stover-to-speak-at-2010-national-association-of-broadcasters-show/#comments</comments>
		<pubDate>Mon, 12 Apr 2010 11:05:55 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[News]]></category>
		<category><![CDATA[press release]]></category>

		<guid isPermaLink="false">http://imintel.org/blog/?p=249</guid>
		<description><![CDATA[Tuesday April 13 Dr. Yoakum-Stover will be a panelist at the 2010 National Association of Broadcasters Show,  Military &#38; Government Summit, Trends in Mission Critical Video &#38; Data in Las Vegas, NV.  During her session, Trends in Infrastructure: Commercial vs. Military, Dr. Yoakum-Stover will present her work on Ultra-Large Scale unified data storage and processing [...]]]></description>
			<content:encoded><![CDATA[<p><a href="http://imintel.org/blog/wp-content/uploads/2010/03/IMG_4653.jpg"><img class="alignright size-medium wp-image-228" title="Suzanne Yoakum-Stover" src="http://imintel.org/blog/wp-content/uploads/2010/03/IMG_4653-300x200.jpg" alt="" width="300" height="200" /></a>Tuesday April 13 Dr. Yoakum-Stover will be a panelist at the 2010 National Association of Broadcasters Show,  Military &amp; Government Summit, Trends in Mission Critical Video &amp; Data in Las Vegas, NV.  During her session, Trends in Infrastructure: Commercial vs. Military, Dr. Yoakum-Stover will present her work on Ultra-Large Scale unified data storage and processing and describe an implementation using cloud computing technology developed for the Distributed Common Ground Systems &#8211; Army (DCGS-A) Program of Record.</p>
<p>For more information, please visit <a href="http://www.imintel.org/">www.imintel.org</a>.</p>
<p><strong>About the Institute for Modern Intelligence (IMI)</strong></p>
<p>The IMI, formed in early 2009,  is a non-profit research institute  committed to developing the science, practice, and governance of Modern  Intelligence.  IMI is embracing the evolution of the Intelligence  Community from its emerging theoretical foundations to the panoply of  innovative techniques and the implications for organization and  management.  Today, IMI primarily serves the U.S. Army and Intelligence  Community.</p>
<p><strong> </strong></p>
<p>Contacts<br />
info@imintel.org</p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/h-G6DvWdID4" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2010/04/12/dr-yoakum-stover-to-speak-at-2010-national-association-of-broadcasters-show/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2010/04/12/dr-yoakum-stover-to-speak-at-2010-national-association-of-broadcasters-show/</feedburner:origLink></item>
		<item>
		<title>Suzanne Yoakum-Stover Among Top 5 Information Managers of the Year</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/iUnJy5zSETM/</link>
		<comments>http://imintel.org/blog/2010/03/31/suzanne-yoakum-stover-among-5-managers-of-the-year/#comments</comments>
		<pubDate>Wed, 31 Mar 2010 19:17:35 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://imintel.org/blog/?p=226</guid>
		<description><![CDATA[Company: Institute for Modern Intelligence (IMI) Industry: Research and Development Location: Alexandria, VA, United States of America Alexandria, VA—The Institute for Modern Intelligence (IMI) announced today that Dr. Suzanne Yoakum-Stover, IMI Executive Director, was named as one of the top 5 information managers of the year by Information Management Magazine http://goo.gl/vRjn. Dr. Yoakum-Stover was recognized [...]]]></description>
			<content:encoded><![CDATA[<p><a href="http://imintel.org/blog/wp-content/uploads/2010/03/IMG_4653.jpg"><img class="size-medium wp-image-228 alignright" title="Suzanne Yoakum-Stover" src="http://imintel.org/blog/wp-content/uploads/2010/03/IMG_4653-300x200.jpg" alt="" width="180" height="120" /></a></p>
<p><strong>Company:</strong> Institute for Modern Intelligence (IMI)</p>
<p><strong>Industry:</strong> Research and Development</p>
<p><strong>Location:</strong> Alexandria, VA, United States of America</p>
<p><strong> </strong></p>
<p>Alexandria, VA—The Institute for Modern Intelligence (IMI) announced today that Dr. Suzanne Yoakum-Stover, IMI Executive Director, was named as one of the top 5 information managers of the year by Information Management Magazine <a href="http://goo.gl/vRjn">http://goo.gl/vRjn</a>.</p>
<p>Dr. Yoakum-Stover was recognized for her work creating a practical, Ultra-Large Scale (ULS) systems solution – a unified dataspace &#8211; for data storage, exploration, enrichment, and exploitation that accommodates the diversity of data, semantics, and perspectives without information loss or distortion. Through an innovation that provides a generic interface to all data, she aims to enable all manner of  processing to be put into production coherently within an internet-scale dataspace.  Building upon this foundation her mission is to execute a broad ULS systems research agenda with specific application to intelligence and ultimately inaugurate what she terms “the new discipline of Modern Intelligence” &#8211; the science and practice of intelligence at Ultra-Large Scale.</p>
<p>“I was absolutely delighted to be identified  as one of the top information managers of the year.  It is an honor that reflects the community’s recognition and growing acceptance of the IMI’s breakthrough approach for addressing the scale and diversity of data storage, processing, visualization, and management all within a unified, yet completely open dataspace.” &#8211; Dr. Yoakum-Stover.</p>
<p>For more information, please visit <a href="http://www.imintel.org/">www.imintel.org</a>.</p>
<p><strong>About the Institute for Modern Intelligence (IMI)</strong></p>
<p>The IMI, formed in early 2009,  is a non-profit research institute committed to developing the science, practice, and governance of Modern Intelligence.  IMI is embracing the evolution of the Intelligence Community from its emerging theoretical foundations to the panoply of innovative techniques and the implications for organization and management.  Today, IMI primarily serves the U.S. Army and Intelligence Community.</p>
<p><strong> </strong></p>
<p>Contacts<br />
info@imintel.org</p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/iUnJy5zSETM" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2010/03/31/suzanne-yoakum-stover-among-5-managers-of-the-year/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2010/03/31/suzanne-yoakum-stover-among-5-managers-of-the-year/</feedburner:origLink></item>
		<item>
		<title>Unified Architecture for Integrating Intelligence Data</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/V4efggOb4Kk/</link>
		<comments>http://imintel.org/blog/2010/02/11/unified-architecture-for-integrating-intelligence-data-2/#comments</comments>
		<pubDate>Thu, 11 Feb 2010 16:42:15 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[Publications]]></category>
		<category><![CDATA[White Papers]]></category>

		<guid isPermaLink="false">http://imintel.org/blog/?p=148</guid>
		<description><![CDATA[Unified Architecture for Integrating Intelligence Data View more presentations from andreweick.]]></description>
			<content:encoded><![CDATA[<div id="__ss_3123620" style="width: 425px; text-align: left;"><a style="font: 14px Helvetica,Arial,Sans-serif; display: block; margin: 12px 0 3px 0; text-decoration: underline;" title="Unified Architecture for Integrating Intelligence Data" href="http://www.slideshare.net/andreweick/unified-architecture-for-integrating-intelligence-data">Unified Architecture for Integrating Intelligence Data</a><object style="margin: 0px;" classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" width="425" height="355" codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="src" value="http://static.slidesharecdn.com/swf/ssplayer2.swf?doc=mit2008iqisactual-100210092924-phpapp01&amp;rel=0&amp;stripped_title=unified-architecture-for-integrating-intelligence-data" /><param name="allowfullscreen" value="true" /><embed style="margin: 0px;" type="application/x-shockwave-flash" width="425" height="355" src="http://static.slidesharecdn.com/swf/ssplayer2.swf?doc=mit2008iqisactual-100210092924-phpapp01&amp;rel=0&amp;stripped_title=unified-architecture-for-integrating-intelligence-data" allowscriptaccess="always" allowfullscreen="true"></embed></object></p>
<div style="font-size: 11px; font-family: tahoma,arial; height: 26px; padding-top: 2px;">View more <a style="text-decoration: underline;" href="http://www.slideshare.net/">presentations</a> from <a style="text-decoration: underline;" href="http://www.slideshare.net/andreweick">andreweick</a>.</div>
</div>
<img src="http://feeds.feedburner.com/~r/imintel/~4/V4efggOb4Kk" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2010/02/11/unified-architecture-for-integrating-intelligence-data-2/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2010/02/11/unified-architecture-for-integrating-intelligence-data-2/</feedburner:origLink></item>
		<item>
		<title>A Data Integration Framework with Full Spectrum Fusion Capabilities</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/-_tTyBA_O2M/</link>
		<comments>http://imintel.org/blog/2010/02/11/a-data-integration-framework-with-full-spectrum-fusion-capabilities/#comments</comments>
		<pubDate>Thu, 11 Feb 2010 15:07:13 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[Publications]]></category>
		<category><![CDATA[White Papers]]></category>

		<guid isPermaLink="false">http://imintel.org/blog/?p=144</guid>
		<description><![CDATA[Printable Copy A Data Integration Framework with Full Spectrum Fusion Capabilities August 2009 Suzanne Yoakum-Stover, Ph.D. Potomac Institute for Policy Studies, Senior Research Fellow US Army CERDEC I2WD, Information Exploitation Futures Lab, Lead Scientist Fort Monmouth, NJ Tatiana Malyuta, Ph.D. New York City College of Technology, Associate Professor Data Tactics Corp., Principal Database Architect Alexandria, [...]]]></description>
			<content:encoded><![CDATA[<p style="text-align: left; margin-left: 36pt;"><a href="http://imintel.org/blog/wp-content/uploads/2010/02/SC-07-PUB.pdf">Printable Copy</a></p>
<p style="text-align: center; margin-left: 36pt;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr1.png" alt="" /><span style="font-family: Arial; font-size: 16pt;"><strong><br />
</strong></span></p>
<p style="text-align: center; margin-left: 36pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 16pt;"><strong>A Data Integration Framework with Full Spectrum Fusion Capabilities<span style="color: red;"><br />
</span></strong></span></p>
<p style="text-align: center; margin-left: 36pt;">
<p style="text-align: center; margin-left: 36pt;">
<p style="text-align: center; margin-left: 36pt;"><span style="font-family: Times New Roman;">August 2009<br />
</span></p>
<p style="text-align: center; margin-left: 36pt;">
<div>
<table style="border-collapse: collapse;" border="0">
<colgroup>
<col style="width: 206px;"></col>
<col style="width: 206px;"></col>
<col style="width: 206px;"></col>
<col style="width: 206px;"></col>
<col style="width: 191px;"></col>
<col style="width: 205px;"></col>
</colgroup>
<tbody>
<tr>
<td style="padding-left: 1px; padding-right: 1px;">
<p style="text-align: center; margin-left: 4pt;">Suzanne Yoakum-Stover, Ph.D.</p>
<p style="text-align: center; margin-left: 4pt;">
<p style="text-align: center; margin-left: 4pt;">Potomac Institute for Policy Studies, Senior Research Fellow</p>
<p style="text-align: center; margin-left: 4pt;">US Army CERDEC I2WD, Information Exploitation Futures Lab, Lead Scientist</p>
<p style="text-align: center; margin-left: 4pt;">Fort Monmouth, NJ</p>
</td>
<td style="padding-left: 1px; padding-right: 1px;">
<p style="text-align: center; margin-left: 2pt;">Tatiana Malyuta, Ph.D.</p>
<p style="text-align: center; margin-left: 2pt;">
<p style="text-align: center; margin-left: 2pt;">New York City College of Technology, Associate Professor</p>
<p style="text-align: center; margin-left: 2pt;">Data Tactics Corp., Principal Database Architect</p>
<p style="text-align: center; margin-left: 2pt;">Alexandria, VA</p>
</td>
<td style="padding-left: 1px; padding-right: 1px;">
<p style="text-align: center; margin-left: 1pt;">Norbert Antunes</p>
<p style="text-align: center; margin-left: 1pt;">
<p style="text-align: center; margin-left: 1pt;">US Army CERDEC I2WD,</p>
<p style="text-align: center; margin-left: 10pt;">Fusion and Modeling Division,</p>
<p style="text-align: center; margin-left: 1pt;">Computer Engineer</p>
<p style="text-align: center; margin-left: 1pt;">Aberdeen Proving Grounds, MD</p>
</td>
<td style="padding-left: 1px; padding-right: 1px;"></td>
<td style="padding-left: 1px; padding-right: 1px;"></td>
<td style="padding-left: 1px; padding-right: 1px;"></td>
</tr>
</tbody>
</table>
</div>
<p style="text-align: center; margin-left: 36pt;"><span style="font-family: Arial; font-size: 12pt;"><strong>ABSTRACT<br />
</strong></span></p>
<p style="text-align: justify;">One of the most challenging problems in intelligence gathering and processing is resolving the issues of syntactic and semantic interoperability of numerous intelligence sources and fusing the intelligence data, information, and knowledge to provide for efficient, accurate and comprehensive analysis.  The key features of a successful fusion solution are: 1) The ability to rapidly and seamlessly integrate any source while preserving its original data and semantics; 2) Support for powerful data processing capabilities that can utilize the data and semantics of the integrated sources without limitations. Existing data integration approaches require heavy pre-integration processing (schema harmonization and data normalization) and usually entail loss/distortion of original data and semantics. Processing of the integrated data is defined by, and therefore limited by, the integrating schema – discovering data relationships and fusing data beyond the integrating schema is impossible.</p>
<p style="text-align: justify;">We present a unified data and information integration framework that presents absolutely minimal barriers to incorporating new data and semantics into the integrated system (e.g. no heavy pre-processing or data / data-model conditioning), and embraces the full spectrum of data sources, types, models, and modalities (e.g. text, images, audio, signals).</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The approach enables rapid integration of ad-hoc data and data-semantics and results in a multi-layered data store that we call a Unified Data Space (UDS). The UDS supports data fusion to yield information and knowledge while imposing no restrictions on what data must be or how it is to be used, and the diversity of processing by which structural and semantic barriers are overcome.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">In this paper we concentrate on the benefits of the approach for data fusion.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong>Key words</strong>: Data Description Framework (DDF), data fusion, data integration, semantic enrichment, structured data.</p>
<h1><span style="font-size: 12pt;">Introduction <span style="color: red;"><br />
</span></span></h1>
<p style="text-align: justify;">One of the most challenging problems in intelligence gathering and processing is resolving the issues of syntactic and semantic interoperability of numerous intelligence sources and fusing the intelligence data, information, and knowledge to provide for efficient, accurate and comprehensive analysis.  The key features of a successful fusion solution are: 1) The ability to rapidly and seamlessly integrate any source while preserving its original data and semantics; 2) Support for powerful data processing capabilities that can utilize the data and semantics of the integrated sources without limitations. Existing data integration approaches require heavy pre-integration processing (schema harmonization and data normalization) and usually entail loss/distortion of original data and semantics. Processing of the integrated data is defined by, and therefore limited by, the integrating schema – discovering data relationships and fusing data beyond the integrating schema is impossible [1 – 3].</p>
<p style="text-align: justify;">We present a unified data and information integration framework that:</p>
<ul>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Presents absolutely minimal barriers to incorporating new data and semantics into the integrated system (e.g. no heavy pre-processing or data / data-model conditioning)<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Embraces the full spectrum of data sources, types, models, and modalities (e.g. text, images, audio, signals)<br />
</span></div>
</li>
</ul>
<p style="text-align: justify;">A detailed description of the architecture and the philosophy of the approach can be found in [4 – 7]. In this paper we concentrate on the benefits of the approach for data fusion:</p>
<ul>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">The UDS permits &#8220;horizontal&#8221; data fusion spanning across data from all sources.  It can accommodate any number of integration models, without imposing physical or semantic barriers, while facilitating navigation, search, and exploration of the integrated data store. As a result, relationships between data from multiple disparate sources that are difficult or impossible to foresee (and therefore impossible to look for) are revealed.<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">The multi-layered UDS permits the &#8220;vertical&#8221; fusion of data and data models, allowing one to backtrack to sources on one hand and connect to knowledge models on the other.  It also supports the processing by which data is cultivated to produce information, knowledge, and ultimately understanding.  Thus, the UDS represents a viable integrated solution that &#8220;matures&#8221; with time.<br />
</span></div>
<p style="margin-left: 10pt;">
<h1><span style="font-size: 12pt;">Challenge of Data Integration<br />
</span></h1>
</li>
</ul>
<p style="text-align: justify;">To be viable within an Ultra-Large Scale (ULS) systems [8] environment consisting of a freely evolving, interdependent collective of human and computational systems, very little of which will ever be under our control, any approach to data and semantic fusion must:</p>
<ul>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Present absolutely minimal barriers to incorporating new data and semantics into the integrated system (e.g. no heavy pre-processing or data / data-model conditioning)<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Embrace the full spectrum of data sources, types, models, and modalities (e.g. text, images, audio, signals)<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Impose no restrictions on what data must be or how it is to be used<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Support the diversity of processing by which structural and semantic barriers are overcome to yield information and knowledge<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Allow data, information, and knowledge to be re-used according to diverse perspectives<br />
</span></div>
</li>
</ul>
<p style="text-align: justify;">
<p style="text-align: justify;">To our knowledge, no traditional approach to data integration, physical or virtual, has all of these characteristics [1, 2, 9, 10, 12].  Consequently, traditional approaches fail to provide viable solutions in the &#8220;wild&#8221;, i.e. for ULS environments that are characterized by decentralization; inherently conflicting, diverse, and  unknowable requirements; heterogeneous, changing, and inconsistent elements; normal failures; continuous operation, evolution, and deployment; and immense scale along many dimensions.</p>
<p style="text-align: justify;">To productively address the challenges of ULS systems environments, we developed the Data Integration and Semantic Enrichment Platform (DISEP) and the Data Description Framework (DDF) based on the following principles:</p>
<p style="text-align: justify;">To enable true data integration and unencumbered semantic enrichment,</p>
<ul>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Data must be perceived objectively, independent of intended use<br />
</span></div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman;">Domain data-models must be considered from a higher level of abstraction<br />
</span></div>
</li>
</ul>
<p style="text-align: justify;">
<p style="text-align: justify;"><span style="font-family: Times New Roman;">In our approach, these principles are reflected in two essential decouplings:  The decoupling of data from domain data-models, and the decoupling of domain data-models from the model of the integrated data store.  As a result, the DISEP is able to serve as a universal platform for data integration and semantic enrichment, and within it, the DDF serves as a universal store for structured data.<br />
</span></p>
<p style="text-align: justify;"><span id="more-144"></span></p>
<h1><span style="font-size: 12pt;">Data Description Framework<br />
</span></h1>
<p style="text-align: justify;">The DDF supports semantic data integration by establishing a domain-neutral unified store for structured data.  To achieve this, we consider structure, vocabulary, semantics, and constraints from a higher level of abstraction from which we then distill a minimal set of elements sufficient to capture any data-model.  These are illustrated in Figure 1 and defined as follows:</p>
<p style="text-align: justify;"><strong><em>Sign: </em></strong>A <em>sign</em><br />
<span style="font-family: Arial;">g<sub>i</sub></span> represents a chunk of data, either physically located within a tangible artifact, or contained within an analyst&#8217;s mind.  Examples of the former include a string of text in a document; an area of pixels within an image; a segment of an audio stream or other signal.  As illustrated in Fig. 1, for tangible artifacts, regardless of the type of medium, signs are always associated with a physical extent or quantifiable span within the artifact, which we call a <em>mention</em>.  The set of all signs, <span style="font-family: Arial;">G = {g<sub>i</sub>}</span>, spans across all data sources.  In the set, each element is unique: <span style="font-family: MS Mincho;">∀</span><span style="font-family: Arial;">i,j  (i ≠ j) g<sub>i</sub> ≠ g<sub>j</sub></span>.  <span style="font-family: Arial;">G</span> is the construct by which the DDF represents data.  From the text data shown in Fig. 1, signs <span style="font-family: Arial;">G<span style="font-size: 10pt;">&#8217; </span>= {&#8216;Suzi&#8217;, &#8216;Tanya&#8217;, &#8216;July 4, 2007&#8217;, &#8216;Bring lunch&#8217;, &#8216;Message1&#8217;} </span> contribute to <span style="font-family: Arial;">G</span> (i.e. <span style="font-family: Arial;">G&#8217;<span style="font-size: 10pt;"><br />
</span></span><span style="font-family: MS Mincho;">⊆</span><span style="font-family: Arial;"> G</span>), though many more signs may be identified even from this simple example.</p>
<p style="text-align: justify;"><strong><em>Concept:</em><br />
</strong> A <em>concept</em><br />
<span style="font-family: Arial;">c<sub>i</sub><br />
</span>is an abstract idea, defined either explicitly or implicitly by a source data-model.  For example, the nodes of an ontology, the tag set in an XML Schema Document (XSD), and the attribute / table names in a relational database all represent concepts.  In the set of all concepts <span style="font-family: Arial;">C = {c<sub>i</sub>}</span>, each element is unique: <span style="font-family: MS Mincho;">∀</span><span style="font-family: Arial;">i,j  (i ≠ j) c<sub>i</sub> ≠ c<sub>j</sub></span>.  From the text data shown in Fig. 1, concepts <span style="font-family: Arial;">C&#8217; = {&#8216;Message&#8217;, &#8216;Person&#8217;, &#8216;Body_text&#8217;}</span> contribute to the full set of concepts <span style="font-family: Arial;">C </span>(i.e. <span style="font-family: Arial;">C&#8217; </span><span style="font-family: MS Mincho;">⊆</span><span style="font-family: Arial;"> C</span>).</p>
<p style="text-align: justify;"><strong><em>Predicate:</em></strong> A <em>predicate</em> p<sub>i</sub> is an abstract idea used to express a relationship between &#8220;things.&#8221; Predicates are used in the formation of <em>statements </em>(described below) and may be defined either explicitly or implicitly by a source data-model.  For example, the arcs of an ontology, and the attributes of an XML or database schema represent <em>predicates</em>.   In the set of all predicates <span style="font-family: Arial;">P = {p<sub>i</sub>}</span>, each element is unique: <span style="font-family: MS Mincho;">∀</span><span style="font-family: Arial;">i,j  (i ≠ j) p<sub>i</sub> ≠ p<sub>j</sub></span>.  The text example of Fig. 1 contributes predicates <span style="font-family: Arial;">P&#8217; = {&#8216;To&#8217;, &#8216;From&#8217;, &#8216;Body&#8217;}<span style="font-size: 10pt;"><br />
</span></span>to the set of all predicates<span style="font-family: Arial;"><span style="font-size: 10pt;"><br />
</span>P<span style="font-size: 10pt;"><br />
</span></span>(i.e. <span style="font-family: Arial;">P&#8217; </span><span style="font-family: MS Mincho;">⊆</span><span style="font-family: Arial;"> P</span>).  The only predicate that is &#8220;built into&#8221; (i.e. defined by) our storage model is the <span style="font-family: Arial;"><span style="font-size: 10pt;">&#8216;</span>IsInstanceOf<span style="font-size: 10pt;">&#8217;</span></span> predicate<em>, </em>which is used to disambiguate <em>signs</em> to form <em>terms</em> as described below.  Concepts and predicates are the constructs by which we link to data-models and, thereby, explicitly expose data-semantics.</p>
<p style="text-align: justify;"><strong><em>Term: </em></strong> A <em>term</em><br />
<span style="font-family: Arial;">t<sub>i</sub></span> is an ordered pair <span style="font-family: Arial;">&lt;g<sub>i</sub>,c<sub>j</sub>&gt;</span> where <span style="font-family: Arial;">g<sub>i</sub><br />
</span><span style="font-family: MS Mincho;">∈</span><span style="font-family: Arial;"> G<span style="font-size: 10pt;"><br />
</span></span>and <span style="font-family: Arial;">c<sub>j</sub><br />
</span><span style="font-family: MS Mincho;">∈</span><span style="font-family: Arial;"> C<span style="font-size: 10pt;">.</span></span> Each term represents a disambiguated <em>sign</em>.  The process of disambiguation associates a <em>sign</em> with a <em>concept</em> using the <span style="font-family: Arial;">&#8216;IsInstanceOf&#8217;</span><br />
<em>predicate</em> (though not every sign from <span style="font-family: Arial;">G</span> is necessarily disambiguated, and not every concept from <span style="font-family: Arial;">C</span> is necessarily used for disambiguation).  In the set of all terms <span style="font-family: Arial;">T = {t<sub>ij</sub>}</span>, each element is unique:  <span style="font-family: Courier New; font-size: 10pt;">∀</span><span style="font-family: Arial;"><span style="font-size: 10pt;"><br />
</span>i,j,k,l (i ≠ k or j ≠ l) t<sub>ij</sub> ≠ t<sub>kl</sub></span>.  The text example of Fig. 1 contributes terms <span style="font-family: Arial;">T&#8217; = {t<sub>1</sub>, t<sub>2</sub>, t<sub>3</sub>, t<sub>4</sub>}</span><br />
<img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr2.png" alt="" />where <span style="font-family: Arial;">t<sub>1</sub> = &lt;&#8217;Suzi&#8217;, person&gt;, t<sub>2</sub> = &lt;&#8217;Tanya&#8217;, person&gt;,  t<sub>3</sub> = &lt;&#8217;Bring lunch&#8217;, Body_text&gt;, t<sub>4</sub> = &lt;Message1, message&gt;<span style="font-size: 10pt;"><br />
</span></span>to the complete set of terms<span style="font-family: Arial;"><span style="font-size: 10pt;"><br />
</span>T<span style="font-size: 10pt;"> (i.e. </span>T&#8217; </span>⊆<span style="font-family: Arial;"> T<span style="font-size: 10pt;">).<br />
</span></span></p>
<p style="text-align: justify;"><strong><em>Statement: </em></strong>A <em>statement</em>, <span style="font-family: Arial;">s<sub>i</sub></span>, encodes a binary relationship between a subject and an object mediated by a predicate<em>. </em> A statement is represented by an ordered triple <span style="font-family: Arial;">s<sub>ijh</sub> = &lt;subject<sub>i</sub>, predicate<sub>j</sub>, object<sub>h</sub>&gt;</span>.  Among the set of all statements, each element is unique: <span style="font-family: Courier New; font-size: 10pt;">∀</span><span style="font-family: Arial;"><span style="font-size: 10pt;"><br />
</span>i,j,h,l,m,n (i ≠ l or j ≠ m or h ≠ n) s<sub>ijh</sub> ≠ s<sub>lmn</sub></span>.  In our model, subject and object may be either a <em>term</em> or <em>statement</em>.  The simplest kind of <em>statement</em> is one in which subject and object are <em>terms</em>: <span style="font-family: Arial;">s0<sub>ijh</sub> = &lt;t<sub>i</sub>, p<sub>j</sub>, t<sub>h</sub>&gt;</span>.  <em>Statements</em> in which the object is itself another <em>statement</em> represent reifications: <span style="font-family: Arial;">s1<sub>klm</sub> = &lt;t<sub>k</sub>, p<sub>l</sub>, s<sub>m</sub>&gt;.</span> Finally, a <em>statement</em> in which both subject and object are other <em>statements</em> represents a relationship between <em>statements</em>: <span style="font-family: Arial;">s2<sub>xyz</sub> = &lt;s<sub>x</sub>, p<sub>y</sub>, s<sub>z</sub>&gt;</span>.  The set of all statements is <span style="font-family: Arial;">S = {s0<sub>ijh</sub>} ∪ {s1<sub>klm</sub>} ∪ {s2<sub>xyz</sub>}</span>.  The text example of Fig. 1 shows three <em>statements</em>: <span style="font-family: Arial;">S&#8217; = {&lt;t<sub>4</sub>, to, t<sub>1</sub>&gt;, &lt;t<sub>4</sub>, from, t<sub>2</sub>&gt;, &lt;t<sub>4</sub>, body, t<sub>3</sub>&gt;}</span> all with the same subject, which is the <em>term</em> corresponding to the message itself.  These statements contribute to the set of all statements, i.e. <span style="font-family: Arial;">S&#8217; </span>⊆<span style="font-family: Arial;"> S<span style="font-size: 10pt;">.</span></span></p>
<p style="text-align: justify;">These elementary constructs (sign, concept, predicate, term, and statement) define a data reference model, which we call the Data Description Framework (DDF) [13].  Because it effectively decouples data from data-models, it can encapsulate any sort of data-model.  Because it binds knowledge to data, it enables deep data integration and semantic enrichment.  By using the DDF as the basis for implementing a stable storage-model, we are able to build a practical data integration platform on commodity database infrastructure.</p>
<p style="text-align: justify;">The reader familiar with the Resource Description Framework (RDF/RDFS) may wonder what is different here. Indeed, RDF and DDF share DNA, so to speak, since both employ a similar level of abstraction and expose semantics. Unlike RDF however, DDF also prescribes the exposure of data as signs which can freely participate in the disambiguations and associations necessary for data integration. In contrast, a datum represented as an RDF literal cannot be explicitly disambiguated or associated. Also, in contrast to DDF signs, which provide a primal level of data integration (to be described below), there is no mechanism in RDF to prevent a single datum from being represented by multiple URIs.  This is not a criticism of RDF as these differences reflect the fact that RDF is a meta-model not specifically aimed at data integration. Thus, employing RDF for data integration necessitates building a particular metamodel instance (i.e. a model) in RDF along with rules prescribing the manner of data exposure [3]. In contrast, DDF is a model that makes explicit commitments to support data integration. Because this model represents an abstraction over domain data-models, the DDF can represent data structured by any data-model, and be represented in any metamodel (including RDF).</p>
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr3.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr4.png" alt="" />As illustrated in Figure 2, the DDF forms a layer of data and semantics (Layer 2) in the DISEP that lies between the indigenous source systems (Layer 1) and their data/knowledge models (Layer 3). A more detailed description of the layered architecture is presented in [13].   Layer 1 feeds the layers above, and Layers 2 and 3 interact:  Layer 3 provides the semantic context for Layer 2, and Layer 2 participates in the formation of an overarching knowledge model in Layer 3.   Together Layers 2 and 3 form what we call the Unified Data Space (UDS).</p>
<h1><span style="font-size: 12pt;">Demonstration of Fusing Potential<br />
</span></h1>
<p style="text-align: justify;">The DDF fusing potential is demonstrated in Fig. 3:</p>
<ul>
<li>
<div style="text-align: justify;">The UDS integrated (Fig. 4 and Fig. 5) selected data from Freebase [16] (structured data), GovTrack [17] (semi-structured data), and the images and articles from Wikipedia [18, 19] (unstructured data).</div>
</li>
<li>
<div style="text-align: justify;">The UDS absorbed the diversity of the data types and modalities: audio, images, unstructured text, and data from the relational database.</div>
</li>
<li>
<div style="text-align: justify;">Accommodating data from the structured and semi-structured stores was performed in an automated fashion with little to no up-front preprocessing that is usually needed in traditional integration approaches.</div>
</li>
<li>
<div style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr5.png" alt="" />The unstructured text data and images were represented with the help of DDF constructs via manual or automated entity extraction and face-recognition processes.</div>
</li>
</ul>
<p style="text-align: justify;">
<p style="text-align: justify; margin-left: 2pt;">
<p style="text-align: justify; margin-left: 2pt;">
<p style="text-align: justify; margin-left: 2pt;">
<p style="text-align: justify; margin-left: 2pt;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr6.png" alt="" />Figures 4 and 5 illustrate the process of &#8220;DDFying&#8221; the data sources from Fig. 3: extracting and disambiguating data, and representing disambiguated data as terms (Fig. 4); associating disambiguated data as statements (Fig. 5).</p>
<p style="text-align: justify; margin-left: 2pt;">
<p style="text-align: justify; margin-left: 2pt;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr7.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr8.png" alt="" /><span style="background-color: silver;"><br />
</span></p>
<h2>Horizontal Fusion Across Sources</h2>
<p style="text-align: justify;">Utilizing data from multiple sources is complicated by their original disconnect &#8211; each data store is built on some data scope and on a specific data-model, and over time, neither the scope of data nor the data-model change much. A traditional integration approach, even a successful one, results in yet another store with its own data and data-models, with the limited ability to evolve &#8211; often another candidate for future integration efforts. Significant investment into the integration project may result in the &#8220;rise&#8221; of a new data store but cannot prevent its &#8220;fall.&#8221; Fig. 6a) demonstrates a progression of integrated stores &#8211; isolated silos of data disconnected from each other and from the data sources.</p>
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr9.png" alt="" />The Unified Data Space that is built and cultivated on the DDF model allows any data source to be incorporated without sacrificing the source&#8217;s original data and data-model &#8211; DDF unlocks the data source to virtually endless enrichment by different cross-source disambiguations and associations. The DDF UDS is permanently and invariably open to new data sources, and is constantly evolving with its data and data-model expanding. Each participating data store contributes to the expressiveness of the whole Data Space, and in turn benefits from being fused with other data stores by gaining richer semantic and data contents (Fig. 6b).</p>
<p style="text-align: justify;">Using the &#8220;fusion&#8221; metaphor &#8211; a traditional integration approach produces a finished product &#8211; an alloy, while the DDF approach produces a melting pot.</p>
<p style="text-align: justify;">The evolving DDF Data Space is representative of the sources and the results of cross-source enrichment.  It also takes data utilization and exploitation to a new level and supports navigation, exploration, and querying, without limits.</p>
<p style="text-align: justify;">In the subsequent text, we represent signs, concepts, and predicates using <span style="font-family: Arial;">Arial</span> font.  Terms are denoted as<span style="font-family: Arial;"><span style="font-size: 10pt;"><br />
</span>[sign, concept] <span style="font-size: 10pt;">(</span></span>e.g.<span style="font-family: Arial;"><span style="font-size: 10pt;"><br />
</span>[Adam, Chemist]<span style="font-size: 10pt;">) </span></span>and statements are denoted using an intuitive triple representation, e.g. <span style="font-family: Arial;">[Adam, Chemist] hasInventoryID [1001,InventoryID]<span style="font-size: 10pt;">.</span></span><br />
<strong><br />
</strong></p>
<p style="text-align: justify;">Fig. 7 demonstrates these capabilities for the following scenario:</p>
<ul>
<li>
<div style="text-align: justify;">
A user (or a process) performed a search for the text (i.e. sign) &#8216;Bush&#8217; and retrieved a number of terms ([<span style="font-family: Arial;">George Walker Bush, Name</span>] from Wikipedia, [<span style="font-family: Arial;">George W. Bush, President</span>] from Freebase, …) visualized as the nodes.
</div>
</li>
<li>
<div style="text-align: justify;">The user asserts that [<span style="font-family: Arial;">George Walker Bush, Name</span>] from Wikipedia is the same as [<span style="font-family: Arial;">George W. Bush, President</span>] from Freebase &#8211; a new statement [<span style="font-family: Arial;">George Walker Bush, Name</span>] <span style="font-family: Arial;">sameAs</span> [<span style="font-family: Arial;">George W. Bush, President</span>] is created in the UDS and is shown as an arc connecting the two nodes. The figure shows the results of such assertions. The dashed ellipses show how the multiple cross-source assertions fuse data from different sources.</div>
</li>
<li>
<div style="text-align: justify;">Furthermore, the fused data allows us to break the barriers between sources and see the cross-source relationships between data as if they came from the same source. For example, [<span style="font-family: Arial;">George Walker Bush, Name</span>] from Wikipedia (ellipse 2) will be associated with [<span style="font-family: Arial;">Dick Cheney, Vice_President</span>] from Freebase (ellipse 1).</div>
</li>
<li>
<div style="text-align: justify;">The user can semantically enrich the UDS by introducing new DDF constructs, such as signs, terms, and statements. For example,  the user asserts that [<span style="font-family: Arial;">BUSH George Walker, Face</span>] from one Wikipedia article is associated with [<span style="font-family: Arial;">Bush George Herbert Walker, Face</span>] from another Wikipedia article via predicate <span style="font-family: Arial;">hasFather</span> (green arc).</div>
</li>
<li>
<div style="text-align: justify;">Following the original and asserted relationships between data the user can navigate across the sources, e.g. from ellipse 1 to ellipse 4 in the figure.</div>
<p style="margin-left: 18pt;">
</li>
</ul>
<p><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr10.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr11.png" alt="" /></p>
<h2>Vertical Fusion Across Data and Knowledge</h2>
<p style="text-align: justify;">The DDF defines the organization of the Layer 2 of DISEP (Fig. 2) and establishes connections with its other layers &#8211; the DISEP provides for a contiguous multi-layer store: from sources in the wild, to the structured data store, and to the knowledge representation.</p>
<p style="text-align: justify;">Data is taken from Layer 1, tied with the semantic elements from Layer 3 to get the structured data store, on which we can perform efficient data processing and enrichment. However, in DDF we may lose the rich data context of the data element from Layer 1, so when we need to perform data analysis in the original data context, we can go back to Layer 1. We also may lose the rich semantic context for the semantic element, so when we need to perform model analysis, we can go to Layer 3. Fig. 8 demonstrates the contiguousness of the UDS and the vertical enrichment processes.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr12.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr13.png" alt="" /></p>
<p style="text-align: justify;">Fig. 9 illustrates these enrichment processes on our example. After [<span style="font-family: Arial;">George Walker Bush, Name</span>] from Wikipedia is fused with [<span style="font-family: Arial;">George W. Bush, President</span>] from Freebase, a user (a process) can see in Layer 3 that the concept <span style="font-family: Arial;">President</span> is associated with the concept <span style="font-family: Arial;">Party</span> and he returns to the Wikipedia source to find out that George Walker Bush is a Republican. A new term [<span style="font-family: Arial;">Republican, Party</span>] and a new statement [<span style="font-family: Arial;">George Walker Bush, Name</span>] <span style="font-family: Arial;">associatedWith</span> [<span style="font-family: Arial;">Republican, Party</span>] are created in the Layer 2, enriching the integrated data store that supports exploration, navigation and querying.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The enrichment operations preserve the contiguousness of the UDS: from Layer 1 we can &#8220;see&#8221; how a particular data element is associated with any abstraction of Layer 3, and from Layer 3 we can &#8220;see&#8221; how any abstract idea is associated with data. We truly break the barriers not only between the sources, but also between the data and knowledge worlds.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr14.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1506_ADataIntegr15.png" alt="" /></p>
<h1><span style="font-size: 12pt;">Summary <span style="color: red;"><br />
</span></span></h1>
<p style="text-align: justify;">It is acknowledged that there is no silver bullet for solving the problem of data integration [7], and that all integration approaches face deep challenges associated with scale, performance, query processing, data conditioning / pre-processing, semantic enrichment, viability, and sustainability.  The DISEP and DDF serve to address these challenges as follows:</p>
<p style="text-align: justify;"><strong><em>Scalability</em></strong>. The challenge of scale is common to most integrated stores.  The &#8220;lossless&#8221; data representation of the DDF slightly exacerbates this problem because it generally requires several times more storage space than in the original source.  Fortunately, distributed database technologies, and cloud computing infrastructure in particular, provide viable means to manage this challenge.</p>
<p style="text-align: justify;"><strong><em>Query Processing and Semantic Exploration</em></strong>. The DDF enables semantic navigation over the DDF Data Space by the action of a series of questions that &#8220;surf&#8221; across the entire DDF Data Space unimpeded by barriers between source systems. We distinguish two types of navigation and data retrieval on the DDF Data Space:</p>
<p style="text-align: justify;"><em>Exploration</em>. A user navigates the Data Space having limited or no knowledge of the sources and their models. Navigation is data-driven; the result of the user&#8217;s initial query is used to generate the next query.</p>
<p style="text-align: justify;"><em>Querying</em> is similar to querying of traditional data sources when a user formulates a request assuming knowledge of a  data-model. Querying is model-driven; each query is independent from others.</p>
<p style="text-align: justify;">A very important consequence of the unified representation of data and data-semantics in DDF is the unified structure of the queries. This allows querying patterns to be defined and processing optimized.  As a result, the ad-hoc querying and exploration of the data store can be performed naturally [15]. Ad-hoc querying of traditional integrated stores has serious performance issues and is constrained by the integration model. Moreover, some queries, such as search over data values, which are natural in the DDF, cannot be implemented in a practical way using other integration solutions.</p>
<p style="text-align: justify;"><strong><em>Semantic enrichment</em></strong>. The scope of semantic enrichment achievable by the DDF is defined by the integration actions that it supports: incorporating new data and data-models, disambiguating data, and building associations between disambiguated data. Because traditional data integration approaches support only some aspects of data and data-model enhancement, their semantic enrichment power is severely limited. In addition, the DDF implementation supports extensive metadata [14] (not discussed in this paper), which may provide additional semantic enrichment by, for example, capturing information quality. Metadata also enable more sophisticated information retrieval processes.</p>
<p style="text-align: justify;"><strong><em>Viability</em></strong>. One of the most important advantages of the DDF over other integration solutions is that it meets the challenge of the viability of an integration solution.  The DDF accommodates new data and semantics and allows for virtually endless semantic enhancement through new data disambiguations (i.e. term formation) and new semantic associations (i.e. statement formation).</p>
<p style="text-align: justify;"><strong><em>Sustainability</em></strong>. An integration solution must &#8220;…offer some services immediately without any setup time, and improve the services as more investment is made into creating semantic relationships.&#8221; [7]. The DDF actually achieves this:  Data sources can be integrated in the DDF without heavy preprocessing or data-model harmonization. The Data Space can be explored, and semantic relationships discovered, without a-priori understanding of source data-models. Additional refinement and enrichment of the Data Space serves to increase the effectiveness of Data Space services.</p>
<h1><span style="font-size: 12pt;">Implementation: General Overview<br />
</span></h1>
<p style="text-align: justify;">A DDF data store can be implemented in a variety of ways (e.g. objects, relations, triples).  We have a prototype in Oracle 10g and there is an ongoing effort of implementing  DDF on the Cloud.  The following table gives an overview of the DDF architecture on the Cloud.</p>
<p style="text-align: center;"><span style="font-size: 10pt;"><strong><span style="font-family: Times New Roman;">Table 1.  Architecture of the DDF Integration Solution</span><span style="color: red; font-family: Times New Roman Bold;"><br />
</span></strong></span></p>
<div>
<table style="border-collapse: collapse;" border="0">
<colgroup>
<col style="width: 60px;"></col>
<col style="width: 114px;"></col>
<col style="width: 138px;"></col>
<col style="width: 168px;"></col>
<col style="width: 145px;"></col>
</colgroup>
<tbody>
<tr style="height: 20px;">
<td style="padding-left: 7px; padding-right: 7px; border: solid black 0.5pt;" rowspan="2">
<p style="text-align: center;"><strong>Layer</strong></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid black 0.5pt; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;" rowspan="2">
<p style="text-align: center;"><span style="font-size: 10pt;"><strong>Description<br />
</strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid black 0.5pt; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;" rowspan="2">
<p style="text-align: center;"><strong>Content</strong></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid black 0.5pt; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;" colspan="2">
<p style="text-align: center;"><strong>Content Origin</strong></p>
</td>
</tr>
<tr style="height: 20px;">
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid black 0.5pt; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">
<p style="text-align: center;"><strong>System/Organization</strong></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">
<p style="text-align: center;"><strong>Process</strong></p>
</td>
</tr>
<tr style="height: 76px;">
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid black 0.5pt; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">
<p style="text-align: center; margin-left: 5pt;">3 &#8211; Model Description Framework (MDF)</p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Universal store</p>
<p>for data / knowledge models</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Model elements and their relationships</p>
<p>(e.g. concept, predicate, super/sub class, part-of, property, &#8230;)</p>
<p>HBase</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Any Information System</p>
<ul>
<li>Relational schemas</li>
<li>Taxonomies</li>
<li>RDF ontologies</li>
<li>&#8230;</li>
</ul>
<p>Any project/organization</p>
<ul>
<li>Cyc project</li>
<li>DARPA, Gov&#8217;t labs, DoD, IC, ..</li>
<li>Standards bodies</li>
<li>&#8230;</li>
</ul>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Any process</p>
<ul>
<li>Data model harmonization</li>
<li>Data model integration</li>
<li>Data model management</li>
<li>Data model enrichment</li>
<li>&#8230;</li>
</ul>
</td>
</tr>
<tr style="height: 76px;">
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid black 0.5pt; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">
<p style="margin-left: 5pt;">2 &#8211; Data Description Framework (DDF)</p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Universal store</p>
<p>for structured data</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Data elements, semantic elements, and associations</p>
<p>Includes artifact and process metadata</p>
<p>(e.g. source, creator, timestamp, &#8230;)</p>
<p>HBase</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Any System</p>
<ul>
<li>Relational database</li>
<li>Object store</li>
<li>XML source</li>
<li>Triple store</li>
<li>Key-value store</li>
<li>&#8230;</li>
</ul>
<p>Any Organization</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Any process</p>
<ul>
<li>Object extraction</li>
<li>text, image, voice, signature, &#8230;</li>
<li>Natural language processing</li>
<li>Link analysis</li>
<li>Analyst manual activity</li>
<li>Alerting, reporting, &#8230;</li>
<li>Other tools and applications</li>
</ul>
</td>
</tr>
<tr style="height: 76px;">
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid black 0.5pt; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">
<p style="margin-left: 5pt;">1 &#8211; Artifact Description Framework (ADF)</p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Universal store</p>
<p>for  unstructured artifacts</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Indigenous Artifacts</p>
<p>(documents, images, audio, video, signals, &#8230; )</p>
<p>Hadoop File System</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Any Source</p>
<ul>
<li>External filesystem</li>
<li>Web</li>
<li>Document repository</li>
<li>&#8230;</li>
</ul>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid black 0.5pt; border-right: solid black 0.5pt;">Any process</p>
<ul>
<li>Reporting</li>
<li>Alerting</li>
<li>Security</li>
<li>&#8230;</li>
</ul>
</td>
</tr>
</tbody>
</table>
</div>
<h1><span style="font-size: 12pt;">Conclusion<br />
</span></h1>
<p style="text-align: justify;">The DISEP and DDF support data fusion and unencumbered semantic enrichment by implementing the following key principles:</p>
<ul>
<li>
<div style="text-align: justify;">Data must be perceived objectively, independent of intended use</div>
</li>
<li>
<div style="text-align: justify;">Domain data-models must be considered from a higher level of abstraction</div>
</li>
</ul>
<p style="text-align: justify;">Data fusion approaches, physical and virtual, generally manage to preserve only a portion of the original data and semantics, and present these with yet another single, restrictive data-model. In contrast, the DDF persists and presents the entirety of the source data and semantics by using a <em>higher level abstraction </em>that imposes no particular data-model, yet supports any. The DDF populated with data produces a Unified Data Space that represents the primal integrated data layer of the DISEP.  Within this Data Space, the original data and data-models co-exist and may be enriched either through the ingestion and integration of additional data or semantic enhancement.</p>
<p style="text-align: justify;">Over decades of data processing, we have been formalizing our perception of data and then transforming (or even creating) and storing data according to this perception. Unfortunately, there has been very little effort to ensure correctness/durability/objectivity of those perceptions. As a result, we work with numerous models and formats of data, and numerous versions of data buried beneath. The evolution of our perception and understanding of data cannot be reflected in these data stores. New data, which does not conform to the store&#8217;s model, also cannot be accommodated. Thus we are trapped in an endless loop of creating and integrating new data stores, each of which deals with only a fraction of the data surrounding us. None of these can be expanded to represent other data, and all are valid for a relatively short time. In DDF, on the other hand, data &#8220;lives&#8221; alongside data-models, not inside them. This enables loose coupling of data and perceptions (i.e. data-models), and allows multiple perceptions to co-exist in the Data Space.</p>
<p style="text-align: justify;">Without imposing modifications on existing data stores, the DDF can expose their data and semantics for use and re-use, without further increasing data entropy. The DDF Data Space is a live integrated store that evolves with our intentions (i.e. applications) and perceptions (i.e. data-models).</p>
<h1><span style="font-size: 12pt;">Acknowledgements<br />
</span></h1>
<p style="text-align: justify;"><span style="font-size: 12pt;">The authors thank the following US Army CERDEC I2WD personnel for their continued support:  Mr. Anthony Lisuzzo, Director; Mr. Kesny Parent, DCGS-A Branch Chief; and Ms. Virginia Goon, IXFL Manager.  The authors also thank Mr. Oscar Wood and Mr. David Salmen of Data Tactics Corporation, as well as Mr. Andrew Eick of MissionFoc.us for many productive and stimulating discussions. This work was funded by US Army CERDEC I2WD under contract number W15P7T-06-D-A401/009.<br />
</span></p>
<h1><span style="font-size: 12pt;">References<br />
</span></h1>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Times New Roman; font-size: 12pt;">[1] Batini, C. et al. A comparative analysis of methodologies for database schema integration. <em>ACM Computing Surveys, (18) 4, 1986.</em><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[2] Bernstein P., and Ho, H<span style="font-family: Arial;">. </span>Model Management and Schema Mappings: Theory and Practice. <em>Proceedings of VLDB Conference, 2007.</em><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[3] Booth, D. <em>Why URI Declarations? A comparison of architectural approaches. HP Software, 2008</em>. http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-422/irsw2008-submission-9.pdf<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[4] Franklin, M., Halevy, A., and Maier, D. From Databases to Dataspaces: A New Abstraction for Information Management. <em>ACM SIGMOD Record, 2005.</em><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Times New Roman; font-size: 12pt;">[5] Halevy, A. et al. Enterprise information integration: successes, challenges and controversies. <em>Proceedings of 24th International Conference on Management of Data, Baltimore, 2005.<br />
</em></span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[6] Halevy, A. Franklin, M., and Maier, D. Principles of Dataspace Systems. <em>Proceedings of the twenty-fifth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems, 2006.</em><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;"><span style="color: black;">[7] Halevy, A., </span>Rajaraman<span style="color: black;">, A., and </span>Ordille, J.<span style="color: black;"><br />
</span>Data Integration: The Teenage Years. <em>Proceedings of VLDB Conference, 2006.</em><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[8]  Northrop, L., et al., Ultra-Large-Scale Systems: The Software Challenge of the Future.  <em>Pittsburgh: Carnegie Mellon University, 2007.<br />
</em></span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[9]  Omelayenko, B. and Fensel, D.  An Analysis of B2B Catalogue Integration Problems.  <em>Proceedings of the International Conference on Enterprise Information Systems (ICEIS-2001),</em><br />
<em>2001</em>.<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Times New Roman; font-size: 12pt;">[10] Parent, C. and Spaccapietra, S. Issues and approaches of database integration. <em>Communications of the ACM, 41(5), 1998.<br />
</em></span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[11] Sowa, J. Knowledge Representation. Logical, Philosophical, and Computational Foundations. <em>Brooks/Cole, 2000</em>.<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[12] Yero, J. <span style="color: black;">Logical vs. Physical Data Integration: A Practical Decision Guide.  <em>The DAMA International Symposium &amp; Wilshire Meta-Data Conference. San Diego, CA, 2008</em>.<br />
</span></span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;"> [13] Yoakum-Stover, S. and Malyuta, T. <em>Unified Architecture for Integrating Intelligence Data</em>, <em>Proceedings of MIT Information Quality Industry Symposium, MIT, Cambridge, MA, 2008.</em><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[14] Yoakum-Stover, S. and Malyuta, T. <em>Unified Integration Architecture for Intelligence Data</em>, <em>Proceedings of DAMA International Europe Conference, London, UK, 2008</em>.<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[15] Yoakum-Stover, S. and Malyuta, T. <em>Unified Data Integration for Situation Management</em>, <em>Proceedings of the 4th IEEE Workshop on Situation Management (SIMA2008) at MILCOM 2008, San Diego CA, 2008.<br />
</em></span></p>
<p style="text-align: center; margin-left: 18pt;"><span style="font-size: 12pt;">Data Sources used in demonstration:<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-size: 12pt;">[16] Freebase. <a href="http://www.freebase.com/">http://www.freebase.com/</a><br />
</span></p>
<p>[17] GovTrack.us: Tracking the U.S. Congress. <a href="http://www.govtrack.us/"><span style="font-size: 12pt;">http://www.govtrack.us/</span></a></p>
<p>[18] George W. Bush. <em>Wikipedia, The Free Encyclopedia</em>. <a href="http://en.wikipedia.org/w/index.php?title=George_W._Bush&amp;oldid=300819162">http://en.wikipedia.org/w/index.php?title=George_W._Bush&amp;oldid=300819162</a></p>
<p>[19] Bill Clinton. <em>Wikipedia, The Free Encyclopedia</em>.  <a href="http://en.wikipedia.org/w/index.php?title=Bill_Clinton&amp;oldid=301113520">http://en.wikipedia.org/w/index.php?title=Bill_Clinton&amp;oldid=301113520</a><span style="color: #a1a1a1; font-family: Arial;"><br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">
<p style="margin-left: 28pt;">
<p style="text-align: justify;"><span style="font-family: Arial; font-size: 12pt;"><strong><br />
</strong></span></p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/-_tTyBA_O2M" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2010/02/11/a-data-integration-framework-with-full-spectrum-fusion-capabilities/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2010/02/11/a-data-integration-framework-with-full-spectrum-fusion-capabilities/</feedburner:origLink></item>
		<item>
		<title>Unified Architecture for Integrating Intelligence Data</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/tIYhzQJ338o/</link>
		<comments>http://imintel.org/blog/2010/02/10/unified-architecture-for-integrating-intelligence-data-3/#comments</comments>
		<pubDate>Wed, 10 Feb 2010 14:59:13 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[Publications]]></category>
		<category><![CDATA[White Papers]]></category>

		<guid isPermaLink="false">http://imintel.org/blog/2010/02/10/rename-post/</guid>
		<description><![CDATA[The principal problem spanning the Intelligence Community today is how to integrate the great variety of disparate data into one single coherent repository of knowledge. Current practice whereby all data-models would be merged into a single "Uber-model" simply does not work. We require a solution that remains viable in a freely evolving, interdependent collective of human and computational systems, very little of which will ever be under our control. Our approach is database-centric and proceeds in stages. The first addresses the unified representation of the broad spectrum of artifacts existing within the Intelligence Enterprise today regardless of modality or structure. The second builds upon the foundation provided by the first to address the unified storage of structured data and semantic data integration. In both we embrace the diversity of data-models employed throughout the Intelligence Community. The result is a layered data architecture that can accommodate any kind of data without placing restrictions on vocabulary, structure, semantics, or constraints in a way that addresses today's intelligence needs while providing a seamless transition path toward a future of Ultra-Large Scale (ULS) systems imbued with semantic technologies.]]></description>
			<content:encoded><![CDATA[<p><a href="http://imintel.org/blog/wp-content/uploads/2010/02/JDIQ20091.pdf">Printable Copy</a></p>
<p><em>ACM Journal of Data and Information Quality. Pending decision</em></p>
<blockquote>
<h2>Authors</h2>
<p>Suzanne Yoakum-Stover, Ph.D.<br />
Institute for Modern Intelligence, Executive Director<br />
Alexandria, VA</p>
<p>Tatiana Malyuta, Ph.D.<br />
Data Tactics Corp., Principal Database Architect</p>
<p>Alexandria, VA<br />
New York City College of Technology, Associate Professor</p></blockquote>
<h2>Abstract</h2>
<p style="text-align: justify; margin-left: 36pt;"><span style="font-size: 10pt;">The principal problem spanning the Intelligence Community today is how to integrate the great variety of disparate data into one single coherent repository of knowledge.  Current practice whereby all data-models would be merged into a single &#8220;Uber-model&#8221; simply does not work.  We require a solution that remains viable in a freely evolving, interdependent collective of human and computational systems, very little of which will ever be under our control.  Our approach is database-centric and proceeds in stages.  The first addresses the unified representation of the broad spectrum of artifacts existing within the Intelligence Enterprise today regardless of modality or structure.  The second builds upon the foundation provided by the first to address the unified storage of structured data and semantic data integration.  In both we embrace the diversity of data-models employed throughout the Intelligence Community. The result is a layered data architecture that can accommodate any kind of data without placing restrictions on vocabulary, structure, semantics, or constraints in a way that addresses today&#8217;s intelligence needs while providing a seamless transition path toward a future of Ultra-Large Scale (ULS) systems imbued with semantic technologies.<br />
</span></p>
<p style="text-align: justify; margin-left: 36pt;">
<h2>Introduction</h2>
<p style="text-align: justify;">The principal problem spanning the Intelligence Community today is how to integrate the great variety of disparate data stores and streams, both legacy and bleeding-edge, into one coherent repository of knowledge.  Pieces of the intelligence puzzle lie scattered in data silos sequestered by the very systems that served to create them.  Each of these systems, to include most of today&#8217;s Army Programs of Record, was built as an end-to-end solution with its own sensors, processors, and data stores, implemented and operated to achieve a specific intelligence objective.  They were never meant to interoperate, share data, or even expose data beyond a narrow mission-focused enclave.  The advent of network technologies and protocols, which have effectively eliminated the physical barriers between systems, has done little to bridge the chasm between these data silos.  Although we can now transfer data over the wire, disparate and utterly incompatible data-models characterized by straightforward and subtle differences in vocabulary, structure, semantics, and constraints continue to stymie data search, exploration, enrichment, and exploitation efforts.  The fundamental problems of data integration remain to be solved.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Current practice in data integration, whereby all data-models would be merged or harmonized, either physically or virtually [Batini 1986, <span style="font-family: Times New Roman;">Parent 1998, Halevy 2005, </span>Bernstein 2007] fails to accommodate the demands of our fluid and rapidly growing Intelligence Enterprise.  The physical mapping of disparate models into a single canonical data-model [Omelayenko 2001] is simply untenable as the scale and complexity of their subjects quickly overwhelm our tools and methods.  Federation approaches share this defect and introduce new ones [Izydor 2007, Yero 2008].   In practice, these approaches provide only the illusion of data integration as they mainly integrate data-models, not the data itself, and in so doing confine all data to a model that is incapable of adapting itself or its contents as our knowledge about the domain evolves.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">In all but the most constrained situations, what begins as a perfectly neat solution for a handful of systems quickly becomes intractable with scale, exposing not only the limitations of traditional approaches, but also of our grasp at the foundations of knowledge representation itself.  This phenomenon is but one early symptom of our evolution toward Ultra-Large Scale (ULS) systems [Northrop 2006] and as such, invites a completely different approach &#8211; one that remains viable in a freely evolving, interdependent collective of data sources / types / modalities / models, analytics, tools, interfaces, mission applications, perspectives, and users, very little of which will ever be under our control.  Our objective is to define such a solution.</p>
<h2>Conceptual Approach</h2>
<p style="text-align: justify;">Our approach to integrating intelligence data in a ULS systems environment is data-centric (as opposed to data-model centric) and proceeds in stages.  The first addresses the unified storage of the entire spectrum of intelligence artifacts regardless of modality or representation.  The second stage builds upon the foundation provided by the first to address the unified storage of structured data to enable semantic data integration. A third stage (beyond the scope of this paper) addresses unified storage of knowledge models. In all stages we embrace the diversity of domain-specific data-models employed throughout the Intelligence Community by taking a data-model agnostic approach wherein the persistence model makes the least possible commitment to any particular data-model.  In the case of &#8220;raw&#8221; artifacts, this means storing each according to its native representation without the application of structural or semantic transformations. In the case of structured artifacts, it means: a) perceiving data as first-class citizens by de-coupling them from the data-model and b) using an abstract model for the unified representation and persistence of the artifact and integration semantics. A key aspect of our approach is that the character and meaning of the source data and data-model is preserved and made accessible by the data store.  The result is a <em>layered</em> Data Architecture and Semantic Integration Framework that can accommodate any kind of <em>data</em> and <em>semantics</em> without placing restrictions on vocabulary, structure, semantics, or constraints, in a way that addresses the needs of the Intelligence Community today while providing a seamless transition path toward a future of ULS systems imbued with semantic technologies.</p>
<h2><span id="more-115"></span>Scope</h2>
<p style="text-align: justify;">The types of intelligence collected by sensors and systems today span the electro-magnetic spectrum to include all manner of signals, audio, video, and images, in addition to so-called human intelligence (e.g. text artifacts such as reports, messages, web pages).  Our approach to data integration supports all of these simultaneously regardless of their underlying source data-model, or lack thereof.  It does not, however, <em>prescribe</em> a solution for data-model harmonization.  In particular, our approach imposes no relationship between the data-models to which the artifacts adhere.  It does, however, allow such relationships, created by external processes of any sort, to be effectively represented and persisted together.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost12.png" alt="" />Whereas the <em>business</em> of intelligence is to develop and communicate understanding (which entails the collection, exploitation, and provisioning of intelligence), intelligence business <em>processing</em> includes any automated activity that moves intelligence artifacts with respect to the cognitive hierarchy (see Fig.1a).  This includes data collection, semantic enhancement, and fusion from data to information to knowledge, and communication / collaboration to create understanding.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Our Data Architecture and Semantic Integration Framework mirrors both the structure of the cognitive hierarchy and the operations of intelligence business processing.  Built atop a collection of indigenous artifacts (see Fig. 1b), Layer 1 of the Framework supports an aspect of collection and rudimentary exploitation of artifact semantics.  Layer 2 supports the processing by which data extracted from artifacts is enhanced with semantics to produce information, and the processing by which information is enhanced with richer associations to produce knowledge.   Layer 3 supports the management and integration of knowledge models employed by Layers 1 and 2.  Finally, Layer 4 supports human computer interfaces through which the analyst &#8220;sees&#8221; all of this intelligence.  This paper focuses on Layers 1 and 2, which together support the provisioning of integrated intelligence at the level of data, information, and knowledge.  Layers 3 and 4 will be the subject of a subsequent paper.</p>
<h2>Layer 1</h2>
<p style="text-align: justify;">The broad and ever-changing spectrum of intelligence artifacts existing within the Intelligence Enterprise today reflects a nearly equally broad and ever-changing spectrum of intelligence collectors, producers, and consumers.  The types of artifacts they generate vary tremendously in their modality (e.g. text, images, audio, video, signals), structure (e.g. relational, object, key-value) and representation (e.g. free text, XML, SQL, vector, raster).  As this diversity is beyond our control, we term all such artifacts as &#8220;indigenous&#8221; and the diversity of the external data stores and systems in which they reside as the &#8220;wild.&#8221;</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Layer 1 of our data integration framework addresses the integration of the entire spectrum of artifacts existing in the wild by simply collecting them together in a unified data store.  The decision to bring an artifact into the Framework is recorded in Layer 1 by persisting a reference to the artifact along with a minimal set of essential meta-data whose main purpose is: a) to support analysis of artifact content; and b) to provide access to the artifact from the higher layers of the framework (see Fig. 2).  The original artifact may also be physically captured in an indigenous data store that sits below Layer 1, however this is not required. In addition to the artifact reference and metadata, the Layer 1 schema also supports associations between artifacts.  Any kind of relationship between artifacts can be represented since the set of predicates used to express them is not pre-defined.  As described subsequently, predicates are persisted in Layer 3 of the Framework.</p>
<p style="text-align: justify;">
<p><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost21.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost31.png" alt="" /></p>
<h2>Layer 2</h2>
<h3>Structured Data</h3>
<p style="text-align: justify;">Every analyst engaged in intelligence processing either creates or uses structured data.  Just as we do not control the sources or format of indigenous artifacts, we also do not control the various methods by which such artifacts might be structured or the data-models employed therein.  Thus as the objective of Layer 1 is to represent the diversity of indigenous artifacts regardless of type or format, the objective of Layer 2 is to accommodate the diversity of all structured data regardless of vocabulary, organization, representation, or semantics.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Structured data necessarily adheres to some sort of model, which in general specifies vocabulary, organization, semantics, and constraints.  Though not all data-models specify all of these, at minimum, every structured artifact entails a vocabulary reflecting a set of entity types (e.g. person, message) and an organization reflecting their relationships (e.g. message to person).  These basic elements are illustrated in the simplified example of Fig. 3.  Part (a) of the figure shows a short unstructured text message, and part (b) shows a data-model according to which a message might be structured.  Part (c) then shows the original message structured according to the data-model and part (d) shows how that structured message might typically be persisted in a database.</p>
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost41.png" alt="" /></p>
<p style="text-align: justify;">Notice how the database schema is tightly coupled to the data-model that was used to structure the data, and how the raw message is bound to the data-model by the database.  In effect, the data-model is imposed on the database, and the data itself is frozen into it such that no additional attributes or relationships are possible (without modifying the database schema).  This is a severe shortcoming considering the tremendous variety of ways in which a given artifact might be structured or enhanced with additional attributes and associations.  Even for the simple case shown in the figure, we can easily imagine data-models that use different entities (e.g. <span style="font-family: Arial; font-size: 10pt;">&#8216;Individual&#8217;</span> instead of <span style="font-family: Arial; font-size: 10pt;">&#8216;Person&#8217;</span>), different relationships (e.g. <span style="font-family: Arial; font-size: 10pt;">&#8216;Sender&#8217;</span> instead of <span style="font-family: Arial; font-size: 10pt;">&#8216;From&#8217;</span>), and different organizations (e.g. by including <span style="font-family: Arial; font-size: 10pt;">&#8216;MessageDate&#8217;</span>), not to mention the wealth of other information external to the message itself (e.g. about <span style="font-family: Arial; font-size: 10pt;">&#8216;Tanya&#8217;</span>) that might be brought to bear.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">In a ULS systems environment, it is simply unreasonable to presume that the data-models or the various processes, either automated or manual, that structure data can be controlled or constrained.  It is also unreasonable to presume that it is possible to anticipate the totality of their breadth or their application. To the contrary, the urgency and diversity driving our Intelligence Enterprise essentially guarantees that as many different methods for extracting entities, relationships, and events will be brought to bear as our imaginations and increasingly powerful technologies can support.  Thus, although we might like to enhance Layer 1 of our Data Architecture and Semantic Integration Framework by exposing all possible extracted elements along with their properties and attributes in order to support efficient information retrieval and broad application, introducing an ever expanding array of fields and tables into a database is as impractical as attempting to accommodate every kind of data and purpose within a single canonical data-model.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The challenge, therefore, is to build the next layer of the Framework to accommodate structured data in a way that exposes that structure for use, without imposing the structure on the data store itself.  In other words, we must determine a method for storing and managing any kind of structured data, reflecting any data-model, so that it can be shared, efficiently exploited, and extended in unforeseen ways without requiring model-specific storage implementations.  Put simply, we seek a universal, domain-neutral storage model for structured data.</p>
<h3>Data-Model Abstraction</h3>
<p style="text-align: justify;">The key to devising a domain-neutral storage model for structured data is to decouple what varies, namely vocabularies and, more generally the data-models, from that which remains constant, namely the source artifact, and ideally the storage structure.  To achieve this, we consider structure, vocabulary, semantics, and constraints from a higher level of abstraction from which we then distill a minimal set of elements sufficient to capture any data-model.  These are defined as follows:</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Sign: </em></strong>A <em>sign, </em><span style="font-family: Arial; font-size: 10pt;">g</span>, is a representation of a chunk of data, either physically located within a tangible artifact, or contained within an analyst&#8217;s mind.  Examples of the former include a string of text in a document; an object within an image; a segment of audio in an audio stream; a spike in a signal.  As illustrated in Fig. 4, regardless of the type of medium, a sign for tangible data is always associated with a physical extent within the artifact and has a quantifiable span, which we call a <em>mention</em>. In contrast, signs that reside in an analyst&#8217;s mind become tangible only when she writes down her thoughts.  We explicitly include such intangible signs here to support the analyst&#8217;s ability to assert information directly into the data store without having to first represent it in a physical artifact.  The set of all signs, <span style="font-family: Arial; font-size: 10pt;">G = {g<sub>i</sub>}</span>, spans across all data sources.  In the set, each element is unique: <span style="font-family: MS Mincho;">∀</span><span style="font-size: 10pt;"><span style="font-family: Arial;">i,j  (i<sub><br />
</sub>≠ j) </span>→<span style="font-family: Arial;"> (g<sub>i </sub>≠ g<sub>j</sub>)</span></span>.  <span style="font-family: Arial; font-size: 10pt;">G</span> is the construct by which data are represented.  From the text data shown in Fig. 4, signs <span style="font-family: Arial; font-size: 10pt;">G&#8217; = {&#8216;Suzi&#8217;, &#8216;Tanya&#8217;, &#8216;July 4, 2007&#8217;, &#8216;Bring lunch&#8217;, &#8216;Message1&#8217;</span>}  contribute to <span style="font-family: Arial; font-size: 10pt;">G</span> (i.e. <span style="font-size: 10pt;"><span style="font-family: Arial;">G&#8217; </span>⊆</span><br />
<span style="font-family: Arial; font-size: 10pt;">G</span>), though many more signs may be identified even from this simple example.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost51.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost61.png" alt="" /></p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Concept:</em><br />
</strong> A <em>concept,</em><br />
<span style="font-family: Arial; font-size: 10pt;">c</span>, is a representation of an abstract idea, defined explicitly or implicitly by a source data-model.  For example, the nodes of an ontology, the tag set in an XML Schema Document (XSD), and the attribute / table names in a relational database all represent concepts. In the set of all concepts <span style="font-family: Arial; font-size: 10pt;">C = {c<sub>i</sub>}</span>, each element is unique: <span style="font-family: MS Mincho;">∀</span><span style="font-size: 10pt;"><span style="font-family: Arial;">i,j  (i<sub><br />
</sub>≠ j) </span>→<span style="font-family: Arial;"> (c<sub>i </sub>≠ c<sub>j</sub>)</span></span>.  From the text data shown in Fig. 4, concepts <span style="font-family: Arial; font-size: 10pt;">C&#8217; = </span>{<span style="font-family: Arial; font-size: 10pt;">&#8216;Message&#8217;</span>, <span style="font-family: Arial; font-size: 10pt;">&#8216;Person&#8217;</span>, <span style="font-family: Arial; font-size: 10pt;">&#8216;Body_text&#8217;}</span> contribute to the full set of concepts <span style="font-family: Arial; font-size: 10pt;">C</span> (i.e. <span style="font-size: 10pt;"><span style="font-family: Arial;">C&#8217; </span>⊆</span><br />
<span style="font-family: Arial; font-size: 10pt;">C</span>).</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Predicate:</em></strong> A <em>predicate,</em><br />
<span style="font-family: Arial; font-size: 10pt;">p</span>, is a representation of an abstract idea used to express a relationship between &#8220;things.&#8221;  Predicates are used in the formation of <em>statements </em>(described below) and may be defined either explicitly or implicitly by a source data-model.  For example, the arcs of an ontology, and the attributes of an XML or database schema represent predicates.   In the set of all predicates <span style="font-family: Arial; font-size: 10pt;">P = {p<sub>i</sub>}</span>, each element is unique: <span style="font-family: MS Mincho;">∀</span><span style="font-size: 10pt;"><span style="font-family: Arial;">i,j  (i<sub><br />
</sub>≠ j) </span>→<span style="font-family: Arial;"> (p<sub>i </sub>≠ p<sub>j</sub>)</span></span>.  The text example of Fig. 4 contributes predicates <span style="font-family: Arial; font-size: 10pt;">P&#8217; =</span> {<span style="font-size: 10pt;">&#8216;<span style="font-family: Arial;">To&#8217;</span></span>, <span style="font-family: Arial; font-size: 10pt;">&#8216;From&#8217;,</span><br />
<span style="font-family: Arial; font-size: 10pt;">&#8216;Body&#8217;} </span>to the set of all predicates<span style="font-family: Arial; font-size: 10pt;"> P </span>(i.e. <span style="font-size: 10pt;"><span style="font-family: Arial;">P&#8217; </span>⊆</span><br />
<span style="font-family: Arial; font-size: 10pt;">P</span>).  The only predicate that is &#8220;built into&#8221; (i.e. defined by) our storage model is the <span style="font-family: Arial; font-size: 10pt;">&#8216;IsInstanceOf&#8217;</span> predicate<em>, </em>which is used to disambiguate <em>signs</em> to form <em>terms</em> as described below.  Concepts and predicates are the constructs by which we link to data-models and, thereby, explicitly expose data-semantics.</p>
<p style="text-align: justify;"><strong><em>Term: </em></strong> A <em>term,</em><br />
<span style="font-family: Arial; font-size: 10pt;">t<sub>ij</sub></span>,<sub><br />
</sub>is an ordered pair &lt;<span style="font-family: Arial; font-size: 10pt;">g<sub>i</sub>,c<sub>j</sub></span>&gt; where <span style="font-size: 10pt;"><span style="font-family: Arial;">g<sub>i </sub></span><span style="font-family: MS Mincho;">∈</span><span style="font-family: Arial;"> G </span></span>and <span style="font-size: 10pt;"><span style="font-family: Arial;">c<sub>j </sub></span><span style="font-family: MS Mincho;">∈</span><span style="font-family: Arial;"> C</span></span>.  Each term represents a disambiguated <em>sign</em>.  The process of disambiguation associates a <em>sign</em> with a <em>concept</em> using the <span style="font-family: Arial; font-size: 10pt;">&#8216;IsInstanceOf&#8217;</span><br />
<em>predicate</em> (though not every sign from <span style="font-family: Arial; font-size: 10pt;">G</span> is necessarily disambiguated, and not every concept from <span style="font-family: Arial; font-size: 10pt;">C</span> is necessarily used for disambiguation).  In the set of all terms <span style="font-family: Arial; font-size: 10pt;">T = {t<sub>ij</sub>}</span>, each element is unique:  <span style="font-size: 10pt;"><span style="font-family: MS Mincho;">∀</span><span style="font-family: Arial;"> i,j,k,l</span></span><span style="font-family: MS Mincho;"><br />
</span><span style="font-family: Arial; font-size: 10pt;">(i ≠ k</span> or<span style="font-family: MS Mincho;"><br />
</span><span style="font-family: Arial; font-size: 10pt;">j ≠ l)</span><br />
→ (<span style="font-family: Arial; font-size: 10pt;">t<sub>ij</sub> ≠ t<sub>kl</sub></span>).  The text example of Fig. 4 contributes terms <span style="font-family: Arial; font-size: 10pt;">T&#8217; = {t<sub>1</sub>, t<sub>2</sub>, t<sub>3</sub>, t<sub>4</sub>}</span> where <span style="font-family: Arial; font-size: 10pt;">t<sub>1</sub> = &lt;&#8216;Suzi&#8217;, person&gt;, t<sub>2</sub> = &lt;&#8216;Tanya&#8217;, person&gt;,  t<sub>3</sub> = &lt;&#8216;Bring lunch&#8217;, Body_text&gt;, t<sub>4</sub> = &lt;Message1, message&gt; </span>to the complete set of terms<span style="font-size: 10pt;"><span style="font-family: Arial;"> T (i.e. T&#8217; </span>⊆</span><br />
<span style="font-family: Arial; font-size: 10pt;">T).<br />
</span></p>
<p style="text-align: justify;"><strong><em>Statement: </em></strong>A <em>statement</em>, <span style="font-family: Arial; font-size: 10pt;">s</span>, encodes a binary relationship between a subject and an object mediated by a predicate<em>. </em> A statement is represented by an ordered triple <span style="font-family: Arial; font-size: 10pt;">s<sub>ijh</sub> = &lt;subject<sub>i</sub>, predicate<sub>j</sub>, object<sub>h</sub>&gt;</span>.  Among the set of all statements, each element is unique: <span style="font-size: 10pt;"><span style="font-family: MS Mincho;">∀</span><span style="font-family: Arial;"> i,j,h,l,m,n</span></span><span style="font-family: MS Mincho;"><br />
</span><span style="font-family: Arial; font-size: 10pt;">(i ≠ l</span> or <span style="font-family: Arial; font-size: 10pt;">j ≠ m</span> or <span style="font-family: Arial; font-size: 10pt;">h ≠ n)</span><br />
→ (<span style="font-family: Arial; font-size: 10pt;">s<sub>ijh </sub>≠ s<sub>lmn</sub></span>).  In our model, subject and object may be either a <em>term</em> or <em>statement</em>.  The simplest kind of <em>statement</em> is one in which subject and object are <em>terms</em>: <span style="font-family: Arial; font-size: 10pt;">s0<sub>ijh </sub>= &lt;t<sub>i</sub>, p<sub>j</sub>, t<sub>h</sub>&gt;</span>.  <em>Statements</em> in which the object is itself another <em>statement</em> represent reifications: <span style="font-family: Arial; font-size: 10pt;">s1<sub>klm </sub>= &lt;t<sub>k</sub>, p<sub>l</sub>, s<sub>m</sub>&gt;</span>.  Finally, a <em>statement</em> in which both subject and object are other <em>statements</em> represents a relationship between <em>statements</em>: <span style="font-family: Arial; font-size: 10pt;">s2<sub>xyz </sub>= &lt;s<sub>x</sub>, p<sub>y</sub>, s<sub>z</sub>&gt;</span>.  The set of all statements is <span style="font-family: Arial; font-size: 10pt;">S = {s0<sub>ijh</sub>} U {s1<sub>klm</sub>} U {s2<sub>xyz</sub>}</span>.  The text example of Fig. 4 shows three <em>statements</em>: <span style="font-family: Arial; font-size: 10pt;">S&#8217; = {&lt;t<sub>4</sub>, to, t<sub>1</sub>&gt;, &lt;t<sub>4</sub>, from, t<sub>2</sub>&gt;, &lt;t<sub>4</sub>, body, t<sub>3</sub>&gt;}</span> all with the same subject, which is the <em>term</em> corresponding to the message itself.  These statements contribute to the set of all statements, i.e. <span style="font-size: 10pt;"><span style="font-family: Arial;">S&#8217; </span><span style="font-family: Arial;">⊆</span></span><br />
<span style="font-family: Arial; font-size: 10pt;">S.</span></p>
<p style="text-align: justify;">
<p style="text-align: justify;">Note that the above definitions are formulated to be clear and unambiguous with respect to our particular approach and may not match those found in other literature.  Throughout the paper, we will denote instances of signs, concepts, predicates, terms, and statements using Arial font within single quotes (e.g. <span style="font-family: Arial; font-size: 10pt;">&#8216;person&#8217;</span>).</p>
<h3>DDF</h3>
<p style="text-align: justify;">Abstracted from the milieu of all possible data-models, these elementary constructs (concept, predicate, sign, term, and statement) provide the fixed-points of a data reference model that will ultimately form the basis of a practical Data Architecture and Semantic Integration Framework which we call the Data Description Framework (DDF).     Despite its simplicity, the DDF is an amazingly rich model that can be viewed from at least two different perspectives.  From one perspective, the DDF encompasses a synergistic combination of two higher order models lying along different dimensions of abstraction – one that is outward-looking (&#8220;extrospective&#8221;), one inward-looking (introspective).</p>
<p style="text-align: justify;">
<p style="text-align: justify;">
<p style="text-align: justify;">The &#8220;extrospective&#8221; portion of the model is a meta-model formed by (a) <span style="font-family: Courier New;">C</span> and <span style="font-family: Courier New;">P</span>, which look outward to domain knowledge (represented in data / knowledge models), and (b) <span style="font-family: Courier New;">G</span>, which looks outward toward the data.  Signs bring data into the DDF as first class entities which may then participate in various, unlimited conceptualizing relationships created by any sort of automated or manual process at any time.  Signs provide a fundamental level of <em>data</em> integration (that traditional approaches lack) resulting from having eliminated data-model barriers.  Concepts and predicates are to domain knowledge what signs are to data.  They are the mechanism by which such knowledge (typically encoded in domain-specific data / knowledge models) is linked into the DDF and exposed by our Data Integration Framework for use and re-use.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The introspective portion of the model is a semantic model formed by <span style="font-family: Courier New;">T</span> and <span style="font-family: Courier New;">S</span> which abstract data-model internals to expose structure in a uniform way.  Terms bind signs to concepts, exposing the meaning of the data unambiguously with respect to the original source data-model.  Statements represent semantic relationships about, within, and between disambiguated data elements.</p>
<p style="text-align: justify;">Together the introspective and &#8220;extrospective&#8221; models that comprise the DDF enable both horizontal and vertical <em>data</em> integration. The &#8220;extrospective&#8221; abstraction bridges data and domain knowledge (vertical integration). The introspective abstraction bridges data structured by various disparate processes (horizontal integration) and binds the two outward looking faces of the &#8220;extrospective&#8221; model to provide a comprehensive data integration model.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost71.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost81.png" alt="" /></p>
<p style="text-align: justify;">
<p style="text-align: justify;">From the second perspective, the DDF may be regarded as a synergistic combination of two interaction patterns – one that decouples, one that binds.  DDF achieves decoupling in two ways.  First, as a higher order data-model abstraction, DDF effectively decouples data from <em>data-models</em>.  Thus, the DDF can encapsulate any sort of data regardless of the source data-model.  Second, as a higher order <em>data-structure</em>, DDF effectively decouples structured data from data storage structures.  Thus, the DDF can accommodate any data regardless of the source storage structure.  As a result, the DDF provides a practical foundation for implementing a stable database that can accommodate any sort of structured data.</p>
<p style="text-align: justify;">The ways in which DDF implements binding are illustrated in Fig. 5.   Specifically, sign <span style="font-family: Arial; font-size: 10pt;">g</span> binds with concept <span style="font-family: Arial; font-size: 10pt;">c</span> to form term <span style="font-family: Arial; font-size: 10pt;">t</span>, and predicate <span style="font-family: Arial; font-size: 10pt;">p</span> binds with term <span style="font-family: Arial; font-size: 10pt;">t</span> to form statement <span style="font-family: Arial; font-size: 10pt;">s</span>.  The diagram also indicates that a predicate may bind a term and a statement to form a reification, or a predicate may bind a statement with another statement to form a statement relationship.  These bindings allow data to be integrated within and across data-models and continuously enriched into knowledge.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Together these interaction patterns make the DDF a powerful yet practical platform for data integration.  Decoupling gives DDF the character of a universal data store and successive bindings progressively move intelligence artifacts (or their constituent elements) upward through the cognitive hierarchy.  The result is a universal data integration and semantic enrichment platform that supports data structured by any means, unrestricted associations within and between them, and increasingly rich semantics.</p>
<h4>Representational Power</h4>
<p style="text-align: justify;">Although the expressiveness of the DDF is sufficient to capture the data and data-semantics of any structured data source, we illustrate this for the relational model since it is the most commonly used.  Similar arguments can be made for other model types, such as hierarchical, object-oriented, and graph.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">In accordance with common relational formalism [Date 2004], a relation <span style="font-family: Arial; font-size: 10pt;">R </span>is defined by the set of attributes <span style="font-family: Arial; font-size: 10pt;">A = {A<sub>i</sub>} (1 ≤ i ≤ n)</span>.  The subset of attributes that comprises the primary key is denoted as <span style="font-family: Arial; font-size: 10pt;">K={K<sub>l</sub>} (1 ≤ l ≤ k)</span>,  <span style="font-size: 10pt;"><span style="font-family: Arial;">K </span><span style="font-family: Arial;">⊆</span></span><br />
<span style="font-family: Arial; font-size: 10pt;">A</span>.  The set of all data values in <span style="font-family: Arial; font-size: 10pt;">R</span> is <span style="font-size: 10pt;"><span style="font-family: Arial;">D = {d<sub>ij</sub>},</span><br />
</span>where <span style="font-family: Arial; font-size: 10pt;">d<sub>ij</sub></span> is a value on the intersection of attribute <span style="font-family: Arial; font-size: 10pt;">A<sub>i</sub></span> and row <span style="font-family: Arial; font-size: 10pt;">W<sub>j </sub>(1≤ j ≤ m).</span> We can integrate data and its original semantics from <span style="font-family: Arial; font-size: 10pt;">R</span> into a DDF data space consisting of <span style="font-family: Arial; font-size: 10pt;">G<sub>0</sub></span>, <span style="font-family: Arial; font-size: 10pt;">C<sub>0</sub></span>, <span style="font-family: Arial; font-size: 10pt;">P<sub>0</sub></span>, <span style="font-family: Arial; font-size: 10pt;">T<sub>0</sub></span>, and S<sub>0</sub> according to the following procedure:</p>
<p style="text-align: justify;">
<ul>
<li>
<div style="text-align: justify;">All attributes of <span style="font-family: Arial; font-size: 10pt;">R</span> are added to the set of concepts:</div>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Arial; font-size: 10pt;">C = C<sub>0</sub> U A<br />
</span></p>
</li>
<li>
<div style="text-align: justify;">Non-key attributes are added to the set of predicates:</div>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Arial; font-size: 10pt;">P = P<sub>0</sub> U (A &#8211; K)</span></p>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Arial; font-size: 10pt;">D&#8217; = {d&#8217;<sub>i</sub>}</span> is the set of unique values of <span style="font-family: Arial; font-size: 10pt;">D</span>: <span style="font-size: 10pt;"><span style="font-family: MS Mincho;">∀</span><span style="font-family: Arial;"> i,j (i ≠ j) → (d&#8217;<sub>i</sub> ≠ d&#8217;<sub>j</sub>)</span></span>. The values in <span style="font-family: Arial; font-size: 10pt;">D&#8217;</span> that are not already present in <span style="font-family: Arial; font-size: 10pt;">G<sub>0</sub></span> are added to the set of signs:</div>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Arial; font-size: 10pt;">G = G<sub>0</sub> U (D&#8217;– G<sub>0</sub>)</span></p>
</li>
<li>
<div style="text-align: justify;">We build the set of terms <span style="font-family: Arial; font-size: 10pt;">T<sub>R</sub> =</span> {<span style="font-family: Arial; font-size: 10pt;">t<sub>ij</sub></span>} where <span style="font-family: Arial; font-size: 10pt;">t<sub>ij</sub>=&lt;d<sub>ij</sub>, A<sub>i</sub>&gt; </span>and <span style="font-family: Arial; font-size: 10pt;">1 ≤ i ≤ n</span>, <span style="font-family: Arial; font-size: 10pt;">1≤ j ≤ m</span>. <span style="font-family: Arial; font-size: 10pt;">T&#8217;<sub>R</sub></span> is the subset of unique terms of <span style="font-family: Arial; font-size: 10pt;">T<sub>R</sub></span>. Terms of <span style="font-family: Arial; font-size: 10pt;">T&#8217;<sub>R</sub></span> are added to <span style="font-family: Arial; font-size: 10pt;">T<sub>0</sub></span>.</div>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Arial; font-size: 10pt;">T = T<sub>0</sub> U T&#8217;<sub>R</sub></span></p>
</li>
<li>
<div style="text-align: justify;">We build the set of statements <span style="font-family: Arial; font-size: 10pt;">S<sub>R</sub> =</span> {<span style="font-family: Arial; font-size: 10pt;">s<sub>ij</sub></span>} where <span style="font-family: Arial; font-size: 10pt;">s<sub>ij</sub> = &lt; &lt;d<sub>kj</sub>, K&gt;, A<sub>i</sub>, &lt;d<sub>ij</sub>, A<sub>i</sub>&gt; &gt;</span>, <span style="font-family: Arial; font-size: 10pt;">d<sub>kj</sub></span> represents the combination of values of the key attributes for the row <span style="font-family: Arial; font-size: 10pt;">W<sub>j</sub></span>, <span style="font-family: Arial; font-size: 10pt;">A<sub>i</sub></span><br />
<span style="font-family: Arial; font-size: 10pt;">∈</span><br />
<span style="font-family: Arial; font-size: 10pt;">A</span>-K, and <span style="font-family: Arial; font-size: 10pt;">k+1 ≤ i ≤ n</span>, <span style="font-family: Arial; font-size: 10pt;">1≤ j ≤ m</span>. Statements of  <span style="font-family: Arial; font-size: 10pt;">S<sub>R</sub></span> are added to <span style="font-family: Arial; font-size: 10pt;">S<sub>0</sub></span>:</div>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Arial; font-size: 10pt;">S = S<sub>0</sub> U S<sub>R</sub></span></p>
</li>
</ul>
<p style="text-align: justify;">
<p style="text-align: justify;">Representation of <span style="font-family: Arial; font-size: 10pt;">R</span> in DDF is lossless (no loss or distortion of data and semantics, even though semantics of <span style="font-family: Arial; font-size: 10pt;">R</span> is not explicitly represented in DDF) because we can restore <span style="font-family: Arial; font-size: 10pt;">R</span> from DDF:</p>
<ol>
<li>
<div style="text-align: justify;"><span style="font-family: Arial; font-size: 10pt;">R</span> is contained in statements <span style="font-family: Arial; font-size: 10pt;">S</span>, therefore, using processing metadata (described in the following section and shown in Fig. 6), extract from <span style="font-family: Arial; font-size: 10pt;">S</span> the statements that originated from <span style="font-family: Arial; font-size: 10pt;">R</span>:</div>
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Arial; font-size: 10pt;">S<sub>R</sub> =</span> {<span style="font-family: Arial; font-size: 10pt;">s<sub>ij</sub></span>} where <span style="font-family: Arial; font-size: 10pt;">s<sub>ij</sub> = &lt; &lt;d<sub>kj</sub>, K&gt;, A<sub>i</sub>, &lt;d<sub>ij</sub>, A<sub>i</sub>&gt; &gt;<br />
</span></p>
</li>
<li>
<div style="text-align: justify;">From <span style="font-family: Arial; font-size: 10pt;">S<sub>R</sub></span> restore the structure and rows of <span style="font-family: Arial; font-size: 10pt;">R</span> as follows:</div>
</li>
</ol>
<div>
<table style="border-collapse: collapse;" border="0">
<colgroup>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
</colgroup>
<tbody>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border: solid 0.5pt;">
<p style="text-align: center;"><span style="text-decoration: underline;"><strong>K</strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>k+1</sub></strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>k+2</sub></strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><strong>…</strong></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>n</sub></strong></span></p>
</td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k1</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+1,1</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+2,1</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>n1</sub></span></td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k2</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+1,2</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+2,2</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>n2</sub></span></td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>km</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+1,m</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+2,m</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>nm</sub></span></td>
</tr>
</tbody>
</table>
</div>
<p style="text-align: justify;">The process that was used to build combinations of values of the key attributes can be reversed to get to the relation in its original form:</p>
<div>
<table style="border-collapse: collapse;" border="0">
<colgroup>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
<col style="width: 65px;"></col>
</colgroup>
<tbody>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>1</sub></strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><strong>. . .</strong></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>k</sub></strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>k+1</sub></strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>k+2</sub></strong></span></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><strong>…</strong></p>
</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: solid 0.5pt; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">
<p style="text-align: center;"><span style="font-family: Arial; font-size: 10pt;"><strong>A<sub>n</sub></strong></span></p>
</td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>11</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">. . .</span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k1</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+1,1</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+2,1</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>n1</sub></span></td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>12</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">. . .</span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k2</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+1,2</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+2,2</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>n2</sub></span></td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">. . .</span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">. . .</span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
</tr>
<tr>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: solid 0.5pt; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>1m</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">. . .</span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>km</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+1,m</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>k+2,m</sub></span></td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;">. . .</td>
<td style="padding-left: 7px; padding-right: 7px; border-top: none; border-left: none; border-bottom: solid 0.5pt; border-right: solid 0.5pt;"><span style="font-family: Arial; font-size: 10pt;">d<sub>nm</sub></span></td>
</tr>
</tbody>
</table>
</div>
<p style="text-align: justify;">
<p style="text-align: justify;">Therefore, by the integration procedure described above, the data and data-semantics from <span style="font-family: Arial; font-size: 10pt;">R</span> are faithfully represented with the DDF.  The structure of <span style="font-family: Arial; font-size: 10pt;">R</span> itself and its identity integrity are explicitly captured in Layer 3.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">This procedure further reveals two powerful and <em>distinguishing</em> features of the DDF:</p>
<ul>
<li>
<div style="text-align: justify;">The DDF can accommodate data and data-semantics from structured sources without loss or distortion.</div>
</li>
<li>
<div style="text-align: justify;">Structured artifacts can be integrated within the DDF in a mechanical fashion without requiring prior knowledge and/or analysis of their domain-specific data-models.</div>
</li>
</ul>
<p style="text-align: justify;">
<h4>Integration Power</h4>
<p style="text-align: justify;">The DDF exposes four levels of data and semantics (signs, terms, statements, and concepts and predicates), which support four levels of <em>largely independent</em> integration actions or patterns: establishing signs, disambiguation, association, and data-model enhancement:</p>
<ul>
<li>
<div style="text-align: justify;">Signs are established from mentions asserted by users or data ingest processes.  Many mentions may relate to the same sign, and it is this &#8220;re-use&#8221; of signs that provides a primal level of data integration.</div>
</li>
<li>
<div style="text-align: justify;"><span style="font-family: Times New Roman; font-size: 9pt; text-decoration: underline;">Data-model enhancement occurs when </span>processes operating on Layer 3 extend, enhance, or harmonize data-models associated with the incorporated data sources.  Since these operations do not affect the original data-models, but serve to establish new overarching data-models, multiple perspectives can co-exist (e.g. both the original and enhanced).</div>
</li>
<li>
<div style="text-align: justify;">Disambiguation actions create associations between signs and concepts, either when data is ingested into the DDF, or by a subsequent semantic process.  The same sign may be disambiguated in any number of different ways and signs and concepts may be associated regardless of their originating data source. In other words, the sign and the concept to which it is bound may originate from disparate sources.</div>
</li>
<li>
<div style="text-align: justify;">Finally, association actions create binary statements between terms and/or statements using a predicate.  These may originate from semantic relationships expressed within a data source or may be created by a subsequent semantic process. Any term / statement may be associated to any other regardless of their origin.</div>
</li>
</ul>
<p style="text-align: justify;">
<p style="text-align: justify;">By virtue of these integration actions, the DDF is able to support an essentially endless process of semantic enrichment and each of these actions may be conducted on any integration level without necessarily affecting the other elements.</p>
<h4>Evolutionary Power</h4>
<p style="text-align: justify;">The independence of the integration actions imbues the DDF with extraordinary evolutionary power. New sources, modifications of the integrated data sources, and changes of integration models can be accommodated without requiring the data space to be rebuilt. In particular, the growth of a source system, both in terms of data and semantics, can be accommodated by the addition of new DDF elements (signs, terms, statements, concepts, and predicates) associated with that growth.  Source system modifications can be handled in several ways depending on how the integrated store is used.  Because the dependencies between all of the elements in the Framework are known, we can always define how to propagate / manage changes and the approaches employed may vary from source to source.  Finally, new integration models can be introduced by the integration actions described previously and these will simply co-exist with all other models. If an integration model needs to be modified, we can introduce it as a new model or mark the changed elements appropriately and propagate those changes.</p>
<h3>DDF in Relationship to RDF</h3>
<p style="text-align: justify;">The reader familiar with the Resource Description Framework (RDF/RDFS) may wonder what is different here. Indeed, RDF and DDF share DNA, so to speak, since both employ a similar level of abstraction and make semantics explicit. Unlike RDF however, DDF also prescribes the exposure of data as signs which can freely participate in the disambiguations and associations necessary for data integration. In other words, data represent themselves directly as signs and participate in the DDF as first-class citizens. Because of the way they are designed, DDF signs provide a primal level of data integration. In contrast, data in RDF are represented either as literals or by URIs. A datum represented as an RDF literal cannot be explicitly disambiguated or associated. Furthermore, because the URI is a first-class citizen, not the datum, there is no mechanism in RDF to prevent a single datum from being represented by multiple URIs and or literals.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">This is not a criticism of RDF as these differences reflect the fact that DDF is an abstract model aimed at data integration, whereas, RDF is a meta-model. Thus, employing RDF for data integration necessitates building a model in RDF (i.e. a particular meta-model instance) along with rules prescribing the manner of data exposure. In contrast, DDF is a model that makes explicit commitments to support data integration. Because this model represents an abstraction over domain data-models, the DDF can represent data structured by any data-model, and be represented in any meta-model (including RDF). Fig. 6 illustrates the place of DDF in relation to the models and meta-models.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost91.png" alt="" /><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost101.png" alt="" /></p>
<h2>Implementation</h2>
<p style="text-align: justify;">A universal storage model based on DDF can be implemented in a variety of ways (e.g. objects, relations, triples).  We have explored and implemented two approaches, the first using relational technology (Oracle, mySQL) and the second using cloud technology (Hadoop / HBase) [Hadoop], [HBase], [Chang 2006].  In both cases we employ the Dimensional Data Modeling (DDM) approach [Kimball 2002] because it nicely captures the business processes associated with moving intelligence artifacts upward through the cognitive hierarchy while accommodating the metadata that intelligence processing requires.  In particular, we maintain not only the contextual metadata about the indigenous artifact itself (e.g. the who what when and where of its creation and transmission), but also process metadata regarding the processing by which signs, terms and statements are created.  The former are captured in the Layer 1 storage structure as described previously while the latter are accommodated in Layer 2.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://imintel.org/blog/wp-content/uploads/2010/02/021010_1458_Renamepost111.png" alt="" />As reflected in Fig. 7, the essential intelligence business processes that the DDF captures are semantic disambiguation and association formation.  Thus, the DDF storage model consists of two main fact-tables, <span style="font-family: Arial; font-size: 10pt;">SemanticFact</span><em><br />
</em>and<em><br />
</em><span style="font-family: Arial; font-size: 10pt;">AssociationFact</span>. The <span style="font-family: Arial; font-size: 10pt;">SemanticFact</span> table records metrics relating to the formation and disambiguation of signs, and references dimension tables that record signs, concepts, and process metadata.  The signs themselves are represented using two tables, <span style="font-family: Arial; font-size: 10pt;">Sign</span> and <span style="font-family: Arial; font-size: 10pt;">Mention</span>.  The value of a mention is identified by the region of the artifact in which it is localized.  The boundary of such a region is recorded in the <span style="font-family: Arial; font-size: 10pt;">Mention</span> table.  The value of a sign may represent any number of source mentions that are exactly the same or are considered to be the same from the perspective of the process which extracts / identifies them.  The <span style="font-family: Arial; font-size: 10pt;">Concept</span> dimension records elements from the domain knowledge which includes the source artifacts&#8217; data-models.  Each record in the <span style="font-family: Arial; font-size: 10pt;">SemanticFact</span> table binds a sign to a concept using <span style="font-family: Arial;">&#8216;<span style="font-size: 10pt;">isInstanceOf&#8217;</span><span style="font-size: 9pt;"><br />
</span></span>semantics.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">
<p style="text-align: justify;">The <span style="font-family: Arial; font-size: 10pt;">AssociationFact</span> table records metrics relating to the formation of associations and references dimension tables that record statements, predicates, and process metadata.  Recall that statements come in three types – an association between terms (i.e. statement), an association between a term and another statement (i.e. reification), and an association between two statements (statement relation).  These are accommodated by the three subclasses of the <span style="font-family: Arial; font-size: 10pt;">Statement</span> dimension which are <span style="font-family: Arial; font-size: 10pt;">Statement0, Statement1, Statement2 </span>respectively.  The <span style="font-family: Arial; font-size: 10pt;">Predicate</span> dimension records predicates from the domain knowledge.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The <span style="font-family: Arial; font-size: 10pt;">ProcessMetadata</span> package shown in Fig. 7, represents a collection of dimensional tables used to record operational and contextual metadata about the various external processes that create <span style="font-family: Arial; font-size: 10pt;">SemanticFact</span> and <span style="font-family: Arial; font-size: 10pt;">AssociationFact</span> records.  The particular elements and formulation of this metadata would be designed to support the information assurance needs of the Intelligence Community.  Typically these would include <span style="font-family: Arial; font-size: 10pt;">Date</span>, <span style="font-family: Arial; font-size: 10pt;">Time</span>, <span style="font-family: Arial; font-size: 10pt;">Creator</span>, and <span style="font-family: Arial; font-size: 10pt;">SecurityClassification</span> dimensions.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The DDF does not prescribe or constrain the processing by which the DDF storage model would be populated, and the nature of such processing depends both on the modality and structure (or lack thereof) of the indigenous artifacts.  Nevertheless, to illustrate how DDF works, and provide more insight into the relationship between external processes and our Data Integration Framework, the interested reader may find a brief discussion of the processing by which Layers 1 and 2 would be populated in the Appendix.</p>
<p style="text-align: justify;">
<h2>Relation to Other Approaches</h2>
<p style="text-align: justify;">Data quality professionals widely recognize the importance of data integration and the need for efficient data integration approaches to redress a panoply of data quality problems [Lee 2006].</p>
<p style="text-align: justify;">
<p style="text-align: justify;">A large body of work exists on data integration approaches [Batini 1986], [<span style="font-family: Times New Roman;">Parent 1998], [Halevy 2005],[</span>Bernstein 2007], many of which have contributed to successful Enterprise Information Integration solutions. However, because they all are based on some kind of data-model harmonization (i.e. mapping), they fail to provide a practical solution for ULS intelligence data integration.  In particular, data-model integration does not address <em>data</em> integration, which intelligence data processing requires. Physical data integration, typical of data warehouse applications, also requires heavy up-front data-model analysis and harmonization.  This activity is not only resource intensive, it often results in the loss and/or distortion of data and its semantics which, in the context of intelligence, may reduce the richness and power of the data.  DDF addresses the needs of the Intelligence Community by supporting ad-hoc, lossless data integration without imposing a heavy pre-processing burden.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The Dataspaces approach introduced by Halevy et al. [Franklin 2005], [Halevy 2005], [Halevy 2006-1], [Halevy 2006-2] is similar in philosophy to the DDF in that it supports the co-existence of disparate data sources regardless of their type.</p>
<p style="text-align: justify; margin-left: 36pt;">&#8220;<span style="font-family: Times New Roman;">Dataspaces are not a data integration approach; rather, they are more of a </span><em>data co-existence<span style="font-family: Arial;"><br />
</span></em><span style="font-family: Times New Roman;">approach. The goal of Dataspaces support is to provide base functionality over all data sources, regardless of how integrated they are</span>&#8221; [Franklin 2005].</p>
<p style="text-align: justify; margin-left: 36pt;">
<p style="text-align: justify;">With this approach, a mediated (general, global) schema serves as an integration model. Individual sources participate in the integration by exposing Local-As-View (LaV) schemas that comply with the mediated schema. In practice, a LaV is implemented as a view on the source, and the mediated schema provides an interface to Dataspaces participants. To support the storage of new associations between data (a shortcoming of  virtual integration) it may be necessary to introduce a Local Store for these associations. DDF and Dataspaces represent two different approaches to data management and have different niches: Dataspaces focuses on eliminating the barriers to data access and provides some limited data integration capability, whereas DDF focuses on comprehensive data integration and supports deep semantic enrichment. The DDF can leverage the Dataspaces as a mechanism of data access, and participate within Dataspaces as a semantically rich Integrated Local Store.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">A number of commercial products, including the ones that are being used in military applications, e.g. Palantir [Palantir], are based on the object meta-model where data entities are represented by objects with dynamic properties. Although they claim to be able to accommodate data from any structured or unstructured source, in practice they impose a particular, albeit modifiable, data-model on the structured data. This requires heavy pre-processing of the source, as is typical for such solutions, and results in loss and or distortion of source data and data semantics. In addition, customers are dependent on the solution provider as they cannot perform modifications of the integration model.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The Information Model Interoperability Reference Model [Melnik 2000], [Omelayenko 2001], proposed for presenting information on the web, consists of three layers –  syntax, object, and semantic.  The syntax layer represents serialized data content, similar to our indigenous text artifacts.  The semantic layer provides semantics through data-models and languages, and the object layer provides a bridge between the two.  Unfortunately however, the IMI does not provide a practical model for the implementation of those layers and their interfaces.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The Data Reference Model (DRM) of the Federal Enterprise Architecture (FEA) aims to provide standards for the description, categorization, and sharing of data [DRF 2005].  Unlike DDF however, it does not resolve the issues of data integration and unfortunately exhibits the typical shortcomings of most physical and virtual data integration approaches.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Finally, the Common Warehouse Model (CWM) [CWM 2001] offers a standardized approach (and tools that support it) for representing and mediating the automated interchange of metadata in warehouse applications that involve multiple data sources and data processing applications.   Being focused on metadata integration, as opposed to data integration, the CWM mainly addresses issues relating to Layer 3 of our Data Integration Framework.</p>
<h2>Current &amp; Future Work</h2>
<p style="text-align: justify;">Today there is a deployed system called the Joint Intelligence Operational Capability in Iraq (JIOC-I) that essentially implements Layer 1 of our Data Integration Framework, though only for text artifacts.  Unfortunately, the JIOC-I by itself falls short of a complete integration solution because it does not address structured data in a way that exposes that structure to support further analytical processing and visualization.  In other words, it lacks Layer 2.  Consequently, there has been much criticism of the JIOC-I, along with various suggestions for &#8220;fixing&#8221; it (e.g. by extending the schema to accommodate structured data).  In contrast, we recognize the JIOC-I as a foundational element (that got it mostly right) and a first step toward a ULS intelligence system that integrates data while embracing data diversity.  Indeed, the JIOC-I was the inspiration that led us to develop the layers above, and the DDF in particular.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">While several relational database implementations of Layers 1 and 2 of our Data Architecture and Semantic Integration Framework are being developed and tested in various US Army CERDEC I2WD projects, our most ambitious implementation is being made for the Army using cloud computing technology.  Our current effort on a 52-node cloud implements all three Layers of the data architecture using the Hadoop Distributed File System (HDFS) and HBase to achieve an Ultra-Large Scale, unified &#8220;dataspace&#8221; that supports not only a diversity of data, but also a diversity of processing (Hadoop Map / Reduce [Dean 2004] analytics).  Although we reserve the performance metrics and details of the DDF implementation within the cloud computing key-value meta-model for a separate paper, it is interesting to note that cloud appears to accommodate the DDF in a  particularly efficient manner.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Other key aspects of our Data Integration Framework are described elsewhere.  [Yoakum 2008 IQIS] highlights the low barrier to entry for data integration by describing the process for lossless mechanical data ingestion which requires no costly pre-processing or data-model harmonization. Data surfing, drilling, and discovery on the DDF unified data space are described in [Yoakum 2008 DAMA].  [Yoakum 2008 SIMA] addresses the utility of DDF in Situation Management – another activity that requires rapid, ad-hoc data integration.  Finally, [Yoakum 2009] describes the formation of a contiguous and therefore navigable integrated data space that enables &#8220;vertical&#8221; integration from artifacts to structured data and to knowledge models, and &#8220;horizontal&#8221; integration of data from various sources.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">As they are matured, we anticipate that Layers 3 and 4 of the Framework will provide fertile ground for entirely new work in knowledge interaction and perception.  Layer 3 serves as a universal substrate on which to explore, discover, and encode relationships between knowledge models that go well beyond harmonization and integration to include, for example, dissonant perspectives which can not and should not be &#8220;harmonized.&#8221;  Layer 4 provides the lenses through which the human user looks into this cauldron of knowledge, information, and data to explore and make sense of the object of his interest (e.g. a domain, a situation, an entity) according to a chosen perspective. Having all four layers present will close the loop between data and knowledge in both directions so that they may co-evolve to yield more complete and accurate understanding.  Atop the immense foundation of integrated data provided by Layers 1 and 2, Layers 3 and 4 will fuel the engines of ULS systems research for a very long way into the future.</p>
<h2>Conclusion</h2>
<p style="text-align: justify;">The Intelligence Enterprise is inexorably evolving into an Ultra-Large Scale systems world that can not, and will not, be constrained in its processes or products.  The data integration problem is but one early symptom of this burgeoning reality.  Although this knowledge does not provide a recipe for good solutions, it makes it rather easy to spot bad ones.  Unfortunately, current data integration approaches generally represent the latter.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">In this paper, we have presented the first two layers of a multi-layer Data Integration and Semantic Enrichment Framework that enables deep semantic data integration in a ULS systems environment.  The model on which it is founded, the DDF, supports both horizontal and vertical data integration (i.e. across disparate data-models and from data to knowledge) by embracing the diversity of data / knowledge models and processes by which data is structured.  More importantly, the model admits a practical implementation (i.e. &#8220;hard running code&#8221;) that accommodates artifacts of any modality (e.g. text, audio, images, video, signals) in a single unified data store that enables true data integration and the continuous enrichment of data into knowledge.</p>
<h2>References</h2>
<p style="text-align: justify; margin-left: 28pt;"><span style="font-family: Times New Roman;">[Batini 1986] Batini, C. et al. <em>A comparative analysis of methodologies for database schema integration</em>, ACM Computing Surveys, (18) 4, 1986.<br />
</span></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Bernstein 2007] Bernstein P., Ho, H<span style="font-family: Arial;">. </span><em>Model Management and Schema Mappings: Theory and Practice</em>, Proceedings of VLDB Conference, 2007.</p>
<p style="text-align: justify;">
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Times New Roman; font-size: 12pt;">[Chang 2006] Chang, F., et. al. <em>Bigtable: A Distributed Storage System for Structured Data. </em>2006. <a href="http://labs.google.com/papers/bigtable.html">http://labs.google.com/papers/bigtable.html</a><br />
</span></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[CWM 2001] Object Management Group &#8220;<em>Common Warehouse Model (CWM) Specification</em>&#8221;, OMG, 2001. <a href="http://www.omg.org/docs/ad/01-02-01.pdf">http://www.omg.org/docs/ad/01-02-01.pdf</a></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Date 2004] Date, C. <em>An Introduction to Database Systems, 8<sup>th</sup> edition, </em>Addison Wesley, 2004.</p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 28pt;">[Dean 2004] Dean, J. and Ghemawat, S. MapReduce: <em>Simplified Data Processing on Large Clusters</em>. 2004. <a href="http://labs.google.com/papers/mapreduce.html">http://labs.google.com/papers/mapreduce.html</a></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 28pt;">[DRF 2005] Federal Enterprise Architecture Program &#8220;<em>The Data Reference Model</em>&#8221;, 2005. <a href="http://www.whitehouse.gov/omb/egov/documents/DRM_2_0_Final.pdf">http://www.whitehouse.gov/omb/egov/documents/DRM_2_0_Final.pdf</a></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Franklin 2005] Franklin, M., <span style="font-size: 10pt;">Halevy, A</span>., and Maier, D<span style="font-size: 10pt;">. </span><em>From Databases to Dataspaces: A New Abstraction for Information Management</em>. <em>ACM SIGMOD Record, 2005.<br />
</em></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify;"><span style="font-family: Times New Roman; font-size: 12pt;">[Hadoop] Apache Hadoop. <a href="http://hadoop.apache.org/">http://hadoop.apache.org/</a><br />
</span></p>
<p style="margin-left: 18pt;">
<p style="margin-left: 18pt;"><span style="font-family: Times New Roman;">[Halevy 2005] Halevy, A. <em>et al</em>. <em>Enterprise information integration: successes, challenges and controversies</em>, Proceedings of 24th International Conference on Management of Data, Baltimore, 2005.<br />
</span></p>
<p style="margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Halevy 2006 - 1] Halevy, A. Franklin, M., and Maier, D. <em>Principles of Dataspace Systems</em>. Proceedings of the twenty-fifth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems, 2006<em>.<br />
</em></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;"><span style="color: black;">[Halevy 2006 - 2] Halevy, A., </span>Rajaraman<span style="color: black;">, A., and </span>Ordille, J.<span style="color: black;"><br />
</span><em>Data Integration: The Teenage Years.</em> Proceedings of VLDB Conference, 2006<em>.</em></p>
<p style="margin-left: 28pt;">
<p><span style="font-family: Times New Roman;">[HBase] </span>HBase Tutorial. <a href="http://arunma.com/2008/10/26/hbase-tutorial/">http://arunma.com/2008/10/26/hbase-tutorial/</a></p>
<p style="margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Izydor 2007] <a href="http://www.dmreview.com/authors/1086246.html">Izydor</a>, C.  and <a href="http://www.dmreview.com/authors/1086247.html"> McCollum</a>, P. <em>B<span style="color: #373632;">I, Process and Integration Trends</span></em>. DM Review Magazine, August 2007.   <a href="http://www.dmreview.com/issues/20070801/1089409-1.html?portal=data_integration">http://www.dmreview.com/issues/20070801/1089409-1.html?portal=data_integration</a></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Kimball 2002]  Kimball, R. and Ross, M. <em>The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling</em>,  Wiley,  2002.</p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;"><span style="color: black;">[Lee 2006] Lee, Y., Pipino, L., Funk, J., Wang, R. <em>Journey to Data Quality</em>, The MIT Press, Cambridge, MA,  2006</span></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Melnik 2000]  Melnik, S. and Decker, S.  <em>A layered approach to Information Modeling and Interoperability on the Web</em>. Proc. ECDL&#8217;00 Workshop on the Semantic Web, Lisbon, Portugal, Sept 2000. <a href="http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&amp;doc=2000-30&amp;format=pdf&amp;compression=&amp;name=2000-30.pdf"></a><a href="http://infolab.stanford.edu/~melnik/pub/sw00/">http://infolab.stanford.edu/~melnik/pub/sw00/</a>.</p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Northrop 2006]  Northrop, L., <em>et al.</em>, <em>Ultra-Large-Scale Systems The Software Challenge of the Future</em>,  Pittsburgh: Carnegie Mellon University,  2007. <a href="http://www.sei.cmu.edu/publications/books/engineering/uls.html">http://www.sei.cmu.edu/publications/books/engineering/uls.html</a></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Omelayenko 2001]  Omelayenko, B. and Fensel, D.  <em>An Analysis of B2B Catalogue Integration Problems.</em> Proceedings of the International Conference on Enterprise Information Systems (ICEIS-2001), July 7-10, 2001, p. 945-952.</p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Palantir] Palantir Technologies. <a href="http://www.palantirtech.com/">http://www.palantirtech.com/</a></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;"><span style="font-family: Times New Roman;">[Parent 1998] Parent, C. and Spaccapietra, S. <em>Issues and approaches of database integration</em>, Communications of the ACM, 41(5), 1998.<br />
</span></p>
<p style="text-align: justify; margin-left: 36pt;">
<p style="text-align: justify; margin-left: 28pt;"><span style="font-family: Times New Roman;">[RDF 2004] </span><span style="color: black;">RDF Core Working Group</span><span style="font-family: Times New Roman;"> &#8220;Resource Description Framework (RDF)&#8221;, W3C, 2004. <a href="http://www.w3.org/RDF/">http://www.w3.org/RDF/</a>.<br />
</span></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify; margin-left: 18pt;">[Steinberg 1998]  <span style="font-family: Times New Roman;">Steinberg, N.,  Bowman, C. L. and White F. E. <em>Revision to the JDL Data Fusion Model</em>, Joint NATO/IRIS Conference, Quebec City, October 1998.<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Yero 2008] Yero, J. <span style="color: black;"><em>Logical vs. Physical Data Integration: A Practical Decision Guide</em>,  The DAMA International Symposium &amp; Wilshire Meta-Data Conference. San-Diego, CA, 2008.<br />
</span></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Yoakum 2008 IQIS] Yoakum-Stover, S. and Malyuta, T. <em>Unified Architecture for Integrating Intelligence Data</em>, Proceedings of MIT Information Quality Industry Symposium, MIT, Cambridge, MA, 2008.  <a href="http://blog.systover.net/">http://blog.systover.net/</a></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Yoakum 2008 DAMA] Yoakum-Stover, S. and Malyuta, T. <em>Unified Integration Architecture for Intelligence Data</em>, Proceedings of DAMA International Europe Conference, London, UK, 2008.  <a href="http://blog.systover.net/">http://blog.systover.net/</a></p>
<p style="text-align: justify; margin-left: 18pt;">
<p style="text-align: justify; margin-left: 18pt;">[Yoakum 2008 SIMA] Yoakum-Stover, S. and Malyuta, T. <em>Unified Data Integration for Situation Management</em>, Proceedings of the 4th IEEE Workshop on Situation Management (SIMA 2008) at MILCOM 2008, San Diego CA, 2008.  <a href="http://blog.systover.net/">http://blog.systover.net/</a></p>
<p style="text-align: justify; margin-left: 18pt;">
<h3><span style="font-family: Times New Roman; font-size: 12pt;">[Yoakum 2009] Yoakum-Stover, S., Malyuta, T., and Antunes, N<em>. A Data Integration Framework with Full Spectrum Fusion Capabilities.</em></span><br />
<span style="font-family: Times New Roman; font-size: 12pt;">MSS Information Fusion Symposium, Las Vegas, NV, Aug 3-7, 2009. </span> <a href="http://blog.systover.net/"><span style="color: blue; font-family: Times New Roman; font-size: 12pt; text-decoration: underline;">http://blog.systover.net/</span></a><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></h3>
<h2>Appendix &#8211; Processing</h2>
<h3>Ingestion</h3>
<p style="text-align: justify;">Consider first, processes that load indigenous artifacts into Layer 1 either physically or virtually so that they may be unambiguously referenced within Layer 2. Typically these are called ingestion processes. Such processes insert either the entire indigenous artifact, or a reference to its location within the authoritative data source, into Layer 1.  In addition, both artifact and process metadata are recorded in the appropriate metadata tables.  The former essentially provides a card catalogue for the artifact and the latter provides information assurance.</p>
<h3>Unstructured Information</h3>
<p style="text-align: justify;">Processes that structure unstructured artifacts generate SemanticFact and AssociationFact records in Layer 2.  Each such process necessarily entails a particular data-model.  This data-model is persisted in Layer 3.  Concepts and predicates from the data-model (or references to them) are also persisted in the <span style="font-size: 10pt;"><span style="font-family: Arial;">Concept</span><br />
</span>and <span style="font-family: Arial; font-size: 10pt;">Predicate</span> dimension tables of Layer 2 along with sufficient metadata to identify and retrieve the data-model source artifact (i.e. schema, ontology, etc.).</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Unstructured information processing typically identifies all instances of the concepts within its data-model or type system.  For example, a given text extractor may identify all occurrences of  <span style="font-family: Arial; font-size: 10pt;">&#8216;IBM&#8217; </span>and associate them with the concept<span style="font-family: Arial; font-size: 10pt;"> &#8216;Company.&#8217; </span> Each such instance is represented as a DDF mention.  The position of each mention within the source artifact is recorded in the <span style="font-family: Arial; font-size: 10pt;">Mention</span> table (e.g. using <span style="font-family: Arial; font-size: 10pt;">beginChar</span>, <span style="font-family: Arial; font-size: 10pt;">endChar</span>) and a single record is added to the <span style="font-family: Arial; font-size: 10pt;">Sign</span> table using, for example, the actual contents of the span (<span style="font-family: Arial; font-size: 10pt;">&#8216;IBM&#8217;) </span>as the sign value.  Each disambiguation occurrence (i.e. the association made by the text extractor between a mention and a concept) is recorded in the <span style="font-family: Arial; font-size: 10pt;">SemanticFact table</span> along with appropriate process metadata, and a term consisting of <span style="font-family: Arial; font-size: 10pt;">&lt;sign, concept&gt; </span>is created in the <span style="font-family: Arial; font-size: 10pt;">Term</span> table (if such term does not already exist).</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Further semantic processing may identify relationships between elements within the artifact.  The elements themselves would have already been recorded as SemanticFacts.   For each such relationship, an AssociationFact is recorded along with appropriate process metadata, and a <span style="font-family: Arial; font-size: 10pt;">Statement</span> table entry is created.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Unstructured information processing of other than text artifacts is similar,  the main differences being that entries in the <span style="font-family: Arial; font-size: 10pt;">Mention</span> table will have a different <span style="font-family: Arial; font-size: 10pt;">spanCoordinateType</span>, and the method for assigning a sign value will be different. For example, consider object recognition software that extracts faces from within an image of a crowd.  For each extracted face, the corresponding rectangular area of the image could be recorded in the <span style="font-family: Arial; font-size: 10pt;">Mention</span> table with the help of <span style="font-family: Arial; font-size: 10pt;">pixelUpperLeft</span> and  <span style="font-family: Arial; font-size: 10pt;">pixelLowerRight</span>, and a sign (e.g. <span style="font-family: Arial; font-size: 10pt;">&#8216;Suzi&#8217;s faceImage&#8217;)</span> would be assigned to all extracted mentions.</p>
<h3>Extract-Transform-Load</h3>
<p style="text-align: justify;">Consider next, Extract-Transform-Load (ETL) processes that pull data from other structured data sources, typically databases, into Layer 2.  The initial phase of the ETL loads the source data-model (e.g. database data dictionary) into Layer 3, and concepts and predicates (or their references) into the <span style="font-family: Arial; font-size: 10pt;">Concept</span> and <span style="font-size: 10pt;"><span style="font-family: Arial;">Predicate</span><br />
</span>dimension tables of Layer 2.  Sufficient metadata necessary to identify and retrieve the data-model source artifact (i.e. schema), are also stored.  Subsequent ETL processing, which entails a mapping to the DDF structure, inserts signs, terms, and statements into the <span style="font-family: Arial; font-size: 10pt;">SemanticFact</span> and <span style="font-family: Arial; font-size: 10pt;">AssociationFact</span> tables along with appropriate process metadata.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Because the ETL process needs only to capture the explicit semantics of the source meta-model (e.g. relational, hierarchical, graph…), one ETL can be developed for a whole class of data stores.  For example, a discussion of ETL for relational stores may be found in [Yoakum 2008 IQIS].</p>
<h3>Interactive</h3>
<p>Finally, consider an interactive user interface that enables an analyst to assert semantic and association facts directly into the DDF. The analyst will have the option to use existing concepts, predicates, terms, and statements or to create new ones.  In the case of the latter, recorded and asserted mentions will reference the source analyst. Metadata recorded for manual processes will also reference the source analyst.</p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/tIYhzQJ338o" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2010/02/10/unified-architecture-for-integrating-intelligence-data-3/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2010/02/10/unified-architecture-for-integrating-intelligence-data-3/</feedburner:origLink></item>
		<item>
		<title>Board of Director’s Meeting July 14, 2009</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/qMEB5RZwGQ8/</link>
		<comments>http://imintel.org/blog/2009/11/02/news-headline-2/#comments</comments>
		<pubDate>Mon, 02 Nov 2009 19:41:46 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.heroicdreams.com/missionfocus/?p=19</guid>
		<description><![CDATA[First Annual Board of Directors Meeting Date: July 14, 2009 Time: 1600 hrs Location: Suite 313, 510 King Street, Alexandria VA 22314 Highlights of the agenda include a review of the articles of incorporation, bylaws, treasurer&#8217;s report, new business, and announcements.]]></description>
			<content:encoded><![CDATA[<p style="text-align: left;"><strong>First Annual Board of Directors Meeting</strong></p>
<p style="text-align: left;"><strong>Date: </strong>July 14, 2009</p>
<p style="text-align: left;"><strong>Time: </strong>1600 hrs</p>
<p style="text-align: left;"><strong>Location: </strong>Suite 313, 510 King Street, Alexandria VA 22314</p>
<p>Highlights of the agenda include a review of the articles of incorporation, bylaws, treasurer&#8217;s report, new business, and announcements.</p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/qMEB5RZwGQ8" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2009/11/02/news-headline-2/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2009/11/02/news-headline-2/</feedburner:origLink></item>
		<item>
		<title>Institute for Modern Intelligence Incorporates</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/TCw9yVB121M/</link>
		<comments>http://imintel.org/blog/2009/11/02/news-headline-1/#comments</comments>
		<pubDate>Mon, 02 Nov 2009 19:41:33 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.heroicdreams.com/missionfocus/?p=17</guid>
		<description><![CDATA[On April 7, 2009, the Institute for Modern Intelligence became a non-profit corporation with the state of Virginia. The directors are listed as: Mark Andrew Eick Suzanne Yoakum-Stover Oscar Wood]]></description>
			<content:encoded><![CDATA[<p>On April 7, 2009, the Institute for Modern Intelligence became a non-profit corporation with the state of Virginia.</p>
<p>The directors are listed as:</p>
<ol>
<li>Mark Andrew Eick</li>
<li>Suzanne Yoakum-Stover</li>
<li>Oscar Wood</li>
</ol>
<img src="http://feeds.feedburner.com/~r/imintel/~4/TCw9yVB121M" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2009/11/02/news-headline-1/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2009/11/02/news-headline-1/</feedburner:origLink></item>
		<item>
		<title>Job Postings Coming soon…</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/GfWTLpgyfy4/</link>
		<comments>http://imintel.org/blog/2009/11/02/job-1/#comments</comments>
		<pubDate>Mon, 02 Nov 2009 19:39:47 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[Jobs]]></category>

		<guid isPermaLink="false">http://www.heroicdreams.com/missionfocus/?p=9</guid>
		<description><![CDATA[Check back for updated job postings&#8230;.]]></description>
			<content:encoded><![CDATA[<p>Check back for updated job postings&#8230;.</p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/GfWTLpgyfy4" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2009/11/02/job-1/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2009/11/02/job-1/</feedburner:origLink></item>
		<item>
		<title>Unified Data Integration for Situation Management</title>
		<link>http://feedproxy.google.com/~r/imintel/~3/o5JUfTNTbuQ/</link>
		<comments>http://imintel.org/blog/2009/10/01/unified-data-integration-for-situation-management/#comments</comments>
		<pubDate>Thu, 01 Oct 2009 19:50:59 +0000</pubDate>
		<dc:creator>Andrew Eick</dc:creator>
				<category><![CDATA[Publications]]></category>
		<category><![CDATA[White Papers]]></category>
		<category><![CDATA[dataspace]]></category>

		<guid isPermaLink="false">http://www.imintel.org/blog/?p=12</guid>
		<description><![CDATA[We propose a new solution for data integration and semantic enrichment in support of Situation Management (SIMA). Our solution applies to any modality (e.g. text, images, audio, signals etc.) and embraces the diversity of data sources, types, and models, placing no restrictions on processes, applications, or users. It is database centric and proceeds in stages to address the unified storage of structured data and its semantic enrichment in a way that remains viable in an Ultra-Large Scale systems environment. The result is a layered data integration architecture that can accommodate any kind of data to coherently support the multiplicity of processing required for SIMA.]]></description>
			<content:encoded><![CDATA[<p style="text-align: left;"><a title="Printable copy of article" href="http://imintel.org/blog/wp-content/uploads/sima2008_2134_2-1.pdf">Printable copy of article</a></p>
<p style="text-align: left;"><em>4th IEEE Workshop on Situation Management (SIMA2008) at MILCOM 2008, San Diego CA, 2008</em></p>
<p style="text-align: center;">
<p style="text-align: center;">S. Yoakum-Stover, Ph.D.</p>
<p style="text-align: center;">Potomac Institute for Policy Studies</p>
<p style="text-align: center;">US Army CERDEC I2WD Information Exploitation Futures Lab</p>
<p style="text-align: center;">
<p style="text-align: center;">T. Malyuta, Ph.D.</p>
<p style="text-align: center;">New York City College of Technology</p>
<p style="text-align: center;">Computer Systems Technology Department</p>
<p style="text-align: center;">
<h2>Abstract</h2>
<p style="text-align: justify; margin-left: 36pt;">We propose a new solution for data integration and semantic enrichment in support of Situation Management (SIMA).  Our solution applies to any modality (e.g. text, images, audio, signals etc.) and embraces the diversity of data sources, types, and models, placing no restrictions on processes, applications, or users.  It is database centric and proceeds in stages to address the unified storage of structured data and its semantic enrichment in a way that remains viable in an Ultra-Large Scale systems environment.  The result is a layered data integration architecture that can accommodate any kind of data to coherently support the multiplicity of processing required for SIMA.</p>
<p style="text-align: justify; margin-left: 36pt;">
<h2>Challenge of Data Integration in Situation Management</h2>
<p style="text-align: justify;">Though generally scoped around a particular set of circumstances, or state of affairs, Situation Management (SIMA) is a mega-process occurring in a heterogeneous and volatile data space resulting from a cacophony of human and automated systems.  To understand a situation and engineer the means for managing it, we must organize its data space.  In particular, the heavy load of sophisticated processing for the anticipation, recognition, and influence of a situation must be girded with an architecture that enables data sourced from wildly disparate systems, having different modalities, structures, and semantics, to be integrated into one coherent body of situational knowledge.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">In most business intelligence applications, data is integrated across information systems to support a choreographed interplay of services comprising an established set of business processes.   In contrast, the constituent events in SIMA typically entail information systems that are far more diverse and whose dynamic interplay is less scripted, less repeatable, and therefore less predictable.  Since many of these information systems capture data for completely different and unrelated purposes, and were never intended as participants in a coherent process, for SIMA we require a data architecture that enables them to be dynamically re-used or re-purposed.  Because every situation is unique and we cannot anticipate all the right &#8220;business processes,&#8221; we need the capability to quickly fuse data often in high volumes from an ad-hoc set of systems, sometimes with knowledge asserted by analysts, in meaningful ways on the fly.</p>
<p><span id="more-12"></span></p>
<p style="text-align: justify;">
<p style="text-align: justify;">Traditional approaches to data integration, both physical and virtual [Batini 1986, <span style="font-family: Times New Roman;">Parent 1998, Halevy 2005, </span>Bernstein 2007], cannot accommodate the complexity, heterogeneity, and volatility of the SIMA data space.  In actual practice, the canonical data-models that underlie such approaches, including federation, are simply too rigid.  They cannot adapt their structure to handle new data sources, associations, processes, or applications without heavy manual intervention.  Moreover, such approaches generally result in the loss and/or distortion of data, semantics, and context, all of which may be useful or even critical in SIMA.  Even if initially successful, the IT costs associated with sustaining such systems as well as the human costs resulting from their deficiencies can be devastatingly high.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The scale and complexity of SIMA places it squarely in the domain of Ultra-Large-Scale systems which are characterized by decentralization; inherently conflicting, diverse, and  unknowable requirements; heterogeneous, changing and inconsistent elements; normal failures; continuous operation, evolution, and deployment; and immense scale along many dimensions [Northrop 2006].  As such, SIMA demands a supporting data architecture that remains viable in a freely evolving, interdependent collective of systems, people, policies, cultures, and economics, very little of which will ever be under our control. Our objective is to define such a solution.</p>
<p style="text-align: justify;">
<h2>Data Description Framework</h2>
<p style="text-align: justify;">To organize the SIMA data space in a ULS systems environment, we enable semantic data integration by providing for the unified storage of structured data.  We embrace the diversity of domain-specific data-models by taking a data-model agnostic approach wherein the integration model makes the least possible commitment to any particular data-model.  We achieve this by identifying the universal aspects inherent in all structured data and creating an integration model based on that.  A key aspect of our approach is that the character and meaning of the source data-model is preserved and made accessible by the data store.  The result is a data architecture that can accommodate any kind of data without placing restrictions on vocabulary, structure, semantics, or constraints, in a way that addresses the needs of the SIMA Community today while providing a seamless transition path toward a future of ULS systems imbued with semantic technologies.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The key to devising a domain-neutral storage model for structured data is to decouple that which varies, namely vocabularies and, more generally the data-models, from that which remains constant, namely the source artifact, and ideally the storage structure.  To achieve this, we consider structure, vocabulary, semantics, and constraints from a higher level of abstraction from which we then distill a minimal set of elements sufficient to capture any data-model.  These are illustrated in Fig. 1 and defined as follows:</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Sign: </em></strong>A <em>sign</em> is a chunk of data, either physically located within a tangible artifact, or contained within an analyst&#8217;s mind.  Examples of the former include a string of text in a document; an object within an image; a segment of audio in an audio stream; a spike in a signal.  As illustrated in Fig. 1, regardless of the type of medium, tangible signs are always associated with a physical extent (i.e. quantifiable span which we call a mention) within the artifact.  In contrast, signs that reside in an analyst&#8217;s mind become tangible when she writes down her thoughts.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Concept:</em><br />
</strong> A <em>concept</em> is an abstract idea, defined explicitly or implicitly by a source data-model.  For example, the nodes of an ontology, the tag set in an XML Schema Document (XSD), and the attribute / table names in a relational database all represent concepts.  <em>Concept</em> is an abstraction of such representations, which in the example of Fig. 1 includes <span style="font-family: Arial; font-size: 10pt;">Message</span>, <span style="font-family: Arial; font-size: 10pt;">Person</span>, and <span style="font-family: Arial; font-size: 10pt;">Body_text</span>.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata1.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata2.png" alt="" /></p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Predicate:</em></strong> A <em>predicate</em> is an abstract idea used to express a relationship between &#8220;things.&#8221;  They are used in the formation of <em>statements </em>(described below) and may be defined either explicitly or implicitly by a source data-model.  For example, the arcs of an ontology, and the attributes of an XML or database schema represent <em>predicates</em>.   In Fig. 1, <span style="font-family: Arial; font-size: 10pt;">To</span>, <span style="font-family: Arial; font-size: 10pt;">From,</span> and <span style="font-family: Arial; font-size: 10pt;">Body</span> represent <em>predicates</em>.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Term: </em></strong> A <em>term</em> is a disambiguated <em>mention</em> abstracted from the source artifact or asserting analyst.  The process of disambiguation associates a <em>mention</em> with a <em>concept,</em> implicitly using the <span style="font-family: Arial; font-size: 10pt;">IsInstanceOf</span><br />
<em>predicate</em>.  However, not every such pairing results in a distinct <em>term</em>.  All <em>signs</em> that are identical, and that are identified as having the same meaning, are represented by a single <em>term</em>. In the example of Fig. 1, <span style="font-family: Arial; font-size: 10pt;">Suzi</span><br />
<span style="font-family: Arial; font-size: 10pt;">IsInstanceOf</span><br />
<span style="font-family: Arial; font-size: 10pt;">Person </span>represents a <em>term</em>.<span style="font-family: Arial; font-size: 10pt;"><br />
</span></p>
<p style="text-align: justify;">
<p style="text-align: justify;"><strong><em>Statement: </em></strong>A <em>statement</em> encodes a binary relationship between a subject and an object mediated by a <em>predicate</em>. In our design, subject and object may be either a <em>term</em> or <em>statement</em>.  The simplest kind of <em>statement</em> is one in which subject and object are <em>terms</em>.  <em>Statements</em> in which the object is itself another <em>statement</em> represent reifications.  Finally, a <em>statement</em> in which both subject and object are other <em>statements</em> represents a relationship between <em>statements</em>.  In Fig. 1, we see three <em>statements</em>, all with the same subject, which is the <em>term</em> corresponding to the message itself.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">This organization of these elementary constructs (sign, concept, predicate, term, and statement) defines a data reference model, which we call the Data Description Framework (DDF) [Yoakum 2008 DAMA].  Because it effectively decouples data from data-models and structured data from data-structures, it can encapsulate any sort of data-model and support any data-structure.  Because it binds knowledge to data, it enables deep data integration and semantic enrichment.  Because it provides a foundation for implementing a stable database, it serves as a practical data integration platform.</p>
<p style="text-align: justify;">
<p>In the subsequent text, we represent mentions, concepts, and predicates using <span style="font-family: Arial; font-size: 10pt;">Arial</span> font.  Terms are denoted as<span style="font-family: Arial; font-size: 10pt;"> [mention, concept] (</span>e.g.<span style="font-family: Arial; font-size: 10pt;"> [Adam, Chemist]) </span>and statements are denoted using an intuitive triple representation, e.g. <span style="font-family: Arial; font-size: 10pt;">[Adam, Chemist] hasInventoryID [1001,InventoryID].</span></p>
<h2>The Unified Data Space</h2>
<p style="text-align: justify;">As illustrated in Fig. 2, the DDF forms a layer of data and semantics (Layer 2) lying between the indigenous source systems (Layer 1) and their knowledge models (Layer 3).   Layer 1 feeds the layers above, and Layers 2 and 3 interact:  Layer 3 provides semantic context for Layer 2 and Layer 2 participates in the formation of an overarching knowledge model in Layer 3.   Together Layers 2 and 3 form what we call the unified DDF data space.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">
<h2>Illustrative Example</h2>
<p style="text-align: justify;">To convey a more tangible understanding of the DDF to the user, in this section we present a simplified example that illustrates:</p>
<p style="text-align: justify;">
<ul>
<li>
<div style="text-align: justify;">Loading three disparate data sources into the DDF</div>
</li>
<li>
<div style="text-align: justify;">Surveying the resulting integrated data space</div>
</li>
<li>
<div style="text-align: justify;">Enhancing the data space with additional semantic associations</div>
</li>
<li>Exploring the enriched data space</li>
</ul>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata3.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata4.png" alt="" /></p>
<h3>Loading the DDF</h3>
<p style="text-align: justify;">Loading structured data into a DDF store is a straightforward, mechanical Extract – Transform – Load (ETL) process.  This process maps the original data and semantics into the DDF using a pattern that depends primarily on the type of data source because it needs only to capture the structure and semantics of the relational metamodel (not the structure and semantics of a specific instance).  For example, our prototype loader works out-of-the-box for most relational databases, extracting data structure and data from the source&#8217;s data dictionary and relations as follows:</p>
<p style="text-align: justify;">
<ul>
<li>
<div style="text-align: justify;">Data instances &#8594; signs</div>
</li>
<li>
<div style="text-align: justify;">Table attributes &#8594; concepts</div>
</li>
<li>
<div style="text-align: justify;">Signs are bound to their respective concepts to form terms</div>
</li>
<li>
<div style="text-align: justify;">Predicates are derived from non-key attributes (i.e. concepts) using &#8216;has&#8217; semantics.  For example the predicate derived from the concept <span style="font-family: Arial; font-size: 10pt;">Project</span> is <span style="font-family: Arial; font-size: 10pt;">hasProject.</span></div>
</li>
<li>
<div style="text-align: justify;">Within a record, terms associated with primary key columns are semantically linked via derived predicates to terms associated with non-primary key columns to form statements.  For example, <span style="font-family: Arial; font-size: 10pt;">[Adam, ChemistName] hasProject [P1, Project].</span></div>
<p style="text-align: justify;">
</li>
</ul>
<p style="text-align: justify;"><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata5.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata6.png" alt="" />Figures 4 and 5 illustrate the result of the mechanical ETL for the three data sources shown in Fig. 3.  For the purpose of our illustration, we assume that everything from the sources presented in Fig. 3 is loaded, but this need not be the case. We may freely choose which parts of a data source to load and when to load them.  For example, we may choose to load specific views of the source data, or perhaps only the structure of a data source, lazily loading instances only when requested.  Finally, the DDF can (and should) capture any desired metadata associated with the source artifacts, the ETL process itself, the quality / strength of semantic and association facts, or any other aspects of the data space elements. For simplicity we do not illustrate this.</p>
<p style="text-align: justify;">
<h3>Surveying the Unified Data Space Floor</h3>
<p style="text-align: justify;">We refer to the integrated data space that results simply from loading data into the DDF as the Unified Data Space Floor.   We may explore this space through querying.  For example, we may observe the spectrum of semantics of the sign <span style="font-family: Arial; font-size: 10pt;"><em>Adam</em></span> by issuing a query that asks, &#8216;What is <span style="font-family: Arial; font-size: 10pt;">Adam</span>?&#8217;  The result set will include all the concepts associated with the sign <span style="font-family: Arial; font-size: 10pt;">Adam</span><em><br />
</em>across all sources (i.e. <span style="font-family: Arial; font-size: 10pt;">ChemistName </span>and <span style="font-family: Arial; font-size: 10pt;">Chemist</span>).  Note that this simple yet penetrating question cannot be answered by any traditional data integration solution.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Another simple but useful question that traditional data integration solutions cannot answer is:  &#8216;Which data elements (i.e. signs)<em><br />
</em>in source B also appear in source C?&#8217;<sup><br />
</sup> The result is: <span style="font-family: Arial; font-size: 10pt;">E1001, E2119, </span>and<span style="font-family: Arial; font-size: 10pt;"> E3327</span>.  By looking at the range of concepts associated with this result set, one may glean useful insight for data-model harmonization.  For example, we find that <span style="font-family: Arial; font-size: 10pt;">E1001 </span>is associated with the concept <span style="font-family: Arial; font-size: 10pt;">InventoryID</span> in source B and <span style="font-family: Arial; font-size: 10pt;">EquipCode</span> in source C.   An analyst might suspect, therefore, that the two concepts are the same, and if confirmed, assert this equivalence at the data-model level. Thus insight obtained by the analysis of data instances may be applied more broadly as knowledge at the data-model level. This is but one example of how Layer 2 can inform Layer 3.</p>
<p><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata7.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata8.png" alt="" /></p>
<p style="text-align: justify;">
<p style="text-align: justify;">By chaining such queries we can explore semantic associations and traverse the unified data space floor.  For example, we may ask:</p>
<p style="text-align: justify;">
<ol>
<li>
<div>Query:  What terms are associated with the sign <span style="font-family: Arial; font-size: 10pt;">L1? </span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></div>
<p><span style="font-family: Times New Roman; font-size: 12pt;">Result: </span><span style="font-family: Arial; font-size: 10pt;">[E1001, EquipCode], [E3327, EquipCode]</span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></p>
<p><span style="font-family: Times New Roman; font-size: 12pt;">Analyst thinks:  &#8216;This stuff is located in the same lab.&#8217;<br />
</span></li>
<li>
<div><span style="font-family: Times New Roman; font-size: 12pt;">Query:  What other concepts are associated with signs </span><span style="font-family: Arial; font-size: 10pt;">E1001 </span><span style="font-family: Times New Roman; font-size: 12pt;">and</span><span style="font-family: Arial; font-size: 10pt;"> E3327? </span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></div>
<p><span style="font-family: Times New Roman; font-size: 12pt;">Result: </span><span style="font-family: Arial; font-size: 10pt;">InventoryID</span><span style="font-family: Times New Roman; font-size: 12pt;"> (from source B)<br />
</span></p>
<p><span style="font-family: Times New Roman; font-size: 12pt;">Analyst thinks: &#8216;I wonder if </span><span style="font-family: Arial; font-size: 10pt;">EquipCode</span><span style="font-family: Times New Roman; font-size: 12pt;"> is the same thing as </span><span style="font-family: Arial; font-size: 10pt;">InventoryID</span><span style="font-family: Times New Roman; font-size: 12pt;">.&#8217;<br />
</span></li>
<li><span style="font-family: Times New Roman; font-size: 12pt;">Query:  Which signs of </span><span style="font-family: Arial; font-size: 10pt;">EquipCode </span><span style="font-family: Times New Roman; font-size: 12pt;">match signs of</span><span style="font-family: Arial; font-size: 10pt;"> InventoryID?</span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></li>
</ol>
<p><span style="font-family: Times New Roman; font-size: 12pt;">Result: </span><span style="font-family: Arial; font-size: 10pt;">E1001, E2119, E3327</span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></p>
<p style="margin-left: 36pt;"><span style="font-family: Times New Roman; font-size: 12pt;">Analyst thinks:  &#8216;The concepts </span><span style="font-family: Arial; font-size: 10pt;"><a name="OLE_LINK1"></a>EquipCode </span><span style="font-family: Times New Roman; font-size: 12pt;">and</span><span style="font-family: Arial; font-size: 10pt;"> InventoryID </span><span style="font-family: Times New Roman; font-size: 12pt;">probably do mean the same thing.&#8217;<br />
</span></p>
<ol start="4">
<li><span style="font-family: Times New Roman; font-size: 12pt;">Query:  What other concepts are associated with</span><span style="font-family: Arial; font-size: 10pt;"> InventoryID?</span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></li>
</ol>
<p><span style="font-family: Times New Roman; font-size: 12pt;"> Result: </span><span style="font-family: Arial; font-size: 10pt;">Chemist<br />
</span></p>
<ol start="5">
<li><span style="font-family: Times New Roman; font-size: 12pt;">Query: Which </span><span style="font-family: Arial; font-size: 10pt;">Chemists</span><span style="font-family: Times New Roman; font-size: 12pt;"> are associated with </span><span style="font-family: Arial; font-size: 10pt;">[E1001,InventoryID] and [E3327,InventoryID]?</span><span style="font-family: Times New Roman; font-size: 12pt;"><br />
</span></li>
</ol>
<p><span style="font-family: Times New Roman; font-size: 12pt;"> Result: </span><span style="font-family: Arial; font-size: 10pt;">[Adam, Chemist], [Mary, Chemist]<br />
</span></p>
<p style="margin-left: 36pt;"><span style="font-family: Times New Roman; font-size: 12pt;">Analyst thinks:  &#8216;Adam and Mary have equipment in the same lab, so they probably know each other.&#8217;<br />
</span></p>
<p style="text-align: justify;">
<p style="text-align: justify;">These queries illustrate the ability to perform &#8220;semantic drilling&#8221; into the DDF data space.  We can ask a series of questions that &#8220;surf&#8221; across the entire DDF data space unimpeded by barriers between source systems.  One need not have specific semantic knowledge of the source systems in order to explore the data space this way and to extract useful insight.  In the next section we will illustrate how this insight may be subsequently inserted back into the data space, as additional information and knowledge, to produce further semantic enrichment and fusion.</p>
<p><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata9.png" alt="" /></p>
<h3>Enhancing the Data Space</h3>
<p style="text-align: justify;">Up to this point, we have discussed the data integration and analytic power of the unified data space floor that results simply from the mechanical loading of data into Layer 2.  The breadth of integration, depth of semantic enrichment, and analytic power can all be dramatically improved by building upon this floor, either by an analyst or an automated process.  This can be performed at the data instance level (Layer 2), the data-model level (Layer 3), or the combination of the two.  The first regards the assertion of new instances of DDF elements (i.e. signs, terms, concepts, predicates, and statements).  The second regards the enhancement and/or harmonization of source-specific data-models.  The third regards the association of concepts and predicates asserted in Layer 2 with existing knowledge models in Layer 3.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">For example, as is illustrated in Fig. 5, we may introduce the predicate <span style="font-family: Arial; font-size: 10pt;">isEquivalent</span> and use it to assert the statement that <span style="font-family: Arial; font-size: 10pt;">[Ben, ChemistName] isEquivalent [Benjamin, Chemist]</span>.  Such statements, created at the data instance level, represent <em>data</em> integration.  In addition, we may assert new associations at the data-model level to achieve global <em>data-model</em> integration (e.g. harmonization).  This is illustrated in Fig. 6, wherein concept <span style="font-family: Arial; font-size: 10pt;">ChemistName</span> is asserted to be the same as concept <span style="font-family: Arial; font-size: 10pt;">Chemist</span>.  The result of this assertion is that the <em>meaning</em> of all <span style="font-family: Arial; font-size: 10pt;">ChemistName</span> terms becomes <span style="font-family: Arial; font-size: 10pt;">sameAs</span> the <em>meaning</em> of all <span style="font-family: Arial; font-size: 10pt;">Chemist</span> terms.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata10.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata11.png" alt="" /></p>
<h3>Exploring the Enriched Data Space</h3>
<p style="text-align: justify;">As we explore the enriched data space, surfing semantics and drilling associations, we find that previously disjoint regions of the space become reachable via the newly asserted data and associations.  For example, having equated the concept <span style="font-family: Arial; font-size: 10pt;">ChemistName </span>with <span style="font-family: Arial; font-size: 10pt;">Chemist</span>, and <span style="font-family: Arial; font-size: 10pt;">InventoryID </span>with <span style="font-family: Arial; font-size: 10pt;">EquipCode, </span>an analyst can simply retrieve the projects that are located in a particular lab with basically one query.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">Query:  Which terms are associated with <span style="font-family: Arial; font-size: 10pt;">[L1, lab]</span>?</p>
<p style="text-align: justify;">Result:  <span style="font-family: Arial; font-size: 10pt;">[E1001, EquipCode], [Adam, Chemist], [P1, Project]<br />
</span></p>
<p style="text-align: justify;">
<p style="text-align: justify;">Fig. 6 shows how the asserted associations (dashed) at the data-model level enable additional associations (dotted) to be inferred.  This interplay of data and data-model integration is what ultimately allows us to &#8220;connect the dots.&#8221;</p>
<p style="text-align: justify;">
<h2>Application to SIMA</h2>
<p style="text-align: justify;">To enable the rapid, ad-hoc assimilation of diverse data into situational views useful for SIMA, we must overcome system, structural, and semantic barriers between data sourced from different systems.  As illustrated in Fig. 7, traditional data integration approaches attempt to achieve this by imposing a tight commitment to a particular data-model or integration schema (i.e. canonical data-model).  Unfortunately, choosing which of the source data elements to expose and mapping them to the canonical model inevitably leads to information loss and/or distortion, and the integration schema itself creates yet another semantic barrier.</p>
<p style="text-align: justify;">In contrast, the DDF breaks the barriers between data sources to accommodate all within a single coherent data space.  Simply loading data into the DDF in a largely automated fashion produces a fundamental level of data unity &#8211; the Unified Data Space Floor.  No data-model harmonization need be made and yet non-trivial data integration results.  Upon this floor, the DDF supports the construction of deeper integration and semantic enrichment at both the data instance and data-model levels without prescribing or constraining the processing by which such enrichment may be achieved.  Any fusion or data integration method can be applied alone or in combination.  Moreover, unlike other integration approaches, new data and associations, regardless of their origin, join seamlessly into the unified data space.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The DDF data space also supports the complete spectrum of applications and clients, from generic (i.e. those operating at the level of the DDF structure) to specific (i.e. those that have knowledge of a particular source data-model).  Generic clients seamlessly span across the entire data space regardless of data source or associated data-model to perform analysis.  Such clients require no modification as new data or semantics are introduced.  Specific clients are able to operate with the same semantic depth in the DDF data space as they would on the source system itself since the DDF data space preserves the data and semantics of the source systems.  In other words, the expressiveness and search capability native to those systems are retained [Yoakum 2008 JDIQ].  As the data space is increasingly enriched with semantics that bridge data-models, the depth of specific clients is retained while their breadth increasingly widens toward that of a generic client.</p>
<p style="text-align: justify;">
<p style="text-align: justify;"><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata12.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata13.png" alt="" /><img src="http://systover.net/blog/wp-content/uploads/2009/01/010209-0221-unifieddata14.png" alt="" /></p>
<h2>Conclusion</h2>
<p style="text-align: justify;">Successfully executing the constellation of activities that comprise SIMA, particularly in support of decision-making, requires exploiting information within a dynamic, heterogeneous, and distributed data environment that is largely beyond our control.  The challenge, therefore, is to dynamically integrate data, information, and knowledge into one coherent intelligence repository to serve as a foundation for SIMA processes and operations.  Current practice is insufficient in the face of scale and complexity.</p>
<p style="text-align: justify;">The approach presented in this paper overcomes the shortcomings of traditional data integration approaches using a framework, called the Data Description Framework, which enables the seamless integration of any structured data within and across data sources and models without the loss or distortion of data and semantics.  Moreover, the framework supports a practical, stable implementation using any standard database system.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The simple, mechanical loading of source data and semantics into the DDF creates a unified data space floor that exhibits a primary level of data integration unmatched by traditional integration approaches. No up-front, heavy investment in data-model harmonization is required – one simply pours data on the floor.  Deeper integration and semantic enrichment may then be pursued with any manual or automated processing operating either at the data instance or data-model levels.</p>
<p style="text-align: justify;">
<p style="text-align: justify;">The ultimate analytic power that is enabled by the DDF data space is essentially unlimited and exceeds that of any particular source system or traditional data integration solution at any level.  Having the power and flexibility required to organize the transient and complex SIMA data space, it provides the ideal foundation on which to pursue SIMA.</p>
<p style="text-align: justify;">
<h2>Acknowledgements</h2>
<p style="text-align: justify;">The authors would like to thank the following US Army CERDEC I2WD personnel for their continued support:  Mr. Anthony Lisuzzo, Director; Mr. Kesny Parent, DCGS-A Branch Chief; Ms. Virginia Goon, IXFL Manager; and Mr. Norbert Antunes, IXFL Computer Engineer.  This work was funded by US Army CERDEC I2WD under contract number W15P7T-06-D-A401/009.</p>
<p style="text-align: justify;">
<h2>References</h2>
<p style="text-align: justify; margin-left: 28pt;"><span style="font-family: Times New Roman;">[Batini 1986] Batini, C. <em>et al</em>. <em>A comparative analysis of methodologies for database schema integration</em>, ACM Computing Surveys, (18) 4, 1986.<br />
</span></p>
<p style="text-align: justify; margin-left: 28pt;">
<p style="text-align: justify;">[Bernstein 2007] Bernstein P., Ho, H<span style="font-family: Arial;">. </span><em>Model Management and Schema Mappings: Theory and Practice</em>, Proceedings of VLDB Conference, 2007.</p>
<p style="text-align: justify; margin-left: 36pt;">
<p style="margin-left: 28pt;"><span style="font-family: Times New Roman;">[Halevy 2005] Halevy, A. <em>et al</em>. <em>Enterprise information integration: successes, challenges and controversies</em>, Proceedings of 24th International Conference on Management of Data, Baltimore, 2005.<br />
</span></p>
<p style="margin-left: 28pt;">
<p style="text-align: justify; margin-left: 36pt;">
<p style="text-align: justify; margin-left: 36pt;">[Northrop 2006]  Northrop, L., <em>et al.</em>, <em>Ultra-Large-Scale Systems: The Software Challenge of the Future</em>, Pittsburgh: Carnegie Mellon University, 2006. <a href="http://www.sei.cmu.edu/publications/books/engineering/uls.html">http://www.sei.cmu.edu/publications/books/engineering/uls.html</a></p>
<p style="text-align: justify; margin-left: 36pt;">
<p style="text-align: justify; margin-left: 28pt;"><span style="font-family: Times New Roman;">[Parent 1998] Parent, C. and Spaccapietra, S. <em>Issues and approaches of database integration</em>, Communications of the ACM, 41(5), 1998.<br />
</span></p>
<p style="text-align: justify; margin-left: 36pt;">
<p style="text-align: justify; margin-left: 37pt;">[Yoakum 2008 DAMA] Yoakum-Stover, S. and Malyuta, T. <em>Unified Integration Architecture for Intelligence Data,</em> DAMA International Europe Conference 2008, November 2008, London, UK.</p>
<p style="margin-left: 36pt;">[Yoakum 2008 JDIQ] Yoakum-Stover, S. and Malyuta, T. <em>Unified Architecture for Integrating Intelligence Data,</em> ACM Journal of Data and Information Quality. September 2008. Pending decision.</p>
<img src="http://feeds.feedburner.com/~r/imintel/~4/o5JUfTNTbuQ" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://imintel.org/blog/2009/10/01/unified-data-integration-for-situation-management/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://imintel.org/blog/2009/10/01/unified-data-integration-for-situation-management/</feedburner:origLink></item>
	</channel>
</rss>
