<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:blogger='http://schemas.google.com/blogger/2008' xmlns:georss='http://www.georss.org/georss' xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-6862508</id><updated>2026-04-15T00:31:29.078+02:00</updated><category term="emacs"/><category term="c++"/><category term="book"/><category term="fp"/><category term="boost"/><category term="clojure"/><category term="databricks"/><category term="article"/><category term="file formats"/><category term="haskell"/><category term="vcs"/><category term="cedet"/><category term="msoffice"/><category term="programming"/><category term="content filtering"/><category term="mac"/><category term="spark"/><category term="work"/><category term="delta live tables"/><category term="dlt"/><category term="f#"/><category term="it"/><category term="life"/><category term="machine-learning"/><category term="scheme"/><category term="terraform"/><category term="asio"/><category term="cassandra"/><category term="datastax"/><category term="git"/><category term="pyspark"/><category term="zeppelin"/><category term="2010"/><category term="common-lisp"/><category term="cybersecurity"/><category term="devops"/><category term="eventhubs"/><category term="google"/><category term="hadoop"/><category term="job"/><category term="linux"/><category term="mooc"/><category term="oss"/><category term="testing"/><category term="astra"/><category term="delta lake"/><category term="dse"/><category term="edx"/><category term="erlang"/><category term="incanter"/><category term="kafka"/><category term="microsoft"/><category term="photography"/><category term="software development"/><category term="solaris"/><category term="2011"/><category term="2015"/><category 
term="DSL"/><category term="R"/><category term="acer"/><category term="algorithms"/><category term="cicd"/><category term="cmake"/><category term="confluent"/><category term="coursera"/><category term="cs"/><category term="cuda"/><category term="data mining"/><category term="dsefs"/><category term="eclipse"/><category term="education"/><category term="emulation"/><category term="fasttext"/><category term="germany"/><category term="gpu"/><category term="handheld"/><category term="hardware"/><category term="home"/><category term="humor"/><category term="information retrieval"/><category term="instant messaging"/><category term="internet"/><category term="java"/><category term="jenkins"/><category term="language-detection"/><category term="latex"/><category term="lisp"/><category term="lucene"/><category term="mahout"/><category term="mapreduce"/><category term="maven"/><category term="muse"/><category term="nlp"/><category term="ocaml"/><category term="opencl"/><category term="opensolaris"/><category term="opensource"/><category term="palm"/><category term="personal"/><category term="presentations"/><category term="quality"/><category term="ruby"/><category term="russia"/><category term="scala"/><category term="security"/><category term="sicp"/><category term="spirit"/><category term="squid"/><category term="tika"/><category term="tips"/><category term="travel"/><category term="ubuntu"/><category term="unix"/><category term="vacation"/><category term="version control"/><category term="video"/><category term="windows"/><title type='text'>Alex Ott&#39;s blog</title><subtitle type='html'>Blog dedicated to Software Development, Unixes, Content Filtering, Emacs, Lisp, and other things.</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default?redirect=false'/><link rel='alternate' 
type='text/html' href='http://alexott.blogspot.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><link rel='next' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default?start-index=26&amp;max-results=25&amp;redirect=false'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>387</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-6862508.post-3286252514506508114</id><published>2026-04-10T12:56:00.000+02:00</published><updated>2026-04-10T12:56:16.767+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="terraform"/><title type='text'>Managing Databricks settings and previews using Terraform</title><content type='html'>&lt;p&gt;
Many Databricks customers use Terraform to create workspaces and deploy resources within them, or to create account-level resources.  But very often, there is a requirement not only to deploy resources but also to ensure that workspaces are correctly configured. E.g., many security-conscious customers disable the export of results from notebooks and SQL queries, and completely disable the embedding of dashboards into 3rd-party systems, etc.  And all these settings must be set without human involvement, especially for production environments.
&lt;/p&gt;

&lt;p&gt;
For a long time, people were using &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/workspace_conf&quot;&gt;databricks_workspace_conf&lt;/a&gt; and &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/sql_global_config&quot;&gt;databricks_sql_global_config&lt;/a&gt; resources to control some of the settings, but they had a few major problems:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Not all workspace settings were exposed via these resources;&lt;/li&gt;
&lt;li&gt;Many settings available via  &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/workspace_conf&quot;&gt;databricks_workspace_conf&lt;/a&gt; weren&#39;t publicly documented. Over time, many customers discovered setting names on their own, but it was not officially supported, as settings could be removed without notice.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
At some point, specific development teams began adding dedicated workspace- and account-level APIs to control specific settings, and corresponding Terraform resources were added. E.g., &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/disable_legacy_dbfs_setting&quot;&gt;databricks_disable_legacy_dbfs_setting&lt;/a&gt; (workspace-level), or &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/disable_legacy_features_setting&quot;&gt;databricks_disable_legacy_features_setting&lt;/a&gt; (account-level).  But this approach was unsustainable, as it led to resource sprawl and maintenance overhead.  And there was still no way to control previews or users&#39; preferences.
&lt;/p&gt;

&lt;p&gt;
The situation has changed with the introduction of the generic settings API for both &lt;a href=&quot;https://docs.databricks.com/api/workspace/settingsv2/getpublicworkspacesetting&quot;&gt;workspace&lt;/a&gt; and &lt;a href=&quot;https://docs.databricks.com/api/account/settingsv2/getpublicaccountsetting&quot;&gt;account&lt;/a&gt; levels, allowing the development teams to easily add new settings when necessary, and they are automatically exposed to users.  To work with those APIs, corresponding Terraform resources were added: &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/workspace_setting_v2&quot;&gt;databricks_workspace_setting_v2&lt;/a&gt; and  &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/account_setting_v2&quot;&gt;databricks_account_setting_v2&lt;/a&gt;.  And what is important, these resources could be used to configure Databricks previews on both workspace and account levels!
&lt;/p&gt;

&lt;p&gt;
The usage of those resources is quite simple:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Find the setting name using &lt;a href=&quot;https://docs.databricks.com/api/workspace/settingsv2/listworkspacesettingsmetadata&quot;&gt;workspace&lt;/a&gt; or &lt;a href=&quot;https://docs.databricks.com/api/account/settingsv2/listaccountsettingsmetadata&quot;&gt;account&lt;/a&gt;-level APIs.&lt;/li&gt;
&lt;li&gt;Create an instance of workspace or account-level resource using that setting name as an argument, and specify the required value argument.  The actual value argument depends on the specific setting - it could be a primitive value: &lt;code&gt;boolean_val&lt;/code&gt;, &lt;code&gt;integer_val&lt;/code&gt;, &lt;code&gt;string_val&lt;/code&gt;, or it could be a complex value, e.g. &lt;code&gt;automatic_cluster_update_workspace&lt;/code&gt;, &lt;code&gt;aibi_dashboard_embedding_approved_domains&lt;/code&gt;, etc. (check resource documentation for more details).&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
E.g., I want to enable the preview for &quot;Lakeflow Connect for Jira&quot;, which is a workspace-level preview:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Via the list settings API, I find that the setting has the &lt;code&gt;jira_connector&lt;/code&gt; name and is defined as follows:&lt;/li&gt;
&lt;/ol&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-json&quot;&gt;&lt;code&gt;{&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: #a020f0;&quot;&gt;&quot;description&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;Ingest Jira data with a simple and efficient connector. 
Available via API for both Jira Cloud and on premise instances.&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: #a020f0;&quot;&gt;&quot;name&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;jira_connector&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: #a020f0;&quot;&gt;&quot;type&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;{\&quot;boolean_val\&quot;: {\&quot;value\&quot;: true}}&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Add a corresponding resource to my Terraform code:&lt;/li&gt;
&lt;/ol&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-hcl&quot;&gt;&lt;code&gt;resource &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;databricks_workspace_setting_v2&quot;&lt;/span&gt; &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;jira&quot;&lt;/span&gt; {&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;name&lt;/span&gt; = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;jira_connector&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;boolean_val&lt;/span&gt; = {&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;value&lt;/span&gt; =&lt;span style=&quot;color: darkcyan;&quot;&gt; true&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;  }&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Do standard &lt;code&gt;terraform plan&lt;/code&gt;, &lt;code&gt;terraform apply&lt;/code&gt; to apply setting change.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
And I can see in the UI that it&#39;s flipped. 
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjL63DAjbWRCnjFpRYa5rIiG8JHmOx7elAXlIprSiZfe8NUhU3DFIvLcAVoLt77XzxXQE2EnBClr5boSNJXPQ1nAVE7nlMKJzETwxOKRTK1STklF_S1D_np_SjMPNUbjLsSk9XmzgIqUMcGSsMDjLdXSttWYCljT2k9xxKvcbZOLidStYVcED36Kg/s725/Screenshot%202026-04-10%20at%2012.35.55.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;71&quot; data-original-width=&quot;725&quot; height=&quot;63&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjL63DAjbWRCnjFpRYa5rIiG8JHmOx7elAXlIprSiZfe8NUhU3DFIvLcAVoLt77XzxXQE2EnBClr5boSNJXPQ1nAVE7nlMKJzETwxOKRTK1STklF_S1D_np_SjMPNUbjLsSk9XmzgIqUMcGSsMDjLdXSttWYCljT2k9xxKvcbZOLidStYVcED36Kg/w640-h63/Screenshot%202026-04-10%20at%2012.35.55.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
Similarly, it could be done on the account level.
&lt;/p&gt;

&lt;p&gt;
As of right now, we need to keep in mind a few things when using these resources:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Not all settings are available yet in the new API, as migration is still in progress.&lt;/li&gt;
&lt;li&gt;Deletion of a setting is a &lt;b&gt;no-op&lt;/b&gt; - it won&#39;t revert the setting to the original value.  So if you want to disable preview or revert another setting to the original value, you need to do it explicitly.&lt;/li&gt;
&lt;/ul&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/3286252514506508114/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/3286252514506508114' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/3286252514506508114'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/3286252514506508114'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2026/04/managing-databricks-settings-and.html' title='Managing Databricks settings and previews using Terraform'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjL63DAjbWRCnjFpRYa5rIiG8JHmOx7elAXlIprSiZfe8NUhU3DFIvLcAVoLt77XzxXQE2EnBClr5boSNJXPQ1nAVE7nlMKJzETwxOKRTK1STklF_S1D_np_SjMPNUbjLsSk9XmzgIqUMcGSsMDjLdXSttWYCljT2k9xxKvcbZOLidStYVcED36Kg/s72-w640-h63-c/Screenshot%202026-04-10%20at%2012.35.55.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-5921215364937510703</id><published>2025-12-31T18:52:00.000+01:00</published><updated>2025-12-31T18:52:29.948+01:00</updated><title type='text'>Traditional New Year post, 2025th edition</title><content type='html'>&lt;p&gt;
It&#39;s the last day of the year, and it&#39;s time for a traditional blog post.
&lt;/p&gt;

&lt;p&gt;
As usual, it was quite busy at work this year - many different customers, different tasks on different topics, and a lot of different internal activities. Although I got the opportunity to concentrate more on team upskilling (internal presentations, trainings, etc.), development and maintenance of reusable assets, development of different tooling for migrations, organizing/overseeing work of my colleagues, working more closely with different product teams, etc.  And this year, after five years at Databricks, I was promoted to Principal SSA, and I&#39;m very thankful to my managers for their support throughout that journey.
&lt;/p&gt;

&lt;p&gt;
Early this year, we released the &lt;a href=&quot;https://databrickslabs.github.io/dqx/&quot;&gt;DQX project&lt;/a&gt; into Databricks Labs (thank you, Marcin &amp;amp; the team) - the data quality library originally written almost five years ago.  We were really surprised by how fast customers started to adopt it in their data processing pipelines. This growth allowed us to invest even more time in developing the new functionality.  We are also working directly with the Data Quality monitoring product team to ensure that the official product incorporates all learnings from the field.  I even had the opportunity to be &lt;a href=&quot;https://www.databricks.com/dataaisummit/session/elevating-data-quality-standards-databricks-dqx&quot;&gt;on stage at the Data and AI summit with Marcin and Neha (a big thank you!)&lt;/a&gt; talking about DQX.
&lt;/p&gt;

&lt;p&gt;
For the first time, I visited the Data and AI summit in San Francisco.  It was a very interesting experience, talking with so many people in different formats (braindates, customer product meetings, booth, …).  Although I feel that I needed at least one more week to catch up with my colleagues :-)
&lt;/p&gt;

&lt;p&gt;
This year, the work on Terraform continued in different forms.  ~250 pull requests were merged into Databricks Terraform provider - new functionality, bug fixes, etc. A lot of work was done on the &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/experimental-exporter&quot;&gt;Terraform exporter&lt;/a&gt;, which is heavily used by Databricks customers for migrations, disaster recovery, or to start their own Terraform journey (I need to write a separate blog post about the exporter and the challenges of reconstructing deployed resources).  Besides internal trainings, Vuong and I recorded a &lt;a href=&quot;https://customer-academy.databricks.com/learn/course/4264/managing-databricks-at-scale-using-terraform&quot;&gt;webinar about using Terraform&lt;/a&gt; for deploying Databricks resources at scale (it&#39;s already available in the customer academy - you can watch it even as a user of Free Edition).  I even got recognized by Hashicorp as a Hashicorp Core Contributor 2025 - primarily for our work that I described in a &lt;a href=&quot;https://alexott.blogspot.com/2024/12/working-with-huge-terraform-states.html&quot;&gt;separate blog post&lt;/a&gt;.
&lt;/p&gt;

&lt;p&gt;
Adoption of LLMs for work significantly grew this year - it went from occasional use of Copilot for programming to use of a mix of Claude &amp;amp; Cursor for programming,  Glean, Perplexity, and custom agents for working with documents, more efficient search for information, understanding new stuff, etc.  On the programming side, I often feel like in this &lt;a href=&quot;https://www.facebook.com/groups/it.humor.and.memes/posts/33558693033729726/&quot;&gt;meme&lt;/a&gt; (even as I learn new stuff, I&#39;m still far from being a very advanced user):
&lt;/p&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgPkffg_RbbSFSlHxlfM7dCCvp5WTdAzpTJQ4brK55Vpkcz_SD27nBbZZ15yrJ9bWuiivXGH3LckF_ObHhcRua49yUzXzCYZ1T6MgtlUWOWVbA4XNGd1UE-WNhyphenhyphenzhyphenhyphenRiPYlI0_2TsJtl7yJdlO3nLjWZ92r3uyY3uCq96ZjjU6a9roXemc21bxFNA/s499/1.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;327&quot; data-original-width=&quot;499&quot; height=&quot;263&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgPkffg_RbbSFSlHxlfM7dCCvp5WTdAzpTJQ4brK55Vpkcz_SD27nBbZZ15yrJ9bWuiivXGH3LckF_ObHhcRua49yUzXzCYZ1T6MgtlUWOWVbA4XNGd1UE-WNhyphenhyphenzhyphenhyphenRiPYlI0_2TsJtl7yJdlO3nLjWZ92r3uyY3uCq96ZjjU6a9roXemc21bxFNA/w400-h263/1.png&quot; width=&quot;400&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
Agents allow me to concentrate on writing a specification, helping with researching a specific topic, doing the code reviews, and offloading the boring stuff, like writing tests to an agent (or a swarm of different agents).  With new LLM tools, I&#39;ve significantly reduced the number of open TODO items, and some of them were quite complex, so I was always waiting to find more time to work on them.  The new tools helped me build a lot of new functionality in the Terraform exporter. E.g., implementing support for the plugin framework allowed me to increase exported resource coverage to almost all resources available in the Terraform provider, or LLMs helped me implement functionality for cross-cloud resource migration, rewriting cloud attributes and instance types.  Besides Terraform work, it helped me a lot in designing and writing code in other areas - cybersecurity-related, migration tooling, etc.  In the new year, I plan to continue investing in learning new patterns to enhance my work efficiency.
&lt;/p&gt;

&lt;p&gt;
Cybersecurity is my favorite topic, especially when it comes to big data.  This year, we continued to help customers adopt Databricks for their cybersecurity needs.  And we see more and more customers doing that at scale - you can watch a number of presentations at the Data and AI summit on the topic of cybersecurity.  And new product features, like &lt;a href=&quot;https://alexott.blogspot.com/2025/03/effective-use-of-latest-dlt-features.html&quot;&gt;new stuff in declarative pipelines&lt;/a&gt; help to implement use cases faster and more efficiently.  Another significant topic we observe is the adoption of LLMs and Agents for cybersecurity use cases on Databricks.  And some of the results are very impressive - AgentBricks in combination with Genie allows not only to understand what happens from the data, but also to generate mitigation procedures based on existing runbooks, or even call mitigation tools automatically.&amp;nbsp;&lt;/p&gt;&lt;p&gt;&amp;nbsp;&lt;/p&gt;


&lt;p&gt;
I wish everyone a healthy and prosperous New Year!
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/5921215364937510703/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/5921215364937510703' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5921215364937510703'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5921215364937510703'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2025/12/traditional-new-year-post-2025th-edition.html' title='Traditional New Year post, 2025th edition'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgPkffg_RbbSFSlHxlfM7dCCvp5WTdAzpTJQ4brK55Vpkcz_SD27nBbZZ15yrJ9bWuiivXGH3LckF_ObHhcRua49yUzXzCYZ1T6MgtlUWOWVbA4XNGd1UE-WNhyphenhyphenzhyphenhyphenRiPYlI0_2TsJtl7yJdlO3nLjWZ92r3uyY3uCq96ZjjU6a9roXemc21bxFNA/s72-w400-h263-c/1.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-7155070382133752516</id><published>2025-05-29T12:38:00.001+02:00</published><updated>2025-07-03T09:07:11.749+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="delta live tables"/><category scheme="http://www.blogger.com/atom/ns#" term="dlt"/><category scheme="http://www.blogger.com/atom/ns#" 
term="eventhubs"/><title type='text'>Delta Live Tables recipes: Consuming from Azure Event Hubs using Unity Catalog Service Credentials</title><content type='html'>&lt;p&gt;
I &lt;a href=&quot;https://alexott.blogspot.com/search/label/eventhubs&quot;&gt;wrote previously&lt;/a&gt; on different methods of connection from Delta Live Tables to Azure Event Hubs, but both of them suffer from a common problem - they need either a service principal secret or a Shared Access Signature (SAS), which are long-living credentials that could be potentially leaked and used outside of the pipeline.
&lt;/p&gt;

&lt;p&gt;
Several months ago, Databricks introduced &lt;a href=&quot;https://learn.microsoft.com/en-us/azure/databricks/connect/unity-catalog/cloud-services/service-credentials&quot;&gt;Unity Catalog Service Credentials&lt;/a&gt; that are based on a special type of managed identity called Azure Databricks access connector.  Service credentials allow the generation of short-lived authentication tokens and connection to different Azure services without requiring passwords or other long-lived credentials.  And they are managed by Unity Catalog, so you can limit who can use them, or allow their usage only from specific workspaces.  All of this heavily improves the security posture.
&lt;/p&gt;

&lt;p&gt;
Although we could already connect to different services with generated authentication tokens, we still couldn&#39;t do this in the Kafka connector out of the box, as the authentication flow is handled by Kafka itself.  But recently, this problem was fixed, and now we can authenticate to Azure Event Hubs using &lt;a href=&quot;https://learn.microsoft.com/en-us/azure/databricks/connect/streaming/kafka#service-cred&quot;&gt;UC Service Credentials&lt;/a&gt;.  Support for it is rolled out in Databricks Runtime 16.1+ (serverless support is coming soon), and available in Delta Live Tables preview channel that is based on DBR 16.1 (both serverless and classic compute).
&lt;/p&gt;

&lt;p&gt;
And it&#39;s the easiest way to authenticate to Event Hubs:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Create &lt;a href=&quot;https://learn.microsoft.com/en-us/azure/databricks/connect/unity-catalog/cloud-services/service-credentials&quot;&gt;UC Service Credential&lt;/a&gt; if you don&#39;t have one.&lt;/li&gt;
&lt;li&gt;Assign &lt;a href=&quot;https://learn.microsoft.com/en-us/azure/event-hubs/authorize-access-azure-active-directory#azure-built-in-roles-for-azure-event-hubs&quot;&gt;necessary roles&lt;/a&gt; to it on Event Hubs (i.e., &lt;code&gt;Azure Event Hubs Data receiver&lt;/code&gt;, &lt;code&gt;Azure Event Hubs Data sender&lt;/code&gt;, etc.)&lt;/li&gt;
&lt;li&gt;Specify the service credential name in the&amp;nbsp;&lt;code&gt;databricks.serviceCredential&lt;/code&gt; option when reading or writing data.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
&lt;b&gt;That&#39;s all!
&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;
We can check that it works in the notebook attached to a cluster running DBR 16.4:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;credential_name&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;service-credential&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;eh_server&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;&amp;lt;host&amp;gt;.servicebus.windows.net:9093&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;eh_opts&lt;/span&gt; = {&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;databricks.serviceCredential&quot;&lt;/span&gt;: credential_name,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;: eh_server,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;subscribe&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;iocs&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;startingOffsets&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;earliest&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = spark.readStream.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;).options(**eh_opts).load()&lt;/code&gt;
&lt;code&gt;display(df.selectExpr(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;CAST(value AS STRING) as value&quot;&lt;/span&gt;))&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And we can see the data read from the topic:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgHCfV9CxB_Qsks3fT46UVyiHvz-uCZJ9DSgYTAmimCFg2RxbUyjktXvlj44UkB_nmquydnBzdTxzCtiSeEydlozQiD0Rb1NnQDvmFT66yP9SU9w4ARUJwWaFk9vgjjkswyIjDm3ZvjpkK9wGojkbagVe8ZMXgqqUvDEU8hAu9SAaPqZgZPoZXp9g/s1131/EH-UC-Credentials.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;775&quot; data-original-width=&quot;1131&quot; height=&quot;438&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgHCfV9CxB_Qsks3fT46UVyiHvz-uCZJ9DSgYTAmimCFg2RxbUyjktXvlj44UkB_nmquydnBzdTxzCtiSeEydlozQiD0Rb1NnQDvmFT66yP9SU9w4ARUJwWaFk9vgjjkswyIjDm3ZvjpkK9wGojkbagVe8ZMXgqqUvDEU8hAu9SAaPqZgZPoZXp9g/w640-h438/EH-UC-Credentials.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
DLT supports service credentials as well, both for reading data with &lt;code&gt;spark.readStream&lt;/code&gt;, and writing via &lt;a href=&quot;https://learn.microsoft.com/en-us/azure/databricks/dlt/dlt-sinks&quot;&gt;DLT Sinks&lt;/a&gt;:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;credential_name&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;service-credential&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;eh_server&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;&amp;lt;host&amp;gt;.servicebus.windows.net:9093&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Read data from Event Hubs&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: forestgreen;&quot;&gt;@dlt.table&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;raw_iocs&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;eh_opts&lt;/span&gt; = {&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;databricks.serviceCredential&quot;&lt;/span&gt;: credential_name,&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;: eh_server,&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;subscribe&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;iocs&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;startingOffsets&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;earliest&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;    }&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = spark.readStream.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;).options(**eh_opts).load()&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Create a write sink&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;dlt.create_sink(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;eventhubs&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;, {&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;databricks.serviceCredential&quot;&lt;/span&gt;: credential_name,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;: eh_server,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;topic&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;altest&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;  }&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Actual data writer&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: forestgreen;&quot;&gt;@dlt.append_flow&lt;/span&gt;(&lt;/code&gt;
&lt;code&gt;    name=&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;write_back&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    target=&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;eventhubs&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;write_back&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = dlt.read_stream(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;raw_iocs&quot;&lt;/span&gt;).select(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;value&quot;&lt;/span&gt;)&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And if we run that pipeline, we&#39;ll see both read and written data (visible in the Azure portal):
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjrROOA3Ccm1himzowIUgKcYFCJs0rFBlOrEQHVcfAmNpgZ7EyNqPixmoivMnIsLL1WHUs5tLMe4OWKTRuGfjWd9fXMrQCldyRSVadvltKS8plyz5mAReL5w4geNypDUk-tQtMyPYiHYt_ktz14g8Ud0T0pEPK1xDST7w3pmuIsBuFI7F-OE2qBCQ/s1536/EH-UC-Credentials-DLT.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;1037&quot; data-original-width=&quot;1536&quot; height=&quot;432&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjrROOA3Ccm1himzowIUgKcYFCJs0rFBlOrEQHVcfAmNpgZ7EyNqPixmoivMnIsLL1WHUs5tLMe4OWKTRuGfjWd9fXMrQCldyRSVadvltKS8plyz5mAReL5w4geNypDUk-tQtMyPYiHYt_ktz14g8Ud0T0pEPK1xDST7w3pmuIsBuFI7F-OE2qBCQ/w640-h432/EH-UC-Credentials-DLT.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
So, if you need to connect to Azure Event Hubs from Databricks, I recommend starting to use service credentials instead of service principal or SAS authentication.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/7155070382133752516/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/7155070382133752516' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7155070382133752516'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7155070382133752516'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2025/05/delta-live-tables-recipes-consuming.html' title='Delta Live Tables recipes: Consuming from Azure Event Hubs using Unity Catalog Service Credentials'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgHCfV9CxB_Qsks3fT46UVyiHvz-uCZJ9DSgYTAmimCFg2RxbUyjktXvlj44UkB_nmquydnBzdTxzCtiSeEydlozQiD0Rb1NnQDvmFT66yP9SU9w4ARUJwWaFk9vgjjkswyIjDm3ZvjpkK9wGojkbagVe8ZMXgqqUvDEU8hAu9SAaPqZgZPoZXp9g/s72-w640-h438-c/EH-UC-Credentials.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-2150254748307239520</id><published>2025-03-03T10:11:00.003+01:00</published><updated>2025-07-03T09:07:18.340+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="cybersecurity"/><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="delta live 
tables"/><category scheme="http://www.blogger.com/atom/ns#" term="dlt"/><title type='text'>Efficient use of the latest DLT features for cybersecurity use cases</title><content type='html'>&lt;p&gt;
In cybersecurity, everything starts with the collection and processing of data from multiple data sources.  These data should be parsed, and then converted into a normalized form, matching some common information model, such as, &lt;a href=&quot;https://ocsf.io/&quot;&gt;OCSF (Open Cybersecurity Schema Framework)&lt;/a&gt;.  Typically, the data is organized into several categories, such as Network activity, Identity and Access Management, etc.  After that, this data could be used for threat hunting or performing automated detection and response - the common schema helps a lot because we can apply the same detections and queries to data from multiple data sources.  We can depict that activity as follows:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi1dcpFmeejhbIHZp3XLKVVe3mKSsTzK3iA-gol8o35z-GrK0N-AQDFUGI9KfIqARkXSAmYMcbbaOrFRVVuKvgl7bpJO5vB3DBo02FnD3Dg9GPn2Vr9IZx4DG_6kh-SBE3f8fPOHnjQ_W0XOIBU079zmgC15NMn7iPF-liEymSEWGghyU6GkmVBdQ/s1237/cyber-pipeline-general.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;655&quot; data-original-width=&quot;1237&quot; height=&quot;338&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi1dcpFmeejhbIHZp3XLKVVe3mKSsTzK3iA-gol8o35z-GrK0N-AQDFUGI9KfIqARkXSAmYMcbbaOrFRVVuKvgl7bpJO5vB3DBo02FnD3Dg9GPn2Vr9IZx4DG_6kh-SBE3f8fPOHnjQ_W0XOIBU079zmgC15NMn7iPF-liEymSEWGghyU6GkmVBdQ/w640-h338/cyber-pipeline-general.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
It all looks nice in the picture, but implementing efficient and scalable data ingestion and normalization at scale is quite a challenge because we need to handle dozens of different data sources that often use different data formats, and there are spikes in the log volumes, e.g., when people come to the office. Plus, we need to be cost-efficient and have a good balance between the amount of provisioned resources and data processing latency.  Very often, when we use Apache Spark on Databricks to process security logs, people try to combine multiple streaming pipelines inside a single job to get more efficient cluster resource usage, but this leads to more complexity due to the need to handle dependencies between multiple streams, handle errors, and restart individual streams inside the single job.
&lt;/p&gt;

&lt;p&gt;
&lt;a href=&quot;https://www.databricks.com/product/data-engineering/delta-live-tables&quot;&gt;Delta Live Tables (DLT)&lt;/a&gt; is a great tool for data ingestion, transformation, and normalization.  The declarative nature of DLT pipelines makes it easier to write data processing pipelines - very often we can come up with some generic implementations driven by a config. &lt;a href=&quot;https://www.databricks.com/blog/2022/12/08/build-reliable-and-cost-effective-streaming-data-pipelines.html&quot;&gt;Enhanced Autoscaling&lt;/a&gt; allows handling data spikes by automatically scaling clusters up and down, providing the right balance between cost and data processing latency. DLT is also well integrated with &lt;a href=&quot;https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/&quot;&gt;Databricks Auto Loader&lt;/a&gt; to efficiently ingest data in different formats from cloud storage. Other features, such as expectations, automatic maintenance, and simplified observability, allow us to simplify maintenance and make sure that we have correct data in our tables. 
&lt;/p&gt;

&lt;p&gt;
For some time, DLT had some limitations that required careful planning of an implementation.  For example:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Tables created by the DLT pipeline are fully owned by a specific pipeline, and it was not possible to write to the same table from other pipelines. This made work with normalized data more complex, as only a single pipeline could be used for writing to it.&lt;/li&gt;
&lt;li&gt;We were able to write data only to Delta tables, so if we wanted to push detection data to external destinations (Kafka, Splunk, Microsoft Sentinel, etc.), we had to have a separate job that just read new data and wrote it into the corresponding destination, or tinker with &lt;code&gt;mapInPandas&lt;/code&gt; and similar things.&lt;/li&gt;
&lt;li&gt;All tables created by the DLT pipeline were stored under the same schema. This made it more complex to maintain permissions, as we typically give wide access only to normalized data (gold), leaving access to bronze (raw data) and silver (decoded, but not normalized data) layers only to a smaller audience (i.e., data engineers).&lt;/li&gt;
&lt;li&gt;When multiple source directories were present for the same dataset, we needed to combine them into a single stream using the &lt;code&gt;union&lt;/code&gt; function, but Spark Structured Streaming has some specific rules about adding and removing new stream sources, and it was not easy to handle that correctly.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
However, during the last year, the DLT product team implemented a lot of new functionality and reduced the number of limitations, making it much easier to develop complex data processing pipelines. I prepared a demo of all of this new stuff. You can find full source code and setup instructions in the &lt;a href=&quot;https://github.com/alexott/databricks-cybersecurity-playground/tree/main/dlt_modern_stuff&quot;&gt;repository&lt;/a&gt;.
&lt;/p&gt;

&lt;h3 id=&quot;orgd4d4a22&quot;&gt;Writing to the same table from multiple streams&lt;/h3&gt;

&lt;p&gt;
Let&#39;s start with the last item - this problem was fixed more than half a year ago with the introduction of &lt;a href=&quot;https://docs.databricks.com/aws/en/delta-live-tables/flows&quot;&gt;append flows&lt;/a&gt;.  With append flows you can easily add or remove data sources that are used to populate a defined streaming table without the need to do a full refresh - this is especially useful when source data has a short retention period.  It&#39;s very easy to use - you just define a destination table, and then define one or more functions that will be used as append flows. It also combines well with the meta-programming approach, allowing you to define a function that will return an append flow parameterized by something, e.g., the source path for the data (the only requirement is that each flow should have a unique name).  For example, here is how we can &lt;a href=&quot;https://github.com/alexott/databricks-cybersecurity-playground/blob/main/dlt_modern_stuff/src/ingest_apache_web.py&quot;&gt;ingest and parse log data&lt;/a&gt; for Apache and Nginx web servers that use the same log file format:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Define the streaming table to which we&#39;ll write&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;dlt.create_streaming_table(&lt;/code&gt;
&lt;code&gt;    name=apache_web_table_name,&lt;/code&gt;
&lt;code&gt;    comment=&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;Table for data parsed from Apache HTTP server-compatible logs&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;read_apache_web&lt;/span&gt;(&lt;span style=&quot;color: darkslateblue;&quot;&gt;input&lt;/span&gt;: &lt;span style=&quot;color: darkslateblue;&quot;&gt;str&lt;/span&gt;, add_opts: Optional[&lt;span style=&quot;color: darkslateblue;&quot;&gt;dict&lt;/span&gt;] = &lt;span style=&quot;color: darkcyan;&quot;&gt;None&lt;/span&gt;) -&amp;gt; DataFrame:&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;read data, parse, and convert into the DataFrame&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;create_apache_web_flow&lt;/span&gt;(&lt;span style=&quot;color: darkslateblue;&quot;&gt;input&lt;/span&gt;: &lt;span style=&quot;color: darkslateblue;&quot;&gt;str&lt;/span&gt;, add_opts: Optional[&lt;span style=&quot;color: darkslateblue;&quot;&gt;dict&lt;/span&gt;] = &lt;span style=&quot;color: darkcyan;&quot;&gt;None&lt;/span&gt;):&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #3a5fcd;&quot;&gt;@dlt.append_flow&lt;/span&gt;(&lt;/code&gt;
&lt;code&gt;        name=f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;apache_web_&lt;/span&gt;{sanitize_string_for_flow_name(&lt;span style=&quot;color: darkslateblue;&quot;&gt;input&lt;/span&gt;)}&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;        target=apache_web_table_name,&lt;/code&gt;
&lt;code&gt;        comment=f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;Ingesting from &lt;/span&gt;{&lt;span style=&quot;color: darkslateblue;&quot;&gt;input&lt;/span&gt;}&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    )&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;flow&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; read_apache_web(&lt;span style=&quot;color: darkslateblue;&quot;&gt;input&lt;/span&gt;, add_opts)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Handling of Apache Web logs&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;create_apache_web_flow(apache_web_input)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Handling of NGINX logs (compatible with Apache Web)&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;create_apache_web_flow(nginx_input)&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
If we run this code, then we&#39;ll see a single table inside the graph, but if we select it, and navigate to the Flows tab, then we&#39;ll see that it&#39;s populated from two data sources:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh85OmHGDHTy5Ae4Njo2k-eYg6VPEVmJZDSSzhneloYHg6_Ge7n7nPpi2qE7gygM5_DVlJrLdkfhV9C8Sf_OVITouVt9C4XfQr_g9HlcuPfv2XmMqll0liTRRJydrgZ9-h5XHpjNeykHmyIQkERapPcGti8S7xS1IqrcK6GT_4-O42B2CkRyy0XIg/s861/cyber-pipeline-append-flows.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;861&quot; data-original-width=&quot;565&quot; height=&quot;640&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh85OmHGDHTy5Ae4Njo2k-eYg6VPEVmJZDSSzhneloYHg6_Ge7n7nPpi2qE7gygM5_DVlJrLdkfhV9C8Sf_OVITouVt9C4XfQr_g9HlcuPfv2XmMqll0liTRRJydrgZ9-h5XHpjNeykHmyIQkERapPcGti8S7xS1IqrcK6GT_4-O42B2CkRyy0XIg/w420-h640/cyber-pipeline-append-flows.png&quot; width=&quot;420&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;h3&gt;Publishing to tables in different UC catalogs and schemas&lt;/h3&gt;

&lt;p&gt;
Previously, when we created tables inside the DLT pipeline, they were all stored under the UC schema configured in the &lt;code&gt;catalog&lt;/code&gt; and &lt;code&gt;target&lt;/code&gt; configuration parameters. However, this wasn&#39;t always desired, as it made permissions management more complex, especially when a single DLT pipeline produced tables for all layers of &lt;a href=&quot;https://www.databricks.com/glossary/data-lakehouse&quot;&gt;Lakehouse architecture&lt;/a&gt; (bronze/silver/gold).
&lt;/p&gt;

&lt;p&gt;
But now &lt;a href=&quot;https://docs.databricks.com/aws/en/delta-live-tables/target-schema#target-a-dataset-in-a-different-catalog-or-schema&quot;&gt;it&#39;s possible to specify where a specific table goes&lt;/a&gt; - it depends on the table name: if the table has a simple name, it will be put into &lt;code&gt;&amp;lt;default-catalog&amp;gt;.&amp;lt;default-schema&amp;gt;.&amp;lt;name&amp;gt;&lt;/code&gt; (the default catalog and schema are defined on the pipeline level); it will go into &lt;code&gt;&amp;lt;default-catalog&amp;gt;.&amp;lt;schema&amp;gt;.&amp;lt;name&amp;gt;&lt;/code&gt; if it has the form &lt;code&gt;&amp;lt;schema&amp;gt;.&amp;lt;name&amp;gt;&lt;/code&gt;; or we can use a fully qualified name like &lt;code&gt;&amp;lt;catalog&amp;gt;.&amp;lt;schema&amp;gt;.&amp;lt;name&amp;gt;&lt;/code&gt;. In any case, it&#39;s best to avoid hardcoding catalog and schema names, and instead either rely on the catalog and schema names specified on the pipeline level or pass catalog and schema names as configuration of the pipeline.
&lt;/p&gt;

&lt;p&gt;
In the provided project I&#39;m passing catalog and schema names for normalized data explicitly, and then forming the fully qualified name like this (it&#39;s done by &lt;code&gt;get_qualified_table_name&lt;/code&gt; function that I defined in &lt;code&gt;helpers.py&lt;/code&gt;):
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;name&lt;/span&gt; = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;test&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;catalog&lt;/span&gt; = spark.conf.get(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;gold_catalog&quot;&lt;/span&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;schema&lt;/span&gt; = spark.conf.get(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;gold_schema&quot;&lt;/span&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;table_name&lt;/span&gt; = f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;{catalog}&lt;span style=&quot;color: #008b00;&quot;&gt;.&lt;/span&gt;{schema}&lt;span style=&quot;color: #008b00;&quot;&gt;.&lt;/span&gt;{name}&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
while for silver tables I use catalog and schema configured on the pipeline level:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjngaa2ZL2i1Y6WhiVtn62yt7NZv8E_80LFv2v8nw_bhub2molSIa7Px9_p7Z5jUH_qV3jTowDjPet0v9fsbxbIO-UVrizaGYndgq-CyhDhfkyf5r08gOvTKemnJFvxJv2DiyQ9imjJwz6DqUlVubsBzWVB66iDZnoRiT6GhnPjzBU_pVop3Bcnkg/s749/cyber-pipeline-dpm.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;405&quot; data-original-width=&quot;749&quot; height=&quot;346&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjngaa2ZL2i1Y6WhiVtn62yt7NZv8E_80LFv2v8nw_bhub2molSIa7Px9_p7Z5jUH_qV3jTowDjPet0v9fsbxbIO-UVrizaGYndgq-CyhDhfkyf5r08gOvTKemnJFvxJv2DiyQ9imjJwz6DqUlVubsBzWVB66iDZnoRiT6GhnPjzBU_pVop3Bcnkg/w640-h346/cyber-pipeline-dpm.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;h3&gt;Using sinks to write to non-Delta destinations or from multiple pipelines&lt;/h3&gt;

&lt;p&gt;
The first two items from the list of limitations above were the most critical for cybersecurity use cases - it should be easy to write to normalized tables from multiple pipelines or write to external systems without additional jobs or complex code.
&lt;/p&gt;

&lt;p&gt;
And now it&#39;s possible with the recently released &lt;a href=&quot;https://docs.databricks.com/aws/en/delta-live-tables/dlt-sinks&quot;&gt;DLT Sinks API&lt;/a&gt; - you can define a sink object pointing to a Delta table defined outside of the pipeline, or even to another data format supporting streaming writes, such as Kafka, or custom data sources.  The usage is very similar to append flows - instead of a streaming table, you define a sink object, and then use it as a target for append flow functions.   For example, here is how we can write to the same Delta table from multiple pipelines:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Create a sink&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;sink_name&lt;/span&gt; = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;http_normalized&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;dlt.create_sink(sink_name, &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;delta&quot;&lt;/span&gt;, {&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;tableName&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;my_catalog.my_schema.http&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;mergeSchema&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;true&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;  }&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;This is in one pipeline&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #3a5fcd;&quot;&gt;@dlt.append_flow&lt;/span&gt;(&lt;/code&gt;
&lt;code&gt;    name=&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;apache_web_normalized&quot;&lt;/span&gt;, &lt;/code&gt;
&lt;code&gt;    target=sink_name&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;write_normalized&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = dlt.read_stream(apache_web_table_name)&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = ... &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;transform data to a normalized form&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;This is in another pipeline&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #3a5fcd;&quot;&gt;@dlt.append_flow&lt;/span&gt;(&lt;/code&gt;
&lt;code&gt;    name=&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;zeek_normalized&quot;&lt;/span&gt;, &lt;/code&gt;
&lt;code&gt;    target=sink_name&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;write_normalized&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = dlt.read_stream(zeek_http_table_name)&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = ... &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;transform data to a normalized form&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
If we want to write to non-Delta destinations, we need to provide all the necessary options for that connector.  For example, here is how you can define a sink for Azure Event Hubs using the Kafka connector bundled with DLT and then write to it using the append flow:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;dlt.create_sink(&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;alerts_eventhubs&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    { &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Create Kafka options dictionary for connection with OAuth authentication&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;: f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;{eh_server}&lt;span style=&quot;color: #008b00;&quot;&gt;:9093&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;        &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;topic&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;alerts&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;        ....&lt;/code&gt;
&lt;code&gt;    }&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #3a5fcd;&quot;&gt;@dlt.append_flow&lt;/span&gt;(&lt;/code&gt;
&lt;code&gt;    name=&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;alerts&quot;&lt;/span&gt;, &lt;/code&gt;
&lt;code&gt;    target=&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;alerts_eventhubs&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;write_alerts&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = dlt.read_stream(detections_table_name)&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;df&lt;/span&gt; = ... &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;transform data to a format supported by Kafka&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And when it&#39;s executed, we can see the data pushed to Azure Event Hubs topic:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgShvZcX16Kw9CvtiOLsg_VrWEsRH4b-VNOoh8y-HQVQAODtyFtVnXFYpEoSV3PE7u0x0dKxh3UYCPFlBdbBdHbpU0UIfhEwQ1ufZtN3E6ZLIo3QlJPLUM7UWgzh2ZHQDzIIRF9WEQOQ7T-9WSuGJHeWShp1YwL4FpiWbS8gjS4bZUujQ8GuFK3cg/s1133/cyber-pipeline-eventhubs.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;313&quot; data-original-width=&quot;1133&quot; height=&quot;176&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgShvZcX16Kw9CvtiOLsg_VrWEsRH4b-VNOoh8y-HQVQAODtyFtVnXFYpEoSV3PE7u0x0dKxh3UYCPFlBdbBdHbpU0UIfhEwQ1ufZtN3E6ZLIo3QlJPLUM7UWgzh2ZHQDzIIRF9WEQOQ7T-9WSuGJHeWShp1YwL4FpiWbS8gjS4bZUujQ8GuFK3cg/w640-h176/cyber-pipeline-eventhubs.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;Technically, the sinks implemented using custom data source APIs (&lt;a href=&quot;https://alexott.blogspot.com/2024/11/spark-custom-data-sources-and-sinks-for.html&quot;&gt;I wrote about them a few months ago&lt;/a&gt;) are supported as well, although there are still limitations related to their support on serverless and to custom libraries inside UC UDFs; they should be fixed soon.  But this code works on a non-serverless pipeline:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;splunk_opts&lt;/span&gt; = {&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;url&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;http://10.1.0.6:8088/services/collector/event&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;token&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;splunk_hec_token&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;time_column&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;detection_time&quot;&lt;/span&gt;,&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;code&gt;dlt.create_sink(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;splunk&quot;&lt;/span&gt;, &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;splunk&quot;&lt;/span&gt;, splunk_opts)&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #3a5fcd;&quot;&gt;@dlt.append_flow&lt;/span&gt;(name = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;write_to_splunk&quot;&lt;/span&gt;, target = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;splunk&quot;&lt;/span&gt;)&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;flowFunc&lt;/span&gt;():&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; dlt.read_stream(detections_table_name)&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And we can see the data in the Splunk interface:&amp;nbsp; &lt;br /&gt;&lt;/p&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi-T_QegmccLo5GXBy6nGh7Fy0819bSRyRYqRVUj6XaI5asYkZcWR3u9anXZKXtKCQ-9zhIFV8rakTmp-Ri12qEC3FceQpWtekR7kCaY9zgjqaQtQxhjwMEwsMuay2UUxlWFjI-2rXaTHFHkpeglpiEExYqfxx9mb3SJg2dYfAqip1NxysrL7qFpQ/s1110/cyber-pipeline-splunk-view.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;500&quot; data-original-width=&quot;1110&quot; height=&quot;288&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi-T_QegmccLo5GXBy6nGh7Fy0819bSRyRYqRVUj6XaI5asYkZcWR3u9anXZKXtKCQ-9zhIFV8rakTmp-Ri12qEC3FceQpWtekR7kCaY9zgjqaQtQxhjwMEwsMuay2UUxlWFjI-2rXaTHFHkpeglpiEExYqfxx9mb3SJg2dYfAqip1NxysrL7qFpQ/w640-h288/cyber-pipeline-splunk-view.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;


&lt;h3&gt;Putting it all together&lt;/h3&gt;

&lt;p&gt;
To demonstrate all these things working together, I created a sample &lt;a href=&quot;https://github.com/alexott/databricks-cybersecurity-playground/tree/main/dlt_modern_stuff&quot;&gt;project available on GitHub&lt;/a&gt;. This project consists of three DLT pipelines that perform data ingestion and parsing, normalization of the schema to &lt;a href=&quot;https://schema.ocsf.io/&quot;&gt;Open Cybersecurity Schema Framework (OCSF)&lt;/a&gt;, and doing rudimentary detection against normalized data as it&#39;s shown on the image below:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Ingestion of Apache Web and Nginx logs into &lt;code&gt;apache_web&lt;/code&gt; table and then normalizing it into an &lt;code&gt;http&lt;/code&gt; table corresponding to &lt;a href=&quot;https://schema.ocsf.io/1.4.0/classes/http_activity?extensions=&quot;&gt;OCSF&#39;s HTTP activity&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Ingestion of Zeek data:
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Zeek HTTP data into &lt;code&gt;zeek_http&lt;/code&gt; table,  and then normalizing it into an &lt;code&gt;http&lt;/code&gt; table corresponding to &lt;a href=&quot;https://schema.ocsf.io/1.4.0/classes/http_activity?extensions=&quot;&gt;OCSF&#39;s HTTP activity&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Zeek Conn data into &lt;code&gt;zeek_conn&lt;/code&gt; table,  and then normalizing it into a &lt;code&gt;network&lt;/code&gt; table corresponding to &lt;a href=&quot;https://schema.ocsf.io/1.4.0/classes/network_activity?extensions=&quot;&gt;OCSF&#39;s Network activity&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;
&lt;li&gt;Detection pipeline that does the following:
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Matches network connections data from &lt;code&gt;network&lt;/code&gt; table against &lt;code&gt;iocs&lt;/code&gt; table (it&#39;s filled with dummy data, just for a demo).&lt;/li&gt;
&lt;li&gt;Checks HTTP logs from &lt;code&gt;http&lt;/code&gt; table for admin page scans from external parties.&lt;/li&gt;
&lt;li&gt;All matches are stored in the &lt;code&gt;detections&lt;/code&gt; table, and optionally pushed to Azure Event Hubs.&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjDprY7ymcwAv8yPpAnzNc4FPGpMPlyYOfaTdS7dYapTdjQ7dVok9EYWbhryongxB4BqsMF1Jseu3JZ4u78llKlxCWSwUqie2kxi-zpEYaUjdR2v3LJ5j9927smHxXj7qOdjE6qpM7rT-LMIt_962ICJUf8ut3EjP72ZXZ9lRu4j1n2U6sDehyphenhyphenjEg/s1466/cyber-pipeline-impl.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;642&quot; data-original-width=&quot;1466&quot; height=&quot;280&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjDprY7ymcwAv8yPpAnzNc4FPGpMPlyYOfaTdS7dYapTdjQ7dVok9EYWbhryongxB4BqsMF1Jseu3JZ4u78llKlxCWSwUqie2kxi-zpEYaUjdR2v3LJ5j9927smHxXj7qOdjE6qpM7rT-LMIt_962ICJUf8ut3EjP72ZXZ9lRu4j1n2U6sDehyphenhyphenjEg/w640-h280/cyber-pipeline-impl.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
Follow the instructions in &lt;a href=&quot;https://github.com/alexott/databricks-cybersecurity-playground/blob/main/dlt_modern_stuff/README.md&quot;&gt;README&lt;/a&gt; to deploy, set up, and run the code.
&lt;/p&gt;

&lt;p&gt;
The execution graph for ingestion of Apache Web logs is quite simple - silver + normalized table:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiqHANebFLVV8FH5YwTP409f2c8IuQckyGBTL0-74IsZdq0TcwsXSvCfJ63Bqbk8xxqV3uLB7139N4uoLJ_AnA2awaK_8s1WKuDxQ_Cd6NGERG-jPCqsaY4BKs5TLn_0T4Ji_SGmaBSmtr86HXkzXDfZGKfCQ9NYEV35ky-Q5ckwL9dBYpswm-2pA/s742/cyber-pipeline-apache.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;177&quot; data-original-width=&quot;742&quot; height=&quot;152&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiqHANebFLVV8FH5YwTP409f2c8IuQckyGBTL0-74IsZdq0TcwsXSvCfJ63Bqbk8xxqV3uLB7139N4uoLJ_AnA2awaK_8s1WKuDxQ_Cd6NGERG-jPCqsaY4BKs5TLn_0T4Ji_SGmaBSmtr86HXkzXDfZGKfCQ9NYEV35ky-Q5ckwL9dBYpswm-2pA/w640-h152/cyber-pipeline-apache.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
The execution graph for Zeek data is a bit more complex, just because we have two different log types and two corresponding normalized tables:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEitKt43rwrSrSPFCFb-rC4daJDCb7az3ROo3-txK6xzNjGeYteTLmIz7Nl7UYAiof53CfgnGsrDcfoOZGHZtSESicdtajfJDJc2xY8BIMTNLhLA0hfpWWumJwSpOLWo24c5nHy0ts6FumCSnMgNpZU3pJzzNNoneMf8AFawm4e8ERJio2TVFIaoYA/s750/cyber-pipeline-zeek.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;387&quot; data-original-width=&quot;750&quot; height=&quot;330&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEitKt43rwrSrSPFCFb-rC4daJDCb7az3ROo3-txK6xzNjGeYteTLmIz7Nl7UYAiof53CfgnGsrDcfoOZGHZtSESicdtajfJDJc2xY8BIMTNLhLA0hfpWWumJwSpOLWo24c5nHy0ts6FumCSnMgNpZU3pJzzNNoneMf8AFawm4e8ERJio2TVFIaoYA/w640-h330/cyber-pipeline-zeek.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
And the execution graph for the detections pipeline is also quite simple (the actual view depends on the pipeline configuration):
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi7zvXhDct_Ev1iZjWVVWnyqaqKD_dylvOv0fPTkmzOVBZ5OMo74LxvApZd96ACJMZXpB-L6ruKP1izrvsAmUA2wVzRM5Es-EXT0aR6iHPMrYTuYi6ym3itoN5jnv4qM60RtsKhdhZ9FS1u2hCA5WW5Lc34Jwt4Zn2zk5rdOzB38Ua12N6_zJhcxA/s761/cyber-pipeline-detections.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;321&quot; data-original-width=&quot;761&quot; height=&quot;270&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi7zvXhDct_Ev1iZjWVVWnyqaqKD_dylvOv0fPTkmzOVBZ5OMo74LxvApZd96ACJMZXpB-L6ruKP1izrvsAmUA2wVzRM5Es-EXT0aR6iHPMrYTuYi6ym3itoN5jnv4qM60RtsKhdhZ9FS1u2hCA5WW5Lc34Jwt4Zn2zk5rdOzB38Ua12N6_zJhcxA/w640-h270/cyber-pipeline-detections.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
The relationships between all objects (UC volumes and tables) across all pipelines are better visible on the data lineage graph:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgi-znwl04U_CdXCkkFBQJuNiAXafvIRbuXp6Ql7c8czLpbPreoswryqR7qnMcsXYjHdjvoEqhHVp1keWEV954qeLiJWyQDdo_KO3-wtcWgqdBSN_mztu0hf0cz3Z85_Oj8lUOf2E9AeYbhvJe3ChYEYAuAsN3W5SvDcSM1LDO9MkfQaXDTgxR50A/s1462/cyber-pipeline-data-lineage.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;774&quot; data-original-width=&quot;1462&quot; height=&quot;338&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgi-znwl04U_CdXCkkFBQJuNiAXafvIRbuXp6Ql7c8czLpbPreoswryqR7qnMcsXYjHdjvoEqhHVp1keWEV954qeLiJWyQDdo_KO3-wtcWgqdBSN_mztu0hf0cz3Z85_Oj8lUOf2E9AeYbhvJe3ChYEYAuAsN3W5SvDcSM1LDO9MkfQaXDTgxR50A/w640-h338/cyber-pipeline-data-lineage.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;h3&gt;Conclusion&lt;/h3&gt;

&lt;p&gt;
The latest changes in the Delta Live Tables helped simplify the implementation of cybersecurity use cases - try DLT if you&#39;re doing cybersecurity on Databricks!  And more functionality is coming soon, stay tuned!
&lt;/p&gt;

</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/2150254748307239520/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/2150254748307239520' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/2150254748307239520'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/2150254748307239520'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2025/03/effective-use-of-latest-dlt-features.html' title='Efficient use of the latest DLT features for cybersecurity use cases'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi1dcpFmeejhbIHZp3XLKVVe3mKSsTzK3iA-gol8o35z-GrK0N-AQDFUGI9KfIqARkXSAmYMcbbaOrFRVVuKvgl7bpJO5vB3DBo02FnD3Dg9GPn2Vr9IZx4DG_6kh-SBE3f8fPOHnjQ_W0XOIBU079zmgC15NMn7iPF-liEymSEWGghyU6GkmVBdQ/s72-w640-h338-c/cyber-pipeline-general.png" height="72" width="72"/><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-1266772667387650035</id><published>2024-12-31T17:18:00.003+01:00</published><updated>2024-12-31T17:18:32.647+01:00</updated><title type='text'>Looking back to 2024th</title><content type='html'>&lt;p&gt;
It will be a New Year in a couple of hours and it&#39;s time for the traditional blog post…
&lt;/p&gt;

&lt;p&gt;
From the professional side, it was the &quot;year of Terraform&quot; with a lot of activity around both Databricks Terraform provider and even the core Terraform.  The total of &lt;a href=&quot;https://github.com/databricks/terraform-provider-databricks/graphs/contributors?from=01%2F01%2F2024&quot;&gt;224 my pull requests&lt;/a&gt; were merged into Databricks provider (😱, I really didn&#39;t realize that there were so many…). A lot of PRs were for &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/experimental-exporter&quot;&gt;Terraform Exporter&lt;/a&gt; adding new resources and improving performance/stability, but besides the exporter, there were many new resources and data sources, bug fixes, doc improvements, etc.  And of course, a lot of time was spent on code reviews, issues triage, working with my colleagues on &lt;a href=&quot;https://github.com/databricks/terraform-databricks-examples/&quot;&gt;Databricks Terraform Examples&lt;/a&gt;, and other stuff.  Quite a lot of activity was around enablement:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;together with &lt;a href=&quot;https://www.linkedin.com/in/vuong-nguyen/&quot;&gt;Vuong Nguyen&lt;/a&gt; we started the year with a public webinar on Terraform best practices (the recording is available in Databricks Academy under the title “Deep Dive into automating your Databricks platform using Terraform”).&lt;/li&gt;
&lt;/ul&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;in the middle of the year, I held an internal session for my colleagues in field engineering to discuss more best practices, troubleshooting, etc.&lt;/li&gt;&lt;li&gt;quite a lot of this content went into the Terraform workshop that we conduct with our customers who are interested in deep dives (contact your Databricks account team if you&#39;re interested!).&lt;/li&gt;&lt;li&gt;and some of the content went into public blog posts: &lt;a href=&quot;https://alexott.blogspot.com/2024/08/terraform-vs-databricks-asset-bundles.html&quot;&gt;Terraform vs. Databricks Asset Bundles&lt;/a&gt; (most popular), &lt;a href=&quot;https://alexott.blogspot.com/2024/09/databricks-sdks-vs-cli-vs-rest-apis-vs.html&quot;&gt;Databricks SDKs vs. CLI vs. REST APIs vs. Terraform provider vs. DABs&lt;/a&gt;, and &lt;a href=&quot;https://alexott.blogspot.com/2024/12/working-with-huge-terraform-states.html&quot;&gt;Working with huge Terraform states&lt;/a&gt;.&lt;ul class=&quot;org-ul&quot;&gt;
&lt;/ul&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
In general, quite a lot of effort was spent around automation (CI/CD, DevOps/DataOps/MLOps, …), cloud infrastructure, security, disaster recovery, etc. - all the things that should be in place to have a robust and secure data and ML platform 😜.
&lt;/p&gt;

&lt;p&gt;
Another big part of my work was concentrated on cybersecurity.  Fun fact - I came to Apache Spark more than ten years ago when I worked at McAfee, and we were building a scalable data processing platform for a new product.  At that time we selected Apache Spark because it had more potential (batch, streaming, ml, …) than other solutions (Storm, …), and time showed that we made the right choice.  Many customers realized that cybersecurity is really a big data problem (three Vs - volume/variety/velocity) and it requires the right technology to solve that problem that isn&#39;t really solvable with existing SIEM solutions.  So this year I worked with my colleagues on helping customers build their cybersecurity solutions on top of Databricks - from data ingestion to real-time and ad-hoc threat detection, reporting on cybersecurity data, and applying ML to that data.  And there were not only end customers - my colleagues and I are helping other software companies to build on top of Databricks.  And Apache Spark also evolves, allowing the easier building of integrations for cybersecurity, i.e., allowing the easier building of customer readers/writers, as it was demoed in a blog post &lt;a href=&quot;https://alexott.blogspot.com/2024/11/spark-custom-data-sources-and-sinks-for.html&quot;&gt;Spark custom data sources and sinks for cybersecurity use cases&lt;/a&gt;.
&lt;/p&gt;

&lt;p&gt;
There was a bit more &lt;a href=&quot;https://github.com/alexott?tab=overview&amp;amp;from=2024-12-01&amp;amp;to=2024-12-31&quot;&gt;GitHub activity&lt;/a&gt; compared to the last year - contributions to different OSS projects, cybersecurity-related (&lt;a href=&quot;https://github.com/alexott/cyber-spark-data-connectors&quot;&gt;custom data sources for Spark&lt;/a&gt;, &lt;a href=&quot;https://github.com/alexott/pySigma-backend-databricks&quot;&gt;pySigma backend for Databricks&lt;/a&gt; that allows converting Sigma rules into Apache Spark queries), a lot of examples for different topics, and many one-time contributions, from code fixes to improving documentation, etc. (I hope that I&#39;ll continue OSS contributions next year as well.)
&lt;br /&gt;&lt;/p&gt;&lt;p&gt;
&lt;/p&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEju8Dzasa8D24I4oskaPoSYlQwauG_q-qw2hvPckE93R1lVQyuYhy1dHjZH-N1pakacbLz1DTrxYewSB8oV2mnMp3bl7eq3ZNDjptKTErlB47E0oiBrIgUz_fjw2wdW2W7g3qPD-KLBLbXPzhJFfJfyg2QMfdLi1RLJRUwAvvOcHvTTpjWOWV27pw/s763/Screenshot%202024-12-31%20at%2016.49.46.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;196&quot; data-original-width=&quot;763&quot; height=&quot;165&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEju8Dzasa8D24I4oskaPoSYlQwauG_q-qw2hvPckE93R1lVQyuYhy1dHjZH-N1pakacbLz1DTrxYewSB8oV2mnMp3bl7eq3ZNDjptKTErlB47E0oiBrIgUz_fjw2wdW2W7g3qPD-KLBLbXPzhJFfJfyg2QMfdLi1RLJRUwAvvOcHvTTpjWOWV27pw/w640-h165/Screenshot%202024-12-31%20at%2016.49.46.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This year was very interesting from a professional standpoint, and I hope that next year will be as well.
&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
I wish my readers a happy and prosperous New Year!
&lt;/p&gt;
&lt;br /&gt;</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/1266772667387650035/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/1266772667387650035' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/1266772667387650035'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/1266772667387650035'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2024/12/looking-back-to-2024th.html' title='Looking back to 2024th'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEju8Dzasa8D24I4oskaPoSYlQwauG_q-qw2hvPckE93R1lVQyuYhy1dHjZH-N1pakacbLz1DTrxYewSB8oV2mnMp3bl7eq3ZNDjptKTErlB47E0oiBrIgUz_fjw2wdW2W7g3qPD-KLBLbXPzhJFfJfyg2QMfdLi1RLJRUwAvvOcHvTTpjWOWV27pw/s72-w640-h165-c/Screenshot%202024-12-31%20at%2016.49.46.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-7644938503571773477</id><published>2024-12-27T13:18:00.004+01:00</published><updated>2024-12-27T13:19:22.218+01:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="terraform"/><title type='text'>Working with huge Terraform states</title><content type='html'>&lt;p&gt;
If you regularly work with Terraform, you should be familiar with the best practices regarding the number of resources in a single state. For example, &lt;a href=&quot;https://cloud.google.com/docs/terraform/best-practices/root-modules#minimize-resources&quot;&gt;Google&#39;s best practices for Terraform&lt;/a&gt; recommend not to include more than 100 resources (and ideally only a few dozen) in a single state.
&lt;/p&gt;

&lt;p&gt;
If you have too many resources in the state it affects many things:
&lt;/p&gt;


&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Planning takes too long. You usually need to perform a state refresh for
 existing resources to check their presence and configuration and decide
 if any changes should be made, and this will happen even for small 
changes. It typically requires performing API calls that are often 
rate-limited, so you can&#39;t get information fast even with high 
parallelism. (For example, a general limit for Databricks APIs is 30 
requests/seconds, and it&#39;s lower for some APIs.)&amp;nbsp; &lt;br /&gt;&lt;/li&gt;&lt;li&gt;A similar problem is with the &lt;span style=&quot;font-family: verdana;&quot;&gt;&lt;span style=&quot;font-size: x-small;&quot;&gt;apply&lt;/span&gt;&lt;/span&gt; - you will most probably be rate-limited when creating/modifying/deleting resources or getting information via data sources.&lt;/li&gt;
&lt;li&gt;Increased blast radius - it&#39;s harder to review changes in big plans, and if something goes wrong, it may affect all resources.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
With correct code organization, splitting code into modules, etc. we can avoid having too many resources in a single state, but it will be a topic for a separate blog post.
&lt;/p&gt;

&lt;p&gt;
In reality, you cannot always follow the best practices, and you may end up having tens or hundreds of thousands of objects in the same state.  In my own practice, I saw this problem in the following Databricks-related cases:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Centralized provisioning of users/groups - typically this happens when existing solutions, such as Microsoft SCIM connector, don&#39;t provide enough flexibility.&lt;/li&gt;
&lt;li&gt;Centralized provisioning of Unity Catalog objects - catalogs, schemas, etc.  Usually, it&#39;s an anti-pattern and is solved by correct code organization, splitting into multiple states, etc.&lt;/li&gt;
&lt;li&gt;Workspace migrations - typically they are done by using the &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/experimental-exporter&quot;&gt;Databricks Terraform Exporter&lt;/a&gt; to generate Terraform code for workspace content, and then applying it to the destination workspace.  However they are usually done once, so the slowness during the plan/apply isn&#39;t a big problem.&lt;/li&gt;
&lt;li&gt;Disaster Recovery (DR) - the usual recommendation is to use Terraform from the beginning to deploy all necessary resources, and follow the best practices on code organization.  But it&#39;s not always possible in some cases, i.e., a customer doesn&#39;t use IaC solutions for deployments, there is a lot of content generated by users working interactively, etc.  In this case, customers try to use Terraform Exporter to periodically generate the code and apply it to a destination workspace. (Native DR implementation is coming soon, so we won&#39;t need this approach anymore).&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
During this year I worked on supporting a disaster recovery solution for a Databricks workspace used by more than ten thousand users to interactively develop code using Databricks Notebooks, SQL queries, and dashboards, and all this work should be replicated daily to a backup workspace, including not only the content, but also permissions and other related things.  A typical approach in this case is to use Git repositories to store the code, and then only repositories should be replicated, so we won&#39;t have too many objects in the Terraform state. But in this specific case, the usage of Git was blocked by the customer&#39;s security team, and we needed to replicate ~400k notebooks, plus necessary objects such as directories in the workspace, plus permissions for both notebooks and directories, increasing the state size to approximately 600k resources.
&lt;/p&gt;

&lt;p&gt;
The implemented solution was quite standard:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Use the Terraform exporter to generate Terraform code and associated objects (like files with notebook code).&lt;/li&gt;
&lt;li&gt;Apply changes to the destination workspace by performing plan/apply.  Because the apply direction was almost always primary -&amp;gt; backup and users didn&#39;t work in the backup workspace until the DR event, we saved a lot of time by skipping refresh in the plan.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
Initially, I concentrated on the first item - making the Exporter run as fast as possible, improving the implementation of the incremental export mode, etc. (I need to write a separate blog post about this part).  But we quickly found that the export phase wasn&#39;t the main problem - it was Terraform itself.  We simply couldn&#39;t run plan/apply fast enough to meet the target SLAs even with increased API limits and Terraform execution parallelism. This led us to look into the Terraform code more deeply and make some changes in the Terraform (and OpenTofu) code to improve the performance. 
&lt;/p&gt;

&lt;p&gt;
The major bottlenecks identified were related to the Terraform architecture and implementation:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;not optimized code - the code in some places is written in a naive style.  In a normal situation, it&#39;s not visible for users because performance problems arise when you&#39;re working with 10th of thousands of resources.  For example, the code in &lt;span style=&quot;font-family: verdana;&quot;&gt;&lt;code&gt;&lt;span style=&quot;font-size: medium;&quot;&gt;AttachResourceConfigTransformer&lt;/span&gt;&lt;/code&gt;&lt;/span&gt; that is called in both &lt;span style=&quot;font-size: medium;&quot;&gt;&lt;code&gt;plan&lt;/code&gt;&lt;/span&gt; and &lt;code&gt;&lt;span style=&quot;font-size: medium;&quot;&gt;apply&lt;/span&gt;&lt;/code&gt; phases had an N&lt;sup&gt;2&lt;/sup&gt; complexity, and with 400k resources it took ~18 hours to execute.  The &lt;a href=&quot;https://github.com/hashicorp/terraform/pull/35088&quot;&gt;fix was relatively small&lt;/a&gt;, but allowed to decrease execution time to just a few minutes.  There are other places where we still have N&lt;sup&gt;2&lt;/sup&gt; complexity, but N there is not a number of resources, but a number of changes, so it was ok for us as a typical change set is just a couple of thousand resources.&lt;/li&gt;
&lt;li&gt;copy the entire state on each change - during the apply phase, each resource instance copies the whole state to make changes in it (deep copy, including every nested value).  With a huge state, occupying dozens or hundred megabytes of memory, it puts a lot of pressure on the Go&#39;s garbage collector, leading to a situation when almost half of the execution time is spent there.  We also identified that there was an extra copy done, leading to the production of more garbage than required.  This extra copying was also &lt;a href=&quot;https://github.com/hashicorp/terraform/commit/e8119cced39bdd9edf0944f35a8c980336c2c6d1&quot;&gt;fixed by the Hashicorp team&lt;/a&gt; decreasing the pressure on garbage collector a bit.  Although the correct solution would be to implement copy-on-write and do it only for affected values, this will be a bigger architectural change, so it was postponed.&lt;/li&gt;
&lt;li&gt;global lock around the state - to make changes to the state a specific resource instance needs to acquire the lock.  In a typical situation, it&#39;s almost not visible to a user, but because it&#39;s coupled with the deep copy (described in the previous item), in case of the huge state this operation is much slower, leading to slow execution even with high execution parallelism.  In our evaluations, we found that with ~600k resources, we can get approximately 2 operations per second, even with parallelism set to the hundred.  Fixing this problem also will require a re-architecture of the Terraform, so we didn&#39;t do anything there.&lt;/li&gt;
&lt;li&gt;checkpointing the state - when the remote state backend is used (as recommended), Terraform performs changes to the state in memory, and then periodically saves it to remote storage.  By default, it&#39;s done every 20 seconds and it&#39;s a &quot;stop the world&quot; operation when no changes are made to the state.  With a huge state size, the overhead of serializing the state as JSON (see the next item) and saving it to remote storage becomes very significant.  Technically it would be best to implement checkpointing as a separate goroutine to perform it asynchronously and not block the execution, but required a lot of work to handle all edge cases, so we went with the simpler solution - made the checkpoint interval configurable.    Now it&#39;s possible to &lt;a href=&quot;https://github.com/hashicorp/terraform/commit/a72d02135bd33d6d581d6e0df6e15882c26e8d20#diff-6627e3489968d07107612d76f924dc2cfc0aa526715a5fef572f55560f6e1912R743&quot;&gt;set the checkpoint interval&lt;/a&gt; via &lt;span style=&quot;font-size: medium;&quot;&gt;&lt;code&gt;TF_STATE_PERSIST_INTERVAL&lt;/code&gt;&lt;/span&gt; environment variable, decreasing the number of &quot;stop the world&quot; operations during the execution (it was ok for our case because operations on notebooks and other workspace objects are idempotent).  Note: the local state backend is much worse as it checkpoints after each change.&lt;/li&gt;
&lt;li&gt;JSON representation of the state - Terraform uses JSON format to save the state.  By default, it uses a pretty-printed representation that it&#39;s easy to read by humans, but it&#39;s very inefficient - the space character that is used for code indenting occupies approximately 20-25% of the total file size, significantly increasing serialized state and as result, upload times (on relatively slow links).  When using compact JSON representation we can significantly decrease state file size and this was &lt;a href=&quot;https://github.com/hashicorp/terraform/pull/35175&quot;&gt;confirmed by the implementation&lt;/a&gt; (unfortunately, the PR is still not accepted into Terraform, only to OpenTofu).&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
With the fixes released as part of Terraform 1.9, we were able to reach our SLAs. With ~600k resources in the state and 3-10k changes per day, the plan takes approximately 1-1.5 hours, and the apply time is one to three hours, depending on the number of changes per day.
&lt;/p&gt;


&lt;h4 style=&quot;text-align: left;&quot;&gt;
Conclusion
&lt;/h4&gt;

&lt;p&gt;
It&#39;s possible (but not recommended) to use Terraform with tens of thousands of resources if you understand how Terraform works, the limitations of architecture, and you can tune it accordingly (checkpoint interval, parallelism, etc.).
&lt;/p&gt;

&lt;p&gt;
One of the general observations was that performance degrades non-linearly, and slowness due to memory copying and other factors starts after 50-70k resources in the state. So, if you can split your huge state into multiple chunks of smaller size that could be applied independently, it will help with the performance of Terraform itself, although you may still hit the limits of APIs used by specific Terraform providers.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/7644938503571773477/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/7644938503571773477' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7644938503571773477'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7644938503571773477'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2024/12/working-with-huge-terraform-states.html' title='Working with huge Terraform states'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-4148409637189933855</id><published>2024-11-24T12:58:00.004+01:00</published><updated>2024-11-24T13:03:45.402+01:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="cybersecurity"/><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="pyspark"/><category scheme="http://www.blogger.com/atom/ns#" term="spark"/><title type='text'>Spark custom data sources and sinks for cybersecurity use cases</title><content type='html'>&lt;p&gt;
It&#39;s very common in cybersecurity that we need to load from different sources (i.e., load data from threat feeds) or write data to external systems (i.e., push data to SIEM/SOAR).  Apache Spark is a great tool for crunching big amounts of cybersecurity data, in a batch or streaming manner.  Out of the box, Spark has built-in data sources and sinks for file-based formats and event streaming systems (such as Kafka), but its integration with other external systems isn&#39;t very trivial. Typically you work with them using REST APIs, and then you need to have different implementations for batch and streaming use cases, mixing that implementation complexity (i.e.,&amp;nbsp; &lt;span style=&quot;font-size: medium;&quot;&gt;&lt;code&gt;foreachBatch&lt;/code&gt;&lt;/span&gt;) with actual business logic.
&lt;/p&gt;

&lt;p&gt;
The upcoming release of Apache Spark 4 includes &lt;a href=&quot;https://spark.apache.org/docs/preview/api/python/user_guide/sql/python_data_source.html&quot;&gt;PySpark DataSource API&lt;/a&gt; (already included in &lt;a href=&quot;https://docs.databricks.com/en/pyspark/datasources.html&quot;&gt;Databricks Runtime 15.3+&lt;/a&gt;) that greatly simplifies the task of integrating with external systems.  Now we can easily add a custom data source implementation and then use it the same way as built-in data sources and sinks - just specify the name of your custom data source in the&amp;nbsp;&lt;span style=&quot;font-size: medium;&quot;&gt;&lt;code&gt;format&lt;/code&gt;&lt;/span&gt;, and your implementation will be called to handle reads or writes (both batch and streaming, if your implementation supports it):
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;MyDataSource&lt;/span&gt;(DataSource):
    @&lt;span style=&quot;color: darkslateblue;&quot;&gt;classmethod&lt;/span&gt;
    &lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;name&lt;/span&gt;(cls):
        &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;my-source&quot;&lt;/span&gt;

    &lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;writer&lt;/span&gt;(&lt;span style=&quot;color: #a020f0;&quot;&gt;self&lt;/span&gt;, schema: StructType, overwrite: &lt;span style=&quot;color: darkslateblue;&quot;&gt;bool&lt;/span&gt;):
        ...


spark.dataSource.register(MyDataSource)
&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;df is read from some source&lt;/span&gt;
df.write.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;my-source&quot;&lt;/span&gt;).mode(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;overwrite&quot;&lt;/span&gt;).save()
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
To play with the new data source APIs I decided to implement sinks for a typical task - push data (such as detections, alerts, etc.) to external systems, such as Splunk (I&#39;m thinking about supporting reads as well).  &lt;a href=&quot;https://github.com/alexott/cyber-spark-data-connectors&quot;&gt;The implementation&lt;/a&gt; is quite simple, but it greatly simplifies integration now - instead of using&amp;nbsp;&lt;span style=&quot;font-size: medium;&quot;&gt;&lt;code&gt;foreachBatch&lt;/code&gt;&lt;/span&gt; with your stream or calling REST API from &lt;span style=&quot;font-family: verdana; font-size: medium;&quot;&gt;&lt;code&gt;mapInPandas&lt;/code&gt;&lt;/span&gt; you can now just say &lt;code&gt;&lt;span style=&quot;font-size: medium;&quot;&gt;.format(&quot;splunk&quot;)&lt;/span&gt;&lt;/code&gt;, and provide necessary options, and the data source implementation will take care of calling necessary APIs, and it works the same for both batch and streaming use cases.
&lt;/p&gt;

&lt;p&gt;
First we need to register our data source:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;from&lt;/span&gt; cyber_connectors &lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; *

spark.dataSource.register(SplunkDataSource)
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And then we can just use it to write data to Splunk providing necessary options such as &lt;span style=&quot;font-size: x-small;&quot;&gt;&lt;span style=&quot;font-family: verdana;&quot;&gt;url&lt;/span&gt;&lt;/span&gt; and &lt;span style=&quot;font-family: verdana;&quot;&gt;&lt;span style=&quot;font-size: x-small;&quot;&gt;token&lt;/span&gt;&lt;/span&gt; so the data source knows where to send data and how to authenticate (see &lt;a href=&quot;https://github.com/alexott/cyber-spark-data-connectors?tab=readme-ov-file#splunk-data-source&quot;&gt;README&lt;/a&gt; for the list of supported options).  For example, I have some Zeek HTTP logs coming as JSON files, and I can easily push them to Splunk:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;span style=&quot;color: sienna;&quot;&gt;dir_name&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;tests/samples/json/&quot;&lt;/span&gt;
&lt;span style=&quot;color: sienna;&quot;&gt;bdf&lt;/span&gt; = spark.read.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;json&quot;&lt;/span&gt;).load(dir_name)  &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;to infer schema - don&#39;t use in prod!&lt;/span&gt;

&lt;span style=&quot;color: sienna;&quot;&gt;sdf&lt;/span&gt; = spark.readStream.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;json&quot;&lt;/span&gt;).schema(bdf.schema).load(dir_name)
&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;apply some filtering here to detect suspicious events&lt;/span&gt;

&lt;span style=&quot;color: sienna;&quot;&gt;stream_options&lt;/span&gt; = {
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;url&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;http://localhost:8088/services/collector/event&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;token&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;....&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;source&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;zeek&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;index&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;zeek&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;host&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;my_host&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;time_column&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;ts&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;checkpointLocation&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;/tmp/splunk-checkpoint/&quot;&lt;/span&gt;
}
&lt;span style=&quot;color: sienna;&quot;&gt;stream&lt;/span&gt; = sdf.writeStream.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;splunk&quot;&lt;/span&gt;) \
  .trigger(availableNow=&lt;span style=&quot;color: darkcyan;&quot;&gt;True&lt;/span&gt;) \
  .options(**stream_options).start()
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And I can see the data in my Splunk instance:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiAgcjVg8tn8BCNGl5lgQaLs-WWlW1x8_ILbdrYlg4p40fPjM2YPF9PcwuWBhSsDemvdj89Tkl3hT4ZV_ppV49hq0N7Xr8T2GgOAxrS_xz7J8q3yIMISXcKAiN_vmBhjuRVnTeIm65O6pObyE3KJt2kHNIpXMIla02HL4eMoZo3_yttnwsPe7y1WA/s968/splunk-zeek.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;968&quot; data-original-width=&quot;838&quot; height=&quot;640&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiAgcjVg8tn8BCNGl5lgQaLs-WWlW1x8_ILbdrYlg4p40fPjM2YPF9PcwuWBhSsDemvdj89Tkl3hT4ZV_ppV49hq0N7Xr8T2GgOAxrS_xz7J8q3yIMISXcKAiN_vmBhjuRVnTeIm65O6pObyE3KJt2kHNIpXMIla02HL4eMoZo3_yttnwsPe7y1WA/w554-h640/splunk-zeek.png&quot; width=&quot;554&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;


&lt;p&gt;
And that&#39;s all! My code is now concentrated on handling my business logic and is not polluted with some implementation details.  If necessary, I can switch to another external system by just changing &lt;code&gt;&lt;span style=&quot;font-size: medium;&quot;&gt;.format(&quot;splunk&quot;)&lt;/span&gt;&lt;/code&gt; to &lt;code&gt;&lt;span style=&quot;font-size: medium;&quot;&gt;.format(&quot;something-else&quot;)&lt;/span&gt;&lt;/code&gt;.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/4148409637189933855/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/4148409637189933855' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/4148409637189933855'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/4148409637189933855'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2024/11/spark-custom-data-sources-and-sinks-for.html' title='Spark custom data sources and sinks for cybersecurity use cases'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiAgcjVg8tn8BCNGl5lgQaLs-WWlW1x8_ILbdrYlg4p40fPjM2YPF9PcwuWBhSsDemvdj89Tkl3hT4ZV_ppV49hq0N7Xr8T2GgOAxrS_xz7J8q3yIMISXcKAiN_vmBhjuRVnTeIm65O6pObyE3KJt2kHNIpXMIla02HL4eMoZo3_yttnwsPe7y1WA/s72-w554-h640-c/splunk-zeek.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-6486036865781172579</id><published>2024-09-16T12:26:00.000+02:00</published><updated>2024-09-16T12:26:18.763+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="devops"/><category scheme="http://www.blogger.com/atom/ns#" term="terraform"/><title type='text'>Databricks SDKs vs. CLI vs. 
REST APIs vs. Terraform provider vs. DABs</title><content type='html'>&lt;p&gt;
The &lt;a href=&quot;https://alexott.blogspot.com/2024/08/terraform-vs-databricks-asset-bundles.html&quot;&gt;previous blog post&lt;/a&gt; about Databricks Terraform provider vs. Databricks Asset Bundles (DABs) was quite successful, but it didn&#39;t cover all possible application areas. So, there were requests for a follow-up post covering other tools, such as Databricks CLI, SDKs, and REST APIs, and when to use them compared to Databricks Terraform provider and DABs.
&lt;/p&gt;

&lt;h2 id=&quot;org9e9b4d9&quot;&gt;Databricks REST API&lt;/h2&gt;

&lt;p&gt;
The &lt;a href=&quot;https://docs.databricks.com/api/&quot;&gt;Databricks REST API&lt;/a&gt; is the foundation for all other tools. All interactions with the Databricks Platform happen via it and you have full control over what you&#39;re doing.  But with the great power, you&#39;re now responsible for handling all nuances of API usage:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Authentication: Multiple authentication methods are supported, but, for example, you need to generate and renew OAuth tokens yourselves.&lt;/li&gt;
&lt;li&gt;Implementation details, like pagination in list API: different APIs use different pagination methods, and you need to understand all the details of each (note: the unification is in progress, but it takes time).&lt;/li&gt;
&lt;li&gt;Error handling: You need to retry the call when you get the HTTP 429 status code (rate limit) and some other situations, or stop processing if you get other, non-retryable errors.&lt;/li&gt;
&lt;li&gt;Some services, such as clusters, model serving, etc., start their objects asynchronously, and you need to wait until they successfully start before declaring success. This could be done by continuous polling, but you shouldn&#39;t overload APIs by polling too often, nor waste time by polling too rarely.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2 id=&quot;orgb64d838&quot;&gt;Databricks SDKs&lt;/h2&gt;
&lt;p&gt;
Databricks provides a number of SDKs for different languages (officially supported are for &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/sdk-go.html&quot;&gt;Go&lt;/a&gt;, &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/sdk-python.html&quot;&gt;Python&lt;/a&gt;, and &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/sdk-java.html&quot;&gt;Java&lt;/a&gt; languages).  All these SDKs are generated from the same source - API specification that describes the whole Databricks REST API surface.  Having SDKs generated from the same source has a big advantage - SDKs get new functionality as soon as new/updated APIs are published.  Another great thing is that APIs and their usage in different languages are quite similar to each other (taking into account language differences), so it&#39;s easier to switch between different languages.
&lt;/p&gt;

&lt;p&gt;
SDKs solve all the problems described above by providing:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Authentication - you can authenticate using all &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/auth/index.html&quot;&gt;supported authentication methods&lt;/a&gt; (PATs, Databricks/Azure/GCP user-to-machine and machine-to-machine OAuth, …).  You can provide authentication parameters either explicitly when creating an API client, via environment variables, or have a mix of them.  SDKs also support using information from configuration profiles defined in Databricks CLI&#39;s configuration file.  And when you&#39;re running it from a Databricks Notebook, you don&#39;t even need to specify any authentication parameters - everything will be configured automatically.&lt;/li&gt;
&lt;li&gt;Abstracting away implementation details, such as pagination implementation, you just call &lt;code&gt;Clusters.ListAll&lt;/code&gt; and don&#39;t worry about what pagination method is used by the specific API.&lt;/li&gt;
&lt;li&gt;Handling retries and errors - SDKs automatically retry the call if it hits rate limits or other conditions that allow the action to be tried again.&lt;/li&gt;
&lt;li&gt;Providing auxiliary methods, such as &lt;code&gt;GetByName&lt;/code&gt; to get an object by its name or &lt;code&gt;WaitGetClusterRunning&lt;/code&gt; to wait for cluster creation - all these methods are generated automatically for most services. But SDKs also include manually written auxiliary methods, such as &lt;code&gt;Clusters.SelectNodeType&lt;/code&gt; or &lt;code&gt;Clusters.SelectSparkVersion&lt;/code&gt;, that allow the building of cloud-agnostic code (similar to &lt;a href=&quot;https://alexott.blogspot.com/2022/11/cloud-agnostic-resources-deployment.html&quot;&gt;this Terraform example&lt;/a&gt;).&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
In general, the use of Databricks SDKs is very simple:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;you create an instance of workspace or account client&lt;/li&gt;
&lt;li&gt;you use methods of specific service exposed by the client - clusters, jobs, etc.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
Here is a simple example of listing all jobs in the workspace using Python SDK (authentication parameters will be taken from the notebook environment or environment variables):
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;from&lt;/span&gt; databricks.sdk &lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; WorkspaceClient&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;w&lt;/span&gt; = WorkspaceClient()&lt;/code&gt;
&lt;code&gt;&lt;span style=&quot;color: sienna;&quot;&gt;job_list&lt;/span&gt; = w.jobs.&lt;span style=&quot;color: darkslateblue;&quot;&gt;list&lt;/span&gt;(expand_tasks=&lt;span style=&quot;color: darkcyan;&quot;&gt;False&lt;/span&gt;)&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
More complex examples could be found in &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/sdk-go.html#examples&quot;&gt;documentation&lt;/a&gt; and in &lt;a href=&quot;https://github.com/databrickslabs/sandbox&quot;&gt;Databricks Labs Sandbox&lt;/a&gt; repository.
&lt;/p&gt;

&lt;h2 id=&quot;org836b82a&quot;&gt;Databricks CLI&lt;/h2&gt;

&lt;p&gt;
&lt;a href=&quot;https://docs.databricks.com/en/dev-tools/cli/index.html&quot;&gt;Databricks CLI&lt;/a&gt; is built on top of Databricks Go SDK and provides an easy-to-use interface to interact with Databricks Platform from the command line (on both workspace and account levels).  CLI is also a home for &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/bundles/index.html&quot;&gt;Databricks Asset Bundles&lt;/a&gt; that greatly simplify deployment and promotion of the code and other assets to the Databricks Platform.
&lt;/p&gt;

&lt;p&gt;
Because it&#39;s built on top of Go SDK, it inherits all its capabilities but provides an easier-to-use interface to perform specific tasks - create or start clusters, list jobs, etc. That&#39;s ideal for one-time use, or for scripting.  But we still need to take care of providing the right payload, such as JSON-encoded cluster or job specification, etc., the same as for the corresponding REST APIs.  These payloads could be quite complex, and not so portable if we talk about references to cluster policies, instance pools, DLT pipelines, or other &quot;external&quot; references.  For such cases, it&#39;s better to use DABs or the Databricks Terraform provider to define the environment consisting of multiple objects and deploy them in the right order, with references, etc.
&lt;/p&gt;

&lt;p&gt;
One great part of Databricks CLI is the ability to define a configuration profile - a named entity describing a specific environment - primarily these are authentication parameters, like, host, token, etc., but it&#39;s possible to specify other configurations as well.  After the profile is defined we can easily use that configuration by specifying only its name, without the need to specify all parameters together.  I.e., it&#39;s easy to export workspace objects (notebooks, workspace files, etc.) from one workspace and import them into another using the following command:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-shell&quot;&gt;&lt;code&gt;databricks --profile ws1 workspace export-dir -o &lt;span style=&quot;color: #008b00;&quot;&gt;&#39;/Users/...&#39;&lt;/span&gt; local-dir &amp;amp;&amp;amp; &lt;span style=&quot;color: #008b00;&quot;&gt;\&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;    databricks --profile ws2 workspace import-dir -o local-dir &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;/...&quot;&lt;/span&gt;&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
Profiles can also be used by SDKs and Terraform providers, making it easy to reuse the same code by specifying environment variables to specify which profile should be used instead of hardcoding configuration in the code or specifying multiple environment variables.
&lt;/p&gt;

&lt;h2 id=&quot;orgd0ace94&quot;&gt;When to use what?&lt;/h2&gt;

&lt;p&gt;
To decide what tool to use I typically ask myself a very simple question - what do I want to achieve?
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;If I want to define some &quot;environment&quot; (especially a complex one, consisting of multiple objects, like a job with multiple tasks of different types), and keep its configuration up to date - then use DABs or Terraform.  These tools will take care of tracking what objects are already created, what configuration they have, etc., and make changes if necessary to bring them to the desired state.   DABs provide additional functionality on top of that, like starting a DLT pipeline or job and waiting for its execution (this isn&#39;t available in Terraform by default).&lt;/li&gt;
&lt;li&gt;If you need to perform some action - use Databricks CLI or SDKs for the language of your choice.  (These actions are typically stateless):
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;CLI is ideal for one-time actions, like, start cluster, list jobs, etc.  As soon as you need to implement more complex logic, you will start to chain CLI calls using shell, and it will become an unsupported mess (believe me, I wrote and supported huge shell scripts ;-)&lt;/li&gt;
&lt;li&gt;SDKs are ideal for implementing complex logic - you can use the full power of selected programming language with abstractions provided by SDKs.  With SDKs, it&#39;s easy to implement custom tasks, i.e., &lt;a href=&quot;https://github.com/alexott/databricks-playground/tree/main/pause-unpause-jobs&quot;&gt;find all scheduled/triggered jobs, and pause/unpause them&lt;/a&gt;, or &lt;a href=&quot;https://github.com/alexott/databricks-playground/tree/main/deactivate-activate-users-sps&quot;&gt;deactivate/reactivate all non-admin users/service principals in the workspace&lt;/a&gt;, etc.  See more examples in the &lt;a href=&quot;https://github.com/databrickslabs/sandbox&quot;&gt;Databricks Labs Sandbox&lt;/a&gt; repository.&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
The direct use of the Databricks REST APIs &lt;i&gt;&lt;b&gt;should be the last resort&lt;/b&gt;&lt;/i&gt; due to the need to handle authentication, retries, implementation details (i.e., pagination), etc. yourself.  Although there are still cases when you can select to use REST APIs:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;There is no SDK for your language. The best approach would be to raise a request to add one for your language of choice, and use REST API directly until it&#39;s available.  With correct design of your programs, you can easily swap your direct implementation with SDKs.&lt;/li&gt;
&lt;li&gt;SDKs don&#39;t provide the necessary functionality yet - typically this happens with APIs that are in the private preview, so the API specification isn&#39;t updated yet.  In this case, you can still use CLI and SDKs - they provide a raw interface to REST APIs, handling things like authentication and error handling/retries:
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;For the CLI, the raw interface is available as &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/cli/api-commands.html&quot;&gt;databricks api&lt;/a&gt; commands.&lt;/li&gt;
&lt;li&gt;SDKs provide a low-level API client that is used by both workspace and account-level clients under the hood.  For example, the Python SDK has the &lt;a href=&quot;https://github.com/databricks/databricks-sdk-py/blob/main/databricks/sdk/core.py#L27&quot;&gt;ApiClient class&lt;/a&gt; that can be used to call an arbitrary REST API.&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;h2 id=&quot;org61dc407&quot;&gt;Conclusion&lt;/h2&gt;

&lt;p&gt;
I hope that this blog post will help you identify and start using the right tool for your Databricks automation journey. I would be really grateful for your feedback!
&lt;/p&gt;

</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/6486036865781172579/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/6486036865781172579' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/6486036865781172579'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/6486036865781172579'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2024/09/databricks-sdks-vs-cli-vs-rest-apis-vs.html' title='Databricks SDKs vs. CLI vs. REST APIs vs. Terraform provider vs. DABs'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-8699962226684466945</id><published>2024-08-01T15:10:00.000+02:00</published><updated>2024-08-01T15:10:00.285+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="devops"/><category scheme="http://www.blogger.com/atom/ns#" term="terraform"/><title type='text'>Terraform vs. 
Databricks Asset Bundles</title><content type='html'>&lt;p&gt;I often get questions from customers and my colleagues: We have&amp;nbsp; &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs&quot; target=&quot;_blank&quot;&gt;Databricks Terraform Provider&lt;/a&gt; and &lt;a href=&quot;https://docs.databricks.com/en/dev-tools/bundles/index.html&quot;&gt;Databricks Asset Bundles&lt;/a&gt; (DABs), and they have overlapping functionality—what should we use for deploying my data processing and machine learning pipelines and what should we use for deploying the infrastructure? I recently presented internally, and as part of this presentation, I tried to formulate specific guidance on that topic...&lt;/p&gt;&lt;h2 style=&quot;text-align: left;&quot;&gt;Typical Challenges when using Terraform&lt;/h2&gt;&lt;p&gt;One of the most significant challenges with Terraform is that many data engineers, data scientists, and machine learning engineers are not familiar with it. Terraform is predominantly a tool used by DevOps and infrastructure teams, and its steep learning curve can be a barrier for those who primarily work with data and machine learning models. This lack of familiarity often leads to a reliance on DevOps teams to manage infrastructure, which can slow down the development process.&lt;br /&gt;&lt;br /&gt;Managing Terraform code across multiple environments (development, staging, production) requires careful planning and organization. The need to modularize code and create environment-specific configurations adds complexity. Tools like Terragrunt can help by providing a wrapper that simplifies some of these tasks, but it is not a perfect solution and still requires significant setup and maintenance. 
Often, customers end up relying on pre-built templates provided by their DevOps teams, which can limit flexibility and autonomy for developers.&lt;br /&gt;&lt;br /&gt;Terraform requires a state file to keep track of the resources it manages. When deploying from CI/CD pipelines, this state must be stored somewhere accessible, typically in cloud storage. However, managing permissions and access to this state file can be problematic, especially in large organizations with stringent security policies. Issues with state management can lead to failed deployments and require manual intervention, further complicating the deployment process.&lt;br /&gt;&lt;/p&gt;&lt;h2 style=&quot;text-align: left;&quot;&gt;&amp;nbsp;How DABs Solve These Pain Points&lt;/h2&gt;&lt;p&gt;DABs allow users to specify multiple environments (development, staging, production) in a single configuration file. This streamlined approach reduces the need for extensive modularization and environment-specific code. Additionally, the -t switch enables easy deployment to different environments by overriding environment-specific parameters, making it straightforward to integrate into CI/CD pipelines.&lt;br /&gt;Databricks Asset Bundles (DABs) use Terraform under the hood, but they abstract away much of the complexity. This means that data engineers, data scientists, and ML engineers can deploy infrastructure without needing deep knowledge of Terraform. By simplifying the interface, DABs make it easier for these professionals to manage their own infrastructure needs.&lt;br /&gt;DABs handle state management by using workspace files to store the Terraform state. This approach eliminates the need for dedicated cloud storage and simplifies permission management. 
With DABs, developers do not have to worry about where and how to store state files, reducing the potential for deployment issues related to state management.&lt;br /&gt;By addressing the above challenges, DABs reduce the load on infrastructure teams and provide more autonomy to developers. This autonomy allows data professionals to implement integration tests and manage their own deployments without heavy reliance on DevOps teams, leading to faster development cycles and more efficient workflows.&lt;br /&gt;&lt;/p&gt;&lt;h2 style=&quot;text-align: left;&quot;&gt;DABs vs. Terraform - when to use what&lt;br /&gt;&lt;/h2&gt;&lt;p&gt;If your organization does not have a robust DevOps framework in place or if your engineering team is not well-versed in Terraform, adopting DABs can be highly beneficial. DABs provide a more accessible and streamlined way to manage infrastructure, allowing data professionals to focus on their core tasks without being bogged down by infrastructure complexities.&lt;br /&gt;&lt;/p&gt;&lt;h3 style=&quot;text-align: left;&quot;&gt;When to Use Terraform&lt;/h3&gt;&lt;p&gt;Terraform remains a powerful tool for managing large-scale infrastructure and is well-suited for the following tasks:&lt;br /&gt;&lt;/p&gt;&lt;ul style=&quot;text-align: left;&quot;&gt;&lt;li&gt;&lt;b&gt;Deployment of Workspaces and Related Cloud Infrastructure&lt;/b&gt;: Use Terraform to set up foundational components like workspaces and the associated cloud resources.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Assignment of Groups/Users/Service Principals to Workspaces&lt;/b&gt;: Manage access control and user assignments with Terraform to ensure secure and organized access to resources.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Deployment of Workspace-Level Resources&lt;/b&gt;: Terraform is ideal for deploying shared resources such as cluster policies, groups, and permissions at the workspace level.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Management of Major Unity Catalog Objects&lt;/b&gt;: Deploy and manage 
essential catalog objects like metastore, catalogs, and grants with Terraform for a structured data governance framework.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;&lt;/p&gt;&lt;h3 style=&quot;text-align: left;&quot;&gt;When to Use DABs&lt;/h3&gt;&lt;p&gt;DABs are particularly effective for managing project-level artifacts and promoting them between environments. Consider using DABs for:&lt;br /&gt;&lt;/p&gt;&lt;ul style=&quot;text-align: left;&quot;&gt;&lt;li&gt;&lt;b&gt;Deployment of Project-Level Artifacts&lt;/b&gt;: DABs can deploy data pipelines, workflows, and other project-specific resources. Although not all resources are currently supported, DABs provide a straightforward way to manage these artifacts.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Environment Promotion and CI/CD Integration&lt;/b&gt;: DABs excel at promoting artifacts between environments and integrating them into CI/CD pipelines, simplifying the process of moving changes from development to production.&lt;/li&gt;&lt;/ul&gt;&lt;h2 style=&quot;text-align: left;&quot;&gt;Conclusion&lt;br /&gt;&lt;/h2&gt;&lt;p&gt;In summary, while Terraform is a robust tool for infrastructure management, DABs offer a more accessible and streamlined approach for data professionals. 
By leveraging the strengths of both tools, organizations can optimize their infrastructure management processes and empower their teams to work more efficiently.&lt;br /&gt;&lt;br /&gt;&lt;/p&gt;</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/8699962226684466945/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/8699962226684466945' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/8699962226684466945'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/8699962226684466945'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2024/08/terraform-vs-databricks-asset-bundles.html' title='Terraform vs. Databricks Asset Bundles'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-2755981010880887659</id><published>2023-12-31T15:06:00.000+01:00</published><updated>2023-12-31T15:06:19.097+01:00</updated><title type='text'>Traditional New Year post, 2023rd edition</title><content type='html'>&lt;p&gt;
Today is the last day of the year, and it&#39;s time for a traditional blog post with a review of the year.
&lt;/p&gt;

&lt;p&gt;
From the professional side, it was another busy but very interesting year with many activities across multiple areas.  For me it was primarily cloud infrastructure, security, all things automation, disaster recovery, migrations, and related areas.  I tried to reflect on this in my &lt;a href=&quot;https://www.linkedin.com/pulse/3-years-databricks-alex-ott/&quot;&gt;post on three years at Databricks&lt;/a&gt; published on LinkedIn, and it&#39;s also visible from the range of topics of &lt;a href=&quot;https://www.databricks.com/blog/author/alex-ott&quot;&gt;blog posts published this year&lt;/a&gt;.
&lt;/p&gt;

&lt;p&gt;
From my point of view the automation (cloud infra, security, DevOps &amp;amp; CI/CD, &amp;#x2026;) is a critical part of the project&#39;s success, and this was one of the most significant parts of my work.  Terraform is a robust tool for automation, and I did spend a considerable amount of time on the related work:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;More than 150 pull requests were merged into &lt;a href=&quot;https://github.com/databricks/terraform-provider-databricks&quot;&gt;Databricks Terraform provider&lt;/a&gt; - not only the new functionality or bug fixes but also quite a lot of work was done on &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/experimental-exporter&quot;&gt;Terraform exporter&lt;/a&gt; that is heavily used for environment migrations and disaster recovery projects.&lt;/li&gt;
&lt;li&gt;In May &lt;a href=&quot;https://www.databricks.com/blog/announcing-terraform-databricks-modules&quot;&gt;we announced Terraform modules for Databricks&lt;/a&gt; - reusable code that helps customers to build their Databricks infrastructure faster, and we&#39;re working on including more modules so customers will be able just to combine necessary pieces to get their infrastructure ready to use.&lt;/li&gt;
&lt;li&gt;A lot of internal work on enablement around Terraform adoption - some parts of it will be presented in the &lt;a href=&quot;https://pages.databricks.com/databricks-specialist-sessions.html&quot;&gt;upcoming webinar&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
Besides Terraform, quite a lot of work (PRs, GH issues, &amp;#x2026;) was done with the engineering team responsible for the developer ecosystem - new Databricks SDKs for Go and Python languages and the new Databricks CLI.  With these new tools, it&#39;s much easier to develop additional tools for Databricks (like &lt;a href=&quot;https://github.com/databrickslabs/sandbox/tree/main/ip_access_list_analyzer&quot;&gt;this&lt;/a&gt;) or automate some boring tasks.
&lt;/p&gt;

&lt;p&gt;
This year, a few projects related to cybersecurity kicked off, and hopefully, we&#39;ll get more work in this area where I have significant experience and where Databricks and Apache Spark are the natural fit. Modern cybersecurity is a big data domain with challenges around large-scale real-time data processing, data normalization, threat detection, and reporting.  Technologies like Delta Live Tables not only simplify development and deployment of scalable data processing pipelines, but they also include features like &lt;a href=&quot;https://www.databricks.com/blog/2022/12/08/build-reliable-and-cost-effective-streaming-data-pipelines.html&quot;&gt;enhanced autoscaling&lt;/a&gt; that allow to automatically scale pipelines up and down, providing a cost-efficient way of handling spiky workloads that are natural for cybersecurity (we had those challenges back at McAfee).
&lt;/p&gt;

&lt;p&gt;
In February, Databricks celebrated ten years, and attending the company kick-off event in Las Vegas was interesting.  For me, it was a chance to finally meet people in person after working with many of them for 2.5 years.  It was also the first long-distance business trip since the pandemic began almost three years ago.  Although frankly speaking, I can&#39;t say that I miss these trips - it&#39;s interesting to meet people, but travel takes too much time, so I need to wait for teleportation :-)
&lt;/p&gt;

&lt;p&gt;
With all this, I&#39;m looking forward to what the new year will bring.  And I wish a happy New Year to all!
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/2755981010880887659/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/2755981010880887659' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/2755981010880887659'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/2755981010880887659'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2023/12/traditional-new-year-post-2023rd-edition.html' title='Traditional New Year post, 2023rd edition'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-6297639211475240882</id><published>2023-10-28T13:00:00.005+02:00</published><updated>2025-07-03T09:07:30.829+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="delta live tables"/><category scheme="http://www.blogger.com/atom/ns#" term="dlt"/><category scheme="http://www.blogger.com/atom/ns#" term="eventhubs"/><title type='text'>Delta Live Tables recipes: Consuming from Azure Event Hubs using OAuth 2.0/OIDC authentication</title><content type='html'>&lt;p&gt;Last year,&lt;a href=&quot;https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html&quot;&gt;I blogged&lt;/a&gt; about consuming data from the Azure Event Hubs with Delta Live Tables (DLT). 
That blog post showed how to do that using Apache Kafka client that is bundled together with Databricks Runtime that is used by DLT.&lt;/p&gt;

&lt;p&gt;That example used Shared Access Signatures (SAS) generated for a specific Event Hubs namespace or a topic. However, in many organizations, the use of SAS is prohibited because it’s a long-living token that is potentially risky to use. Instead, it’s recommended to use short-living tokens of service principals that need to be &lt;a href=&quot;https://learn.microsoft.com/en-us/entra/identity-platform/v2-oauth2-client-creds-grant-flow&quot;&gt;generated according to the OIDC/OAuth 2.0 specification&lt;/a&gt;. These tokens need to be periodically refreshed, which should be done automatically by a consumer.&lt;/p&gt;

&lt;p&gt;Before Databricks Runtime 12.2 was released earlier this year, DBR versions were using 2.x versions of Apache Kafka clients that didn’t support OAuth/OIDC authentication, so I even created a &lt;a href=&quot;https://github.com/alexott/databricks-playground/tree/main/kafka-eventhubs-aad-auth&quot;&gt;simple library&lt;/a&gt; that could be used with Databricks clusters to generate and refresh OAuth tokens. But we still had a problem using it on DLT as we can’t attach jar libraries to the DLT pipeline.&lt;/p&gt;

&lt;p&gt;Things had changed in DBR 12.2, which upgraded the Apache Kafka clients library, and it now has built-in support for OAuth 2.0/OIDC authentication flows (see &lt;a href=&quot;https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=186877575&quot;&gt;KIP-768&lt;/a&gt; for more details), so it’s now just a matter of correct configuration to start consuming from the Azure Event Hubs topic using an Azure service principal.&amp;nbsp; To make it work, we need a service principal ID, secret, and Azure Tenant ID - using this data, we can construct the correct SASL configuration string. We also need to grant the service principal a corresponding role on Azure Event Hubs (“ Azure Event Hubs Data Receiver” for reading data or “Azure Event Hubs Data Sender” for writing data).&amp;nbsp;&lt;/p&gt;

&lt;p&gt;The complete example of a DLT pipeline that consumes from an Event Hubs topic looks as follows:&lt;/p&gt;

    &lt;pre&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; pyspark.sql.functions &lt;span style=&quot;color: #a020f0;&quot;&gt;as&lt;/span&gt; F
&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; dlt

&lt;span style=&quot;color: #a0522d;&quot;&gt;topic&lt;/span&gt; = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&amp;lt;topic&amp;gt;&quot;&lt;/span&gt;
&lt;span style=&quot;color: #a0522d;&quot;&gt;eh_namespace_name&lt;/span&gt; = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&amp;lt;eh_namespace_name&amp;gt;&quot;&lt;/span&gt;
&lt;span style=&quot;color: #a0522d;&quot;&gt;eh_server&lt;/span&gt; = f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;{eh_namespace_name}&lt;span style=&quot;color: #008b00;&quot;&gt;.servicebus.windows.net&quot;&lt;/span&gt;

&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Data for service principal are stored in the secret scope
&lt;/span&gt;&lt;span style=&quot;color: #a0522d;&quot;&gt;tenant_id&lt;/span&gt; = dbutils.secrets.get(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;scope&quot;&lt;/span&gt;, &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;tenant_id&quot;&lt;/span&gt;)
&lt;span style=&quot;color: #a0522d;&quot;&gt;client_id&lt;/span&gt; = dbutils.secrets.get(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;scope&quot;&lt;/span&gt;, &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;sp-id&quot;&lt;/span&gt;)
&lt;span style=&quot;color: #a0522d;&quot;&gt;client_secret&lt;/span&gt; = dbutils.secrets.get(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;scope&quot;&lt;/span&gt;, &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;sp-secret&quot;&lt;/span&gt;)
&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Generate SASL configuration string (it&#39;s split to fit into the screen)
&lt;/span&gt;&lt;span style=&quot;color: #a0522d;&quot;&gt;sasl_config&lt;/span&gt; = f&lt;span style=&quot;color: #008b00;&quot;&gt;&#39;kafkashaded.org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule&#39;&lt;/span&gt; + \
  f&lt;span style=&quot;color: #008b00;&quot;&gt;&#39; required clientId=&quot;&lt;/span&gt;{client_id}&lt;span style=&quot;color: #008b00;&quot;&gt;&quot; clientSecret=&quot;&lt;/span&gt;{client_secret}&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&#39;&lt;/span&gt; + \
  f&lt;span style=&quot;color: #008b00;&quot;&gt;&#39; scope=&quot;https://&lt;/span&gt;{eh_server}&lt;span style=&quot;color: #008b00;&quot;&gt;/.default&quot; ssl.protocol=&quot;SSL&quot;;&#39;&lt;/span&gt;

&lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;Create Kafka options dictionary
&lt;/span&gt;&lt;span style=&quot;color: #a0522d;&quot;&gt;callback_class&lt;/span&gt; = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafkashaded.org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler&quot;&lt;/span&gt;
&lt;span style=&quot;color: #a0522d;&quot;&gt;oauth_endpoint&lt;/span&gt; = f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;https://login.microsoft.com/&lt;/span&gt;{tenant_id}&lt;span style=&quot;color: #008b00;&quot;&gt;/oauth2/v2.0/token&quot;&lt;/span&gt;
&lt;span style=&quot;color: #a0522d;&quot;&gt;kafka_options&lt;/span&gt; = {
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;: f&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;&lt;/span&gt;{eh_server}&lt;span style=&quot;color: #008b00;&quot;&gt;:9093&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;subscribe&quot;&lt;/span&gt;: topic,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;startingOffsets&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;earliest&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.security.protocol&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;SASL_SSL&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.sasl.mechanism&quot;&lt;/span&gt;: &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;OAUTHBEARER&quot;&lt;/span&gt;,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.sasl.jaas.config&quot;&lt;/span&gt;: sasl_config,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.sasl.oauthbearer.token.endpoint.url&quot;&lt;/span&gt;: oauth_endpoint,
  &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka.sasl.login.callback.handler.class&quot;&lt;/span&gt;: callback_class,
}

&lt;span style=&quot;color: #0000ee;&quot;&gt;@dlt.table&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: #b22222;&quot;&gt;bronze&lt;/span&gt;():
    &lt;span style=&quot;color: #a0522d;&quot;&gt;df&lt;/span&gt; = spark.readStream.&lt;span style=&quot;color: #483d8b;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;).options(**kafka_options).load()
    &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; df.withColumn(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;value&quot;&lt;/span&gt;, F.col(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;value&quot;&lt;/span&gt;).cast(&lt;span style=&quot;color: #008b00;&quot;&gt;&quot;string&quot;&lt;/span&gt;))
&lt;/pre&gt;

&lt;p&gt;The only change necessary to make it work on Databricks is to prepend kafkashaded to the class names because the Apache Kafka client is shaded.&lt;/p&gt;</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/6297639211475240882/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/6297639211475240882' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/6297639211475240882'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/6297639211475240882'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2023/10/delta-live-tables-recipes-consuming.html' title='Delta Live Tables recipes: Consuming from Azure Event Hubs using OAuth 2.0/OIDC authentication'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-7600517521325239909</id><published>2022-12-31T16:31:00.002+01:00</published><updated>2023-01-01T14:28:56.384+01:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><title type='text'>Looking back to 2022nd</title><content type='html'>&lt;p&gt;
It&#39;s the last day of the year, and it&#39;s time to write a traditional &quot;year in review&quot; blog post.
&lt;/p&gt;

&lt;p&gt;
On professional side it was very intensive &amp;amp; interesting year. I&#39;m still working with customers, although my role has changed a bit - now I belong to a group of specialist solution architects, working with customers on advanced use cases in specific areas.  For me it&#39;s an interesting mix of data engineering, platform, security, data governance, devops, cybersecurity, …, and ability to work with big enterprise customers. Work with customers was tightly connected with other activities - blogging, internal &amp;amp; external knowledge sharing, contributing to internal &amp;amp; open source projects, working with product teams in releasing new functionality, etc. 
&lt;/p&gt;

&lt;p&gt;
The significant amount of work was done for &lt;a href=&quot;https://github.com/databricks/terraform-provider-databricks&quot;&gt;Databricks Terraform provider&lt;/a&gt;.  The most significant event was that &lt;a href=&quot;https://www.databricks.com/blog/2022/06/22/databricks-terraform-provider-is-now-generally-available.html&quot;&gt;Databricks Terraform provider reached version 1.0 and became a fully supported part of Databricks portfolio&lt;/a&gt;, and continues to be a &lt;a href=&quot;https://www.linkedin.com/feed/update/urn:li:activity:7009602440190676994/&quot;&gt;very popular tool between Databricks customers&lt;/a&gt;. Although the provider now is a part of the product, the field team continues actively contributing to its functionality - knowing how people are using it is a very important aspect of developing tools for end-users.  From my side, during the year there were more than 80 merged pull requests, with quite a bit of work in the last months on the &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/experimental-exporter&quot;&gt;exporter functionality&lt;/a&gt; that allows users to quickly start to maintain existing Databricks resources with Terraform.
&lt;/p&gt;

&lt;p&gt;
Databricks Terraform provider wasn&#39;t the only open source contribution this year.  In the first half of the year I had a possibility to continue contributions to Apache Airflow, not only fixing bugs or improving existing Airflow operators, but also adding new functionality, like &lt;a href=&quot;https://www.databricks.com/blog/2022/04/29/build-data-and-ml-pipelines-more-easily-with-databricks-and-apache-airflow.html&quot;&gt;support for Databricks SQL&lt;/a&gt; that simplifies data ingestion from different data sources into Delta Lake tables.  Plus there were many contributions to projects under the &lt;a href=&quot;https://github.com/databrickslabs/&quot;&gt;Databricks Labs&lt;/a&gt; &amp;amp; &lt;a href=&quot;https://github.com/orgs/databricks/repositories&quot;&gt;Databricks&lt;/a&gt;  umbrellas, and quite a lot of work (code samples/demos/…) inside &lt;a href=&quot;https://github.com/alexott?tab=repositories&quot;&gt;personal repositories&lt;/a&gt;…
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjyVAT7NFRXgW7Lz6ZfEHVZmLkbdCCj4UIgcE3qx_bkB9YoeB-EdSNvV4Qo3StVF_Cx57-Wr8kYM2v40nfDIdbsk-VbBMyFGodXuZkLkMCQO1Dh4reOAdxWuKc-Sk5OHCncueWEdp7YIT1cuzTDY-gU2_0z8X9xJpZLYuMnVRGwpyxutN-tVB4/s824/%D0%A1%D0%BD%D0%B8%D0%BC%D0%BE%D0%BA%20%D1%8D%D0%BA%D1%80%D0%B0%D0%BD%D0%B0%202022-12-31%20%D0%B2%2015.03.54.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;209&quot; data-original-width=&quot;824&quot; height=&quot;162&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjyVAT7NFRXgW7Lz6ZfEHVZmLkbdCCj4UIgcE3qx_bkB9YoeB-EdSNvV4Qo3StVF_Cx57-Wr8kYM2v40nfDIdbsk-VbBMyFGodXuZkLkMCQO1Dh4reOAdxWuKc-Sk5OHCncueWEdp7YIT1cuzTDY-gU2_0z8X9xJpZLYuMnVRGwpyxutN-tVB4/w640-h162/%D0%A1%D0%BD%D0%B8%D0%BC%D0%BE%D0%BA%20%D1%8D%D0%BA%D1%80%D0%B0%D0%BD%D0%B0%202022-12-31%20%D0%B2%2015.03.54.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
This year I tried to return to blogging.  Besides &lt;a href=&quot;https://alexott.blogspot.com/2022/&quot;&gt;publishing in the personal blog&lt;/a&gt;, I managed to co-author &lt;a href=&quot;https://www.databricks.com/blog/author/alex-ott&quot;&gt;five blog posts in the company blog&lt;/a&gt; on different topics.  I&#39;m planning to continue writing in both blogs, already having a few drafts in the works.
&lt;/p&gt;

&lt;p&gt;
Continuing to &lt;a href=&quot;https://stackoverflow.com/users/18627/alex-ott&quot;&gt;answer on StackOverflow&lt;/a&gt; was another form of external knowledge sharing about all things Databricks, Delta Lake, Apache Spark, etc. and sometimes I hear from customers that they know me because of answers.  This year I managed to get a gold badge (score of 1000) for the &lt;a href=&quot;https://stackoverflow.com/questions/tagged/databricks&quot;&gt;databricks&lt;/a&gt; tag.
&lt;/p&gt;

&lt;p&gt;
Another thing that I managed to do this year is to get back to more cybersecurity-related work - the area where I have good practical experience.  It was in the different forms - two blog posts (&lt;a href=&quot;https://www.databricks.com/blog/2022/07/19/building-a-cybersecurity-lakehouse-for-crowdstrike-falcon-events-part-ii.html&quot;&gt;1&lt;/a&gt;, &lt;a href=&quot;https://www.databricks.com/blog/2022/12/16/building-cybersecurity-lakehouse-crowdstrike-falcon-events-part-iii.html&quot;&gt;2&lt;/a&gt;) about working with CrowdStrike data in the company blog, &lt;a href=&quot;https://alexott.blogspot.com/2022/10/ingesting-indicators-of-compromise-with.html&quot;&gt;one post&lt;/a&gt; in personal blog, writing a lot of code for ingestion &amp;amp; enrichment of different data sources (not open yet), helping customers to build cybersecurity lakehouses, …  Cybersecurity is a big data area, where Apache Spark/Databricks are a natural fit.
&lt;/p&gt;

&lt;p&gt;
There were many other things that happened during this interesting year - it&#39;s a pleasure to work surrounded by many talented colleagues, and I&#39;m looking with hope into the next year.
&lt;/p&gt;

&lt;p&gt;
Happy New Year!
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/7600517521325239909/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/7600517521325239909' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7600517521325239909'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7600517521325239909'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2022/12/looking-back-to-2022nd.html' title='Looking back to 2022nd'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjyVAT7NFRXgW7Lz6ZfEHVZmLkbdCCj4UIgcE3qx_bkB9YoeB-EdSNvV4Qo3StVF_Cx57-Wr8kYM2v40nfDIdbsk-VbBMyFGodXuZkLkMCQO1Dh4reOAdxWuKc-Sk5OHCncueWEdp7YIT1cuzTDY-gU2_0z8X9xJpZLYuMnVRGwpyxutN-tVB4/s72-w640-h162-c/%D0%A1%D0%BD%D0%B8%D0%BC%D0%BE%D0%BA%20%D1%8D%D0%BA%D1%80%D0%B0%D0%BD%D0%B0%202022-12-31%20%D0%B2%2015.03.54.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-2382413987121046555</id><published>2022-12-22T15:41:00.006+01:00</published><updated>2025-07-03T09:07:37.410+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="cicd"/><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="delta live tables"/><category 
scheme="http://www.blogger.com/atom/ns#" term="devops"/><category scheme="http://www.blogger.com/atom/ns#" term="dlt"/><category scheme="http://www.blogger.com/atom/ns#" term="testing"/><title type='text'>Delta Live Tables recipes: implementing unit &amp; integration tests, and doing CI/CD</title><content type='html'>&lt;p&gt;The extended &amp;amp; updated version of this blog post is &lt;a href=&quot;https://www.databricks.com/blog/applying-software-development-devops-best-practices-delta-live-table-pipelines&quot;&gt;published on the Databricks blog&lt;/a&gt;.

&lt;/p&gt;

</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/2382413987121046555/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/2382413987121046555' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/2382413987121046555'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/2382413987121046555'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2022/12/delta-live-tables-recipes-implementing.html' title='Delta Live Tables recipes: implementing unit &amp; integration tests, and doing CI/CD'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-597332594309965720</id><published>2022-11-25T18:01:00.003+01:00</published><updated>2022-12-20T09:14:22.442+01:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="terraform"/><title type='text'>Cloud-agnostic resources deployment with Databricks Terraform Provider</title><content type='html'>&lt;p&gt;
&lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs&quot;&gt;Databricks Terraform Provider&lt;/a&gt; includes a number of the data sources that greatly simplify creation of portable Terraform templates.  There are few classes of data sources related to compute, user &amp;amp; group management, and other topics.  In practice, the most often used data sources are:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;&lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/node_type&quot;&gt;databricks_node_type&lt;/a&gt; together with &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/spark_version&quot;&gt;databricks_spark_version&lt;/a&gt; allow to define jobs, clusters, instance pools &amp;amp; DLT pipelines that are cloud agnostic.&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/current_user&quot;&gt;databricks_current_user&lt;/a&gt; allows to avoid hard coding of paths to notebooks in jobs &amp;amp; DLT pipelines, so it&#39;s easy to move resources between environments, or avoid name conflicts - for example, during development a job or a DLT pipeline could be created for each of the developers, pointing to a notebook for the given user, while in the production environment this job or DLT pipeline will be owned by a service principal.&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/group&quot;&gt;databricks_group&lt;/a&gt; is heavily used to refer to predefined user groups, such as, &lt;code&gt;admins&lt;/code&gt; or &lt;code&gt;users&lt;/code&gt;, for example, when setting permissions to specific resources, or when adding users as workspace administrators (you can find examples in the documentation).&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
Let&#39;s look at how &lt;code&gt;databricks_node_type&lt;/code&gt;, &lt;code&gt;databricks_spark_version&lt;/code&gt;, and &lt;code&gt;databricks_current_user&lt;/code&gt; could be used to create cloud agnostic Terraform templates. When you work with multiple clouds and define jobs or clusters, you need to specify node type - name of the instance type that will be used to run your code.  The problem is that these names are cloud specific, and in some cases people resolve to ugly code like &lt;code&gt;node_type_id = (var.cloud == &quot;aws&quot;) ? &quot;c5d.2xlarge&quot; : (var.cloud == &quot;azure&quot; ? &quot;Standard_F8s&quot; : &quot;c2-standard-8&quot;)&lt;/code&gt; that is hard to read &amp;amp; support (and it will break if Databricks will add support for another cloud). Also, you need to specify a Databricks Runtime (DBR) version that you want to use (the &lt;code&gt;spark_version&lt;/code&gt; parameter in cluster definition) that consists of several pieces: version itself, is it ML runtime or not, is it ML runtime for GPU or CPU, is it Photon-optimized, is it long term support version (LTS) or not, etc., for example, &lt;code&gt;11.3.x-cpu-ml-scala2.12&lt;/code&gt; or &lt;code&gt;11.3.x-photon-scala2.12&lt;/code&gt;.  Also, new versions are released regularly, and if you want to have clusters/jobs to run on the latest version, you may need to update your Terraform code after each release of new runtimes.
&lt;/p&gt;

&lt;p&gt;
And use of &lt;code&gt;databricks_node_type&lt;/code&gt; and &lt;code&gt;databricks_spark_version&lt;/code&gt; solves these problems:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;you parameterize &lt;code&gt;databricks_node_type&lt;/code&gt; by specifying what is the minimal number of cores required per node, how much memory should be per core, should it have GPU or not, category (compute or memory optimized, …), and many other parameters described in the &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/node_type&quot;&gt;documentation&lt;/a&gt;.  When executing, Databricks Terraform provider fetches the list available node types via REST API, and finds a node matching your parameters that you can use in the cluster/job definition (Warning: sometimes it can&#39;t find it if you have incompatible requirements).&lt;/li&gt;
&lt;li&gt;similarly, you tell &lt;code&gt;databricks_spark_version&lt;/code&gt; to search a DBR version matching your requirements: ML or not, with Photon or not, etc. - see &lt;a href=&quot;https://registry.terraform.io/providers/databricks/databricks/latest/docs/data-sources/spark_version&quot;&gt;documentation&lt;/a&gt; for full list.  Similarly, when invoked, Terraform provider will call corresponding REST API, and find a specific version matching your requirements (or not find, if you specify incorrect combination, like, Photon + ML).&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
Let&#39;s look at the specific example - deployment of a Databricks job that will execute a notebook on a job cluster. Full source code is &lt;a href=&quot;https://github.com/alexott/terraform-playground/tree/main/cloud-agnostic&quot;&gt;available on GitHub&lt;/a&gt;. It also demonstrates the use of &lt;code&gt;databricks_current_user&lt;/code&gt; data source to create user-specific name for a job, and deploy a notebook into the user&#39;s directory.
&lt;/p&gt;

&lt;p&gt;
First let&#39;s select the corresponding node type for our job - here I want a node that has a local disk, has at least 8 cores, and it&#39;s compute optimized:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-terraform&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: darkslateblue;&quot;&gt;data&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;&quot;databricks_node_type&quot;&lt;/span&gt; &lt;span style=&quot;color: #cd661d;&quot;&gt;&quot;this&quot;&lt;/span&gt; {&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;local_disk&lt;/span&gt;            =&lt;span style=&quot;color: darkcyan;&quot;&gt; true&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;min_cores&lt;/span&gt;             = 8&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;category&lt;/span&gt;              = &lt;span style=&quot;color: #008b00;&quot;&gt;&quot;Compute Optimized&quot;&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
I also want to use latest Databricks ML Runtime with long term support:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-terraform&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: darkslateblue;&quot;&gt;data&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;&quot;databricks_spark_version&quot;&lt;/span&gt; &lt;span style=&quot;color: #cd661d;&quot;&gt;&quot;latest_lts&quot;&lt;/span&gt; {&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;long_term_support&lt;/span&gt; =&lt;span style=&quot;color: darkcyan;&quot;&gt; true&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: sienna;&quot;&gt;ml&lt;/span&gt;                =&lt;span style=&quot;color: darkcyan;&quot;&gt; true&lt;/span&gt;&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
Then I just refer to those data sources in my job definition:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-terraform&quot;&gt;&lt;code&gt;&lt;span style=&quot;color: darkslateblue;&quot;&gt;resource&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;&quot;databricks_job&quot;&lt;/span&gt; &lt;span style=&quot;color: #cd661d;&quot;&gt;&quot;this&quot;&lt;/span&gt; {&lt;/code&gt;
&lt;code&gt;  ...&lt;/code&gt;
&lt;code&gt;  &lt;span style=&quot;color: #0000ee;&quot;&gt;new_cluster&lt;/span&gt; {&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;num_workers&lt;/span&gt;   = 1&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;spark_version&lt;/span&gt; = data.databricks_spark_version.latest_lts.id&lt;/code&gt;
&lt;code&gt;    &lt;span style=&quot;color: sienna;&quot;&gt;node_type_id&lt;/span&gt;  = data.databricks_node_type.this.id&lt;/code&gt;
&lt;code&gt;  }&lt;/code&gt;
&lt;code&gt;  ...&lt;/code&gt;
&lt;code&gt;}&lt;/code&gt;
&lt;code&gt;&lt;/code&gt;
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
That&#39;s all! 
&lt;/p&gt;

&lt;p&gt;
Let&#39;s see what happens if I execute that code on Azure, and then compare results with AWS &amp;amp; GCP.  After the job is created, let&#39;s look into the job cluster definition:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjMU6zowBhui3CgKAq08WZWQSgxXE6Ni1TPtQUdg89y3GEbQfEP2r9YeSXqhVY7vobYwO2HpCwngXPrwb0Yv65d4kbK6YI-qC_jmq9frPhD14DfmBZgy2M4EakbL1rkUFwXc4mA3kbe_LrPeHusevD26jyn4wSVTnGi-wcaVglyIh8qJfFxjAk/s807/Screenshot%202022-11-25%20at%2016.19.30.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;793&quot; data-original-width=&quot;807&quot; height=&quot;629&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjMU6zowBhui3CgKAq08WZWQSgxXE6Ni1TPtQUdg89y3GEbQfEP2r9YeSXqhVY7vobYwO2HpCwngXPrwb0Yv65d4kbK6YI-qC_jmq9frPhD14DfmBZgy2M4EakbL1rkUFwXc4mA3kbe_LrPeHusevD26jyn4wSVTnGi-wcaVglyIh8qJfFxjAk/w640-h629/Screenshot%202022-11-25%20at%2016.19.30.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
As we can see, Terraform provider has selected the &lt;code&gt;Standard_F8s&lt;/code&gt; instance type (compute optimized, with 8 cores), and selected &lt;code&gt;11.3.x-cpu-ml-scala2.12&lt;/code&gt; as runtime version (latest LTS version with ML support for execution on nodes without GPU).
&lt;/p&gt;

&lt;p&gt;
If you execute the same code on AWS, runtime version won&#39;t change, but we&#39;ll get &lt;code&gt;c5d.2xlarge&lt;/code&gt; as the node type:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEixIUkXnuyCThS_AOZ7az7z13llYz_Uxg_zPcbtrTSgKoRpRTKDQLgrfrghY1fwXU0jqLK5nRg9hVduV8smspe5hYdq_Dpm1nfI2L4FgiVF5Q9BRjBvj6qDr9mIzqv2aUP9kpeL_ZHNgLPju3BfJt4cf-XPNkMrvBag47iMVATfwK9ReLs0vaw/s636/Screenshot%202022-11-25%20at%2016.21.49.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;264&quot; data-original-width=&quot;636&quot; height=&quot;266&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEixIUkXnuyCThS_AOZ7az7z13llYz_Uxg_zPcbtrTSgKoRpRTKDQLgrfrghY1fwXU0jqLK5nRg9hVduV8smspe5hYdq_Dpm1nfI2L4FgiVF5Q9BRjBvj6qDr9mIzqv2aUP9kpeL_ZHNgLPju3BfJt4cf-XPNkMrvBag47iMVATfwK9ReLs0vaw/w640-h266/Screenshot%202022-11-25%20at%2016.21.49.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
And if we do the same on GCP, the node type will change to the &lt;code&gt;c2-standard-8&lt;/code&gt; (have you noticed that this node has 32GB of RAM instead of 16GB on Azure &amp;amp; AWS? This happens because there was no other node with a smaller amount of memory):&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
 &lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgZleU9ktuhXMm-A3GIgaF_13QHUFpBspzvEydyt6M_bO6XKsj56yKHKXIyzUR7sFIJQH8aJ4X_BP2C8myzSD5Idsi48c8FSA8ZM8mProj6ElDJXSggAccnFWvNTLHRee20rRpL2zgrIUj7FaqDRfSrGkDD1V41i28_F3Yse5pEvLkLQs4HTW0/s717/Screenshot%202022-11-25%20at%2016.23.12.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;220&quot; data-original-width=&quot;717&quot; height=&quot;196&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgZleU9ktuhXMm-A3GIgaF_13QHUFpBspzvEydyt6M_bO6XKsj56yKHKXIyzUR7sFIJQH8aJ4X_BP2C8myzSD5Idsi48c8FSA8ZM8mProj6ElDJXSggAccnFWvNTLHRee20rRpL2zgrIUj7FaqDRfSrGkDD1V41i28_F3Yse5pEvLkLQs4HTW0/w640-h196/Screenshot%202022-11-25%20at%2016.23.12.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
This blog post demonstrated that it&#39;s really easy to create Terraform code for Databricks that is easy to use on different clouds, and also avoid updating your code when new runtime versions are released.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/597332594309965720/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/597332594309965720' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/597332594309965720'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/597332594309965720'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2022/11/cloud-agnostic-resources-deployment.html' title='Cloud-agnostic resources deployment with Databricks Terraform Provider'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjMU6zowBhui3CgKAq08WZWQSgxXE6Ni1TPtQUdg89y3GEbQfEP2r9YeSXqhVY7vobYwO2HpCwngXPrwb0Yv65d4kbK6YI-qC_jmq9frPhD14DfmBZgy2M4EakbL1rkUFwXc4mA3kbe_LrPeHusevD26jyn4wSVTnGi-wcaVglyIh8qJfFxjAk/s72-w640-h629-c/Screenshot%202022-11-25%20at%2016.19.30.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-5911880544375363093</id><published>2022-10-21T12:33:00.000+02:00</published><updated>2022-10-21T12:33:02.942+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="cybersecurity"/><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="delta lake"/><title 
type='text'>Ingesting indicators of compromise with Filebeat, Azure Event Hubs &amp; Delta Lake on Databricks</title><content type='html'>&lt;p&gt;
In cybersecurity, an &lt;a href=&quot;https://en.wikipedia.org/wiki/Indicator_of_compromise&quot;&gt;Indicator of Compromise (IoC)&lt;/a&gt; is a very important piece of information that is observed on a network or in an operating system that usually indicates a computer intrusion.  Typical IoCs are things like file hashes, URLs/domains/IPs of botnet command &amp;amp; control servers, etc. Having this information we can use it to perform the real-time matching of logs &amp;amp; other data against known IoCs, or to perform investigations against historical data.  There are multiple data formats that are used to exchange information about IoCs that allow sharing this information between different parties - there are open threat exchange platforms, but there are also a few security vendors that provide high quality, curated threat feeds.
&lt;/p&gt;

&lt;p&gt;
As mentioned above, when it comes to use of IoC data we typically have two distinct use cases:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;matching IoCs against new data - this usually happens in the real-time or near-real-time fashion against the streaming data, and generated alerts are kicking-in the investigation process.  To minimize the time between generation of events/logs and generation of alerts, our tool should support efficient lookup in the IoC data.&lt;/li&gt;
&lt;li&gt;matching IoCs against historical data - typically this happens as part of the incident response process, when analysts are looking into previous activity in light of the new data.  In this case the tool should be able efficiently process huge amounts of historical data joined with IoC data.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
&lt;a href=&quot;https://spark.apache.org/&quot;&gt;Apache Spark&lt;/a&gt; in combination with &lt;a href=&quot;https://delta.io/&quot;&gt;Delta Lake&lt;/a&gt; as underlying file storage format is a perfect combination that is able to handle both of these use cases very efficiently - Spark &amp;amp; Delta support both streaming &amp;amp; batch workloads using the same code, so you don&#39;t need to duplicate the IoC data, or write different code for each of the use cases.   Additional efficiency when working with historical data could come from use of &lt;a href=&quot;https://www.databricks.com/product/databricks-sql&quot;&gt;Databricks SQL&lt;/a&gt; that allows to process big amounts of data faster due use of the &lt;a href=&quot;https://www.databricks.com/product/photon&quot;&gt;Photon engine&lt;/a&gt;.
&lt;/p&gt;

&lt;p&gt;
To make IoC data available for use we need to perform two tasks:
&lt;/p&gt;

&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;&lt;b&gt;Collect IoC data&lt;/b&gt;.  When you need to receive IoC data from the threat feeds, you usually need to scrap some REST API or something like that - this task often needs a custom code. But for popular threat exchange platforms there is an easier way to do that - you can simply use the &lt;a href=&quot;https://www.elastic.co/guide/en/beats/filebeat/current/filebeat-module-threatintel.html&quot; target=&quot;_blank&quot;&gt;Threat Intel&lt;/a&gt; module of the &lt;a href=&quot;https://www.elastic.co/beats/filebeat&quot; target=&quot;_blank&quot;&gt;Elastic Filebeat&lt;/a&gt; - very popular, lightweight tool for shipping log data to Elasticsearch and other destinations, such as, Apache Kafka (or &lt;a href=&quot;https://azure.microsoft.com/en-us/products/event-hubs/#overview&quot; target=&quot;_blank&quot;&gt;Azure Event Hubs&lt;/a&gt; that also supports Apache Kafka protocol).&lt;/li&gt;

&lt;li&gt;&lt;b&gt;Make collected data available for consumption&lt;/b&gt;. Usually data from different threat exchange platforms come in slightly different formats, also depending on what kind of IoCs are reported (domain names vs. IPs vs. file hashes, etc.).  To access IoC data efficiently we need to transform them into a unified format.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
The rest of the article describes these two steps in more detail.
&lt;/p&gt;

&lt;h3 id=&quot;org38d7f57&quot;&gt;Setting up &amp;amp; running the Filebeat to ingest IoC data&lt;/h3&gt;
&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Setting up Filebeat to output to &lt;a href=&quot;https://azure.microsoft.com/en-us/products/event-hubs/#overview&quot; target=&quot;_blank&quot;&gt;Azure Event Hubs&lt;/a&gt; - it&#39;s easy to configure Filebeat to ingest data into Event Hubs (a full example of the config file can be found &lt;a href=&quot;https://gist.github.com/alexott/3367e6757f8094ba4398b4f59fcb7887&quot; target=&quot;_blank&quot;&gt;here&lt;/a&gt;):

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;we need to make sure that we disable &lt;code&gt;output.elasticsearch&lt;/code&gt; and &lt;code&gt;output.logstash&lt;/code&gt; blocks in the &lt;code&gt;filebeat.yaml&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;and we need to modify the &lt;code&gt;output.kafka&lt;/code&gt; block as shown below, replacing values in the &lt;code&gt;&amp;lt;&amp;gt;&lt;/code&gt; with actual values:
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;&lt;code&gt;eh-namespace&lt;/code&gt; in the &lt;code&gt;hosts&lt;/code&gt; is the name of your EventHubs namespace&lt;/li&gt;
&lt;li&gt;for authentication we&#39;re using &lt;a href=&quot;https://learn.microsoft.com/en-us/azure/event-hubs/authorize-access-shared-access-signature&quot; target=&quot;_blank&quot;&gt;Shared Access Signature&lt;/a&gt; that you need to copy from Azure Portal (or get via command-line/terraform) - you need to put it into the &lt;code&gt;password&lt;/code&gt; field.   The value of &lt;code&gt;username&lt;/code&gt; is fixed and equal to &lt;code&gt;$ConnectionString&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;set value of &lt;code&gt;topic&lt;/code&gt; field to the name of EventHubs topic into which we&#39;ll ingest the data&lt;/li&gt;
&lt;li&gt;the rest of the fields should have the fixed values as specified below.&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;

&lt;pre class=&quot;src src-yaml&quot;&gt;&lt;span style=&quot;color: sienna;&quot;&gt;output.kafka&lt;/span&gt;:
  &lt;span style=&quot;color: sienna;&quot;&gt;hosts&lt;/span&gt;: [&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;&amp;lt;eh-namespace&amp;gt;.servicebus.windows.net:9093&quot;&lt;/span&gt;]
  &lt;span style=&quot;color: sienna;&quot;&gt;sasl.mechanism&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;PLAIN&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;username&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;$ConnectionString&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;password&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;Endpoint=sb://&amp;lt;eh-namespace&amp;gt;.servicebus.windows.net/...&quot;&lt;/span&gt;

  &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;message topic selection + partitioning&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;topic&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;&amp;lt;topic-name&amp;gt;&#39;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;partition.round_robin&lt;/span&gt;:
    &lt;span style=&quot;color: sienna;&quot;&gt;reachable_only&lt;/span&gt;: &lt;span style=&quot;color: darkcyan;&quot;&gt;false&lt;/span&gt;

  &lt;span style=&quot;color: sienna;&quot;&gt;required_acks&lt;/span&gt;: 1
  &lt;span style=&quot;color: sienna;&quot;&gt;compression&lt;/span&gt;: none
  &lt;span style=&quot;color: sienna;&quot;&gt;ssl.enabled&lt;/span&gt;: &lt;span style=&quot;color: darkcyan;&quot;&gt;true&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;max_message_bytes&lt;/span&gt;: 1000000
&lt;/pre&gt;

&lt;li&gt;Enable the Threat Intel module - that&#39;s also a very easy task:

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;in the &lt;code&gt;filebeat.yaml&lt;/code&gt; make sure that all subsections inside the &lt;code&gt;filebeat.inputs&lt;/code&gt; are commented out.&lt;/li&gt;
&lt;li&gt;we need to enable &lt;code&gt;threatintel&lt;/code&gt; module by renaming the &lt;code&gt;modules.d/threatintel.yml.disabled&lt;/code&gt; to &lt;code&gt;modules.d/threatintel.yml&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;edit &lt;code&gt;modules.d/threatintel.yml&lt;/code&gt; to enable specific integrations.  In the current article we&#39;re using the following feeds: &lt;code&gt;abuseurl&lt;/code&gt;, &lt;code&gt;abusemalware&lt;/code&gt; &amp;amp; &lt;code&gt;malwarebazaar&lt;/code&gt; from &lt;a href=&quot;https://abuse.ch&quot; target=&quot;_blank&quot;&gt;Abuse.ch&lt;/a&gt;, and &lt;code&gt;otx&lt;/code&gt; from &lt;a href=&quot;https://otx.alienvault.com/&quot; target=&quot;_blank&quot;&gt;AlienVault OTX&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;&lt;/li&gt;

&lt;li&gt;&lt;a href=&quot;https://www.elastic.co/guide/en/beats/filebeat/current/filebeat-starting.html&quot; target=&quot;_blank&quot;&gt;Start Filebeat&lt;/a&gt; - of course we can run &lt;code&gt;filebeat&lt;/code&gt; on a personal machine, but because it needs to run all the time, it could be easier to run it in the cloud, where we can use something like &lt;code&gt;Standard B1ls&lt;/code&gt; (on Azure) that has enough memory to run the Filebeat process, and it will cost you less than $4/month.&lt;/li&gt;
&lt;/ol&gt;


&lt;h3 id=&quot;orgf295280&quot;&gt;Processing collected IoC data&lt;/h3&gt;
&lt;p&gt;
The previous section described how we can make IoC data published, but now we need to read them, and make them available for direct use.  To do it we need to take several things into consideration when implementing data processing:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Filebeat&#39;s Threat Intel module periodically loads data from the specified REST API endpoints, but it doesn&#39;t perform de-duplication of the data - if there are no changes in the API output, it still writes collected data into a configured sink.  The solution for this is to generate a hash of the actual payload &amp;amp; discard all duplicate events that have the same hash.&lt;/li&gt;
&lt;li&gt;Different threat feeds use different data formats, and we need to perform normalization - use the same field names, expand different hashes of the same file into individual rows for easier matching, etc.&lt;/li&gt;
&lt;li&gt;The same IoC may come via different threat feeds.  There are different ways of handling this - ignore duplicates, merge data from multiple providers, etc.  For simplicity I selected the first method - ignore duplicate submissions.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
The implementation itself is quite straightforward and follows the standard &lt;a href=&quot;https://www.databricks.com/glossary/medallion-architecture&quot; target=&quot;_blank&quot;&gt;medallion architecture&lt;/a&gt; (full source code is on &lt;a href=&quot;https://github.com/alexott/databricks-cybersecurity-playground/tree/main/iocs-ingest&quot; target=&quot;_blank&quot;&gt;GitHub&lt;/a&gt;):
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Raw data are ingested from Event Hubs into a bronze layer without much modification - we add a hash of the actual payload that is used to detect duplicates, extracting the threat feed name (the &lt;code&gt;dataset&lt;/code&gt; column), and also adding a &lt;code&gt;date&lt;/code&gt; column that is used for data partitioning.  By keeping the raw data intact we&#39;ll be able to reprocess them if necessary, or add handling of new threat feeds later.&lt;/li&gt;
&lt;li&gt;Actual data transformation happens when we ingest data into a silver layer.  The code consists of the few functions that perform decoding and normalization of data for specific threat feeds (datasets) - this data then is written into a single Delta Lake table that then is used for streaming &amp;amp; batch processing.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
The current implementation uses &lt;a href=&quot;https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html&quot; target=&quot;_blank&quot;&gt;Spark Structured Streaming&lt;/a&gt;, but right now it runs as a batch-like job using Trigger.Once that is triggered several times per day using &lt;a href=&quot;https://www.databricks.com/blog/2022/05/10/introducing-databricks-workflows.html&quot; target=&quot;_blank&quot;&gt;Databricks Workflows&lt;/a&gt;, which looks as follows:
&lt;/p&gt;

&lt;p&gt;&lt;/p&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/a/AVvXsEiAsTV-6x50qlffUcpEDLiOMkJU5n_BDTlzCyxI36jLA7plSw3nK44VRQnjjpkwt26gPt6IQXSPQ4NEHG5fAV4wwiuC3i0pKWhyvzcjwypFCYBtet1tMHBWuKGBPDJQhmYfqHy5ZbBpqYF1iKA2mAH93A2JbPxISkc5ZSlFgNgXoNl0AIVSG0E&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img alt=&quot;&quot; data-original-height=&quot;573&quot; data-original-width=&quot;413&quot; height=&quot;400&quot; src=&quot;https://blogger.googleusercontent.com/img/a/AVvXsEiAsTV-6x50qlffUcpEDLiOMkJU5n_BDTlzCyxI36jLA7plSw3nK44VRQnjjpkwt26gPt6IQXSPQ4NEHG5fAV4wwiuC3i0pKWhyvzcjwypFCYBtet1tMHBWuKGBPDJQhmYfqHy5ZbBpqYF1iKA2mAH93A2JbPxISkc5ZSlFgNgXoNl0AIVSG0E=w289-h400&quot; width=&quot;289&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;p&gt;&lt;/p&gt;


&lt;p&gt;
To reach the best performance when working with collected IoC data we need to have the correct data layout.  In the current implementation, the silver table has following structure (only main columns are listed):
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;&lt;code&gt;dataset&lt;/code&gt; (string) - from which threat feed we got this IoC.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;ioc_type&lt;/code&gt; (string) - IoC type (possible values are &lt;code&gt;URL&lt;/code&gt;, &lt;code&gt;domain&lt;/code&gt;, &lt;code&gt;hostname&lt;/code&gt;, &lt;code&gt;IPv4&lt;/code&gt;, and different file hashes in form of &lt;code&gt;FileHash-&amp;lt;hash-type&amp;gt;&lt;/code&gt;).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;ioc&lt;/code&gt; (string) - actual IoC value, depending on the IoC type (hash/IP/…).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;first_seen&lt;/code&gt; (timestamp) - when a given IoC was first reported.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;last_seen&lt;/code&gt; (timestamp) - when a given entry was seen last time (please note that not all threat feeds report it).&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
Based on the target schema of the silver table, we can use following techniques to get best performance when working with IoC data:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;partition table by the &lt;code&gt;ioc_type&lt;/code&gt; column, so we&#39;ll read only specific data when matching specific IoC types.&lt;/li&gt;
&lt;li&gt;index the &lt;code&gt;first_seen&lt;/code&gt; &amp;amp; &lt;code&gt;last_seen&lt;/code&gt; columns so we can get advantage of the &lt;a href=&quot;https://docs.databricks.com/delta/file-mgmt.html#data-skipping-1&quot; target=&quot;_blank&quot;&gt;data skipping&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://docs.databricks.com/delta/file-mgmt.html#z-ordering-multi-dimensional-clustering&quot; target=&quot;_blank&quot;&gt;Z-Order&lt;/a&gt; data by &lt;code&gt;first_seen&lt;/code&gt; column to make data skipping even more efficient.  This is done by a maintenance task.&lt;/li&gt;
&lt;li&gt;create a &lt;a href=&quot;https://docs.databricks.com/optimizations/bloom-filters.html&quot; target=&quot;_blank&quot;&gt;bloom filter&lt;/a&gt; (currently Databricks-only) for the &lt;code&gt;ioc&lt;/code&gt; column to make joins &amp;amp; point lookups more efficient.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3 id=&quot;org043aad8&quot;&gt;&lt;span class=&quot;section-number-4&quot;&gt;1.1.3.&lt;/span&gt; Use the collected IoC data&lt;/h3&gt;
&lt;p&gt;
After we prepared our IoC data, it&#39;s really easy to use them - we just need to perform a join between a dataframe with data (from stream or batch read) and our IoC table - we only need to make sure that we have input data in the correct format (the &lt;code&gt;ioc_type&lt;/code&gt; should specify type of entry (IP/file hash/…), &lt;code&gt;ioc&lt;/code&gt; - value to check, and &lt;code&gt;timestamp&lt;/code&gt; - when the event happened):
&lt;/p&gt;

&lt;pre class=&quot;src src-python&quot;&gt;&lt;span style=&quot;color: sienna;&quot;&gt;data&lt;/span&gt; = &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;input dataframe&lt;/span&gt;
&lt;span style=&quot;color: sienna;&quot;&gt;iocs&lt;/span&gt; = &lt;span style=&quot;color: #7f7f7f;&quot;&gt;# &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;dataframe with IoC data&lt;/span&gt;
&lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; = data.join(iocs, (
       (data.ioc_type == iocs.ioc_type) &amp;amp; (data.ioc == iocs.ioc) &amp;amp;
       (data.timestamp &amp;gt;= iocs.first_seen) &amp;amp;
       (data.timestamp &amp;lt;= F.coalesce(iocs.last_seen, F.current_timestamp())))) \
    .drop(data.ioc_type).drop(data.ioc)
&lt;/pre&gt;

&lt;p&gt;
And that&#39;s all.  It took less than 200 lines of Python code to implement ingestion &amp;amp; normalization of the data for four threat feeds, and then use this data to detect potential security incidents.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/5911880544375363093/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/5911880544375363093' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5911880544375363093'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5911880544375363093'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2022/10/ingesting-indicators-of-compromise-with.html' title='Ingesting indicators of compromise with Filebeat, Azure Event Hubs &amp; Delta Lake on Databricks'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/a/AVvXsEiAsTV-6x50qlffUcpEDLiOMkJU5n_BDTlzCyxI36jLA7plSw3nK44VRQnjjpkwt26gPt6IQXSPQ4NEHG5fAV4wwiuC3i0pKWhyvzcjwypFCYBtet1tMHBWuKGBPDJQhmYfqHy5ZbBpqYF1iKA2mAH93A2JbPxISkc5ZSlFgNgXoNl0AIVSG0E=s72-w289-h400-c" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-7828601133081329627</id><published>2022-08-13T14:51:00.006+02:00</published><updated>2022-08-13T14:51:49.880+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><title type='text'>Reflecting on two years at Databricks...</title><content type='html'>&lt;p&gt;This Wednesday, 10th of August, was my second anniversary of working at 
Databricks.&amp;nbsp; Initially I planned to write this blog post on that day, but as usual, started to dig into customer work, and remembered about it only in the evening, after I went away from the keyboard.&lt;/p&gt;
&lt;p&gt;I joined the Databricks professional services team in the middle of the pandemic year. All interviews were done remotely (it became normal by that time), and I was really impressed by the people who did the interviews - there were deep technical and non-technical questions, but it wasn&#39;t something done to demonstrate superiority (I&#39;ve seen such things previously). People were really excited to talk about the position, working at Databricks, etc. There were multiple reasons to join Databricks:&lt;/p&gt;

&lt;ul style=&quot;text-align: left;&quot;&gt;
  &lt;li&gt;I have always liked Apache Spark since I started using it in early 2015, and the possibility of working in the company behind Spark was really exciting. (before that, Spark was one of the decision points when I thought about joining DataStax...)&lt;/li&gt;
  &lt;li&gt;culture - few of my colleagues from DataStax were already working at Databricks, and I heard many stories about company&#39;s culture&lt;/li&gt;
  &lt;li&gt;the company was (and still is) growing at a fast pace, and besides Spark, there were many other interesting products in the portfolio. And I especially wanted to get deeper into machine learning.&lt;/li&gt;
  &lt;li&gt;remote position - as a SaaS product, the amount of (potential) travel is much lower compared to the products that aren&#39;t cloud-based.&amp;nbsp; Really, as of right now, I didn&#39;t have any work-related trips, although I had a possibility of working with many customers across the whole Europe - all not leaving the comfort of my home office setup.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;During the first weeks, as I was going through the onboarding trainings, I started to get to know a wider team - not only the direct colleagues, but people from other geo locations, and different departments - product &amp;amp; engineering, pre-sales solutions architects, ... And these interactions were confirming the stories that I&#39;ve heard previously about company culture - there are a lot of very smart, but humble people, they are ready to help when you have questions or problems (especially during the onboarding), they are open for suggestions, you can reach people across org boundaries, discuss something with high management, ...&amp;nbsp; And it keeps the same after two years, even though the company grew very significantly (when I joined we had less than 1,500 across the globe, and now we&#39;re close to 4,000).&lt;/p&gt;

&lt;p&gt;The pace of the product development inside the company is very high - looking back, I can see how many things were added or heavily changed even since the last year, not even talking about two years ago. Databricks SQL, Delta Live Tables, Databricks Repos, Unity Catalog, just to name a few - these things are making life of our customers easier, allowing them to concentrate on solving their business problems, not trying to reinvent the wheel of running Spark &amp;amp; other things themselves.&amp;nbsp; This makes work very interesting, although sometimes you can feel a kind of information overload, when you&#39;re trying to cover all areas of your interest.&lt;/p&gt;
&lt;p&gt;Often, when I&#39;m talking with people outside of the Databricks, they have an impression that my work is primarily around Spark (data engineering) and machine learning. &amp;nbsp; But reality is quite different - reliable &amp;amp; scalable data engineering and machine learning aren&#39;t possible until you have a solid foundation of automation (cloud infra/data/ml/dev ops), security/compliance, and related things.&amp;nbsp; As result, a big chunk of our work is spent around deployment planning (for Databricks and other cloud infrastructure), security, building CI/CD pipelines, and related topics. These things are the base on which customer&#39;s teams can build their data and machine learning products.&amp;nbsp; And, almost from the beginning, I&#39;ve started to contribute to the &lt;a href=&quot;https://github.com/databricks/terraform-provider-databricks&quot; target=&quot;_blank&quot;&gt;Databricks Terraform provider&lt;/a&gt; that is used by the significant number of Databricks customers to automate their deployments. 
And I want specifically mention &lt;a href=&quot;https://github.com/nfx&quot; target=&quot;_blank&quot;&gt;Serge Smertin&lt;/a&gt; who leads the development effort of terraform provider (and many other projects) - I learned many new things from him, and always was amused by his relentless push for making things powerful but easy to use.&amp;nbsp; With the similar goal of helping customers to automate, I&#39;ve started to contribute to Apache Airflow, so now it&#39;s not only possible just to run Databricks jobs, but you can query the data, import new data sets, and do many other things using the Databricks SQL.&amp;nbsp; And besides of this, there were many other things done that allow to simplify work with Databricks, for example, &lt;a href=&quot;https://github.com/alexott/databricks-nutter-repos-demo&quot; target=&quot;_blank&quot;&gt;testing of code in Databricks notebooks with Nutter&lt;/a&gt;, a lot of code snippets demonstrating different aspects of the platform (check &lt;a href=&quot;https://github.com/alexott/spark-playground&quot; target=&quot;_blank&quot;&gt;spark-playground&lt;/a&gt; &amp;amp; &lt;a href=&quot;https://github.com/alexott/databricks-playground&quot; target=&quot;_blank&quot;&gt;databricks-playground&lt;/a&gt; repositories on GitHub), etc.&lt;/p&gt;

&lt;p&gt;Working in a quickly growing company gives you a lot of possibilities to contribute to its success. These contributions could come in the different forms, like, sharing knowledge internally (in form of SME groups, presentations, creating new workshops, ...) &amp;amp; externally (i.e., I published &lt;a href=&quot;https://www.databricks.com/blog/author/alex-ott&quot; target=&quot;_blank&quot;&gt;three blog posts&lt;/a&gt; in the company&#39;s blog, answering on Databricks Community &amp;amp; StackOverflow), working closely with product &amp;amp; engineering on new functionality, simplifying internal processes, contributing to open source, ...&amp;nbsp; But most important is that these contributions are recognized, allowing career growth, switching to a new role if you want to try a new area, etc.&lt;/p&gt;

&lt;p&gt;All these things could be summarised as follows - decision to join Databricks was one of the best decisions so far, and I&#39;m looking forward to more things happening there...&lt;/p&gt;</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/7828601133081329627/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/7828601133081329627' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7828601133081329627'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/7828601133081329627'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2022/08/reflecting-on-two-years-at-databricks.html' title='Reflecting on two years at Databricks...'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-1374007020476928648</id><published>2022-06-19T16:30:00.004+02:00</published><updated>2022-06-19T18:34:25.872+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="delta live tables"/><category scheme="http://www.blogger.com/atom/ns#" term="dlt"/><category scheme="http://www.blogger.com/atom/ns#" term="eventhubs"/><category scheme="http://www.blogger.com/atom/ns#" term="kafka"/><title type='text'>Delta Live Tables recipes: Consuming from Azure Event Hubs</title><content 
type='html'>&lt;p&gt;
&lt;a href=&quot;https://docs.databricks.com/data-engineering/delta-live-tables/index.html&quot;&gt;Databricks Delta Live Tables&lt;/a&gt; (DLT) is a new framework from Databricks aimed at simplifying building reliable &amp;amp; maintainable data processing pipelines.  With this framework developers are concentrating on writing data transformations themselves, linking them together, and Delta Live Tables handles task orchestration, cluster management,  error handling, monitoring, and data quality.   Delta Live Tables supports both batch &amp;amp; streaming workloads, supporting all data formats &amp;amp; input sources included in the Databricks Runtime (DBR).
&lt;/p&gt;

&lt;p&gt;
On Azure, &lt;a href=&quot;https://docs.microsoft.com/en-us/azure/event-hubs/&quot;&gt;Event Hubs&lt;/a&gt;&amp;nbsp;(often spelled as EventHubs) is a popular solution for events transportation, similar to Apache Kafka, so when it comes to building solutions on Azure, the Event Hubs is a natural choice.  There is a &lt;a href=&quot;https://github.com/Azure/azure-event-hubs-spark&quot;&gt;Spark connector for Event Hubs&lt;/a&gt;, but right now it&#39;s not included into Databricks Runtime, and DLT doesn&#39;t allow (yet) to attach 3rd party Java libraries to a DLT pipeline.
&lt;/p&gt;

&lt;p&gt;
But there is a workaround for that problem - Azure Event Hubs &lt;a href=&quot;https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-for-kafka-ecosystem-overview&quot;&gt;provides an endpoint compatible with Apache Kafka protocol&lt;/a&gt;, so we can work with Event Hub topics using the Apache Kafka connector that is included into a Databricks Runtime.  We just need to follow the instructions in the &lt;a href=&quot;https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial&quot;&gt;official documentation&lt;/a&gt; with small changes, specific to DBR:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;we need to get &lt;a href=&quot;https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-get-connection-string&quot;&gt;Shared Access Signatures (SAS)&lt;/a&gt; to authenticate to Event Hubs topic - it should look like &lt;code&gt;Endpoint=sb://&amp;lt;....&amp;gt;.windows.net/;?...&lt;/code&gt; and will be used as a password.   For security reasons it&#39;s recommended to put it into a Databricks secret scope (update variables &lt;code&gt;secret_scope&lt;/code&gt; and &lt;code&gt;secret_name&lt;/code&gt; with your actual values).&lt;/li&gt;
&lt;li&gt;we need to form the correct string (the &lt;code&gt;eh_sasl&lt;/code&gt; variable) for SASL (&lt;a href=&quot;https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer&quot;&gt;Simple Authentication and Security Layer&lt;/a&gt;) authentication - as a user name we&#39;re using static value &lt;code&gt;$ConnectionString&lt;/code&gt;, and Event Hubs SAS is used as a password. SASL string looks a bit different on Databricks - instead of &lt;code&gt;org.apache.kafka.common.security.plain.PlainLoginModule...&lt;/code&gt; it should be prefixed with &lt;code&gt;kafkashaded.&lt;/code&gt; as the original Java package is shaded to avoid conflicts with other packages.&lt;/li&gt;
&lt;li&gt;you need to provide the name of the Event Hubs namespace &amp;amp; topic from which to read data in &lt;span style=&quot;font-family: courier;&quot;&gt;&lt;code&gt;eh_namespace_name&lt;/code&gt;&lt;/span&gt; and &lt;code&gt;topic_name&lt;/code&gt; variables.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
The final solution looks as follows:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-python&quot;&gt;&lt;span style=&quot;color: forestgreen;&quot;&gt;@dlt.table&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;def&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;eventhubs_topic1&lt;/span&gt;():
  &lt;span style=&quot;color: sienna;&quot;&gt;secret_scope&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;scope&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;secret_name&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;eventhub_sas&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;topic_name&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;topic1&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;eh_namespace_name&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;&amp;lt;eh-ns-name&amp;gt;&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;readConnectionString&lt;/span&gt; = dbutils.secrets.get(secret_scope, secret_name)
  &lt;span style=&quot;color: sienna;&quot;&gt;eh_sasl&lt;/span&gt; = &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule&#39;&lt;/span&gt; \
    + f&lt;span style=&quot;color: #8b2252;&quot;&gt;&#39; &lt;/span&gt;&lt;span style=&quot;color: #8b2252;&quot;&gt;&lt;span style=&quot;color: #8b2252;&quot;&gt;required &lt;/span&gt;username=&quot;$ConnectionString&quot; password=&quot;{readConnectionString}&quot;;&#39;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;bootstrap_servers&lt;/span&gt; = f&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;{eh_namespace_name}.servicebus.windows.net:9093&quot;&lt;/span&gt;
  &lt;span style=&quot;color: sienna;&quot;&gt;kafka_options&lt;/span&gt; = {
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;: bootstrap_servers,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.sasl.mechanism&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;PLAIN&quot;&lt;/span&gt;,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.security.protocol&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;SASL_SSL&quot;&lt;/span&gt;,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.request.timeout.ms&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;60000&quot;&lt;/span&gt;,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.session.timeout.ms&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;30000&quot;&lt;/span&gt;,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;startingOffsets&quot;&lt;/span&gt;: &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;earliest&quot;&lt;/span&gt;,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.sasl.jaas.config&quot;&lt;/span&gt;: eh_sasl,
     &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;subscribe&quot;&lt;/span&gt;: topic_name,
  }
  &lt;span style=&quot;color: #a020f0;&quot;&gt;return&lt;/span&gt; spark.readStream.&lt;span style=&quot;color: darkslateblue;&quot;&gt;format&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;) \
    .options(**kafka_options).load()
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
With it you can refer to your DLT table by name &lt;code&gt;eventhubs_topic1&lt;/code&gt; in the &lt;code&gt;dlt.read&lt;/code&gt; or &lt;code&gt;dlt.read_stream&lt;/code&gt; functions.  An example of using similar code can be seen in the image of a real DLT pipeline that I&#39;m using for processing of threat feeds (there will be a separate post on that topic) - the &lt;span style=&quot;font-family: courier;&quot;&gt;&lt;code&gt;threatintel_bronze&lt;/code&gt;&lt;/span&gt; consumes data from the Event Hubs.&amp;nbsp;&lt;/p&gt;&lt;p&gt;&amp;nbsp;&lt;/p&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjyLA1BAkoGjl3nEA5zkg0fJVEWkZLi79f02w4gZJ99Gq2N5mtNdCKINJ11XxIWQdcE-aZMTWd4qgFH5ZOTdu_ZAN-039Agf0ARGaenNBKHfkyCR48vbsz5FQg-hCkDJ35ZgN6KifcwElOZrBifwqhAjw6xpJ7YyEOt5pvPzssPzJoRZKvNsOY/s2434/Screenshot%202022-06-19%20at%2016.22.58.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;496&quot; data-original-width=&quot;2434&quot; height=&quot;130&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjyLA1BAkoGjl3nEA5zkg0fJVEWkZLi79f02w4gZJ99Gq2N5mtNdCKINJ11XxIWQdcE-aZMTWd4qgFH5ZOTdu_ZAN-039Agf0ARGaenNBKHfkyCR48vbsz5FQg-hCkDJ35ZgN6KifcwElOZrBifwqhAjw6xpJ7YyEOt5pvPzssPzJoRZKvNsOY/w640-h130/Screenshot%202022-06-19%20at%2016.22.58.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
There are also additional benefits in using the Apache Kafka connector. The biggest one is that the original Event Hubs connector requires 1-to-1 mapping between partitions in Event Hubs topic and Spark partitions.  This means that if you have more CPU cores than partitions in Event Hubs topic, then your cluster resources will be used only partially, so you will spend money doing nothing.  In the Apache Kafka connector, the &lt;code&gt;minPartitions&lt;/code&gt; parameter allows specifying the desired number of Spark partitions, and the connector will split existing Kafka/Event Hubs partitions into subranges, allowing creation of Spark partitions without 1-to-1 mapping.  And this greatly improves cluster utilization.  Stay tuned for a separate blog post on optimization of Spark + Event Hubs combo.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/1374007020476928648/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/1374007020476928648' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/1374007020476928648'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/1374007020476928648'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html' title='Delta Live Tables recipes: Consuming from Azure Event Hubs'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjyLA1BAkoGjl3nEA5zkg0fJVEWkZLi79f02w4gZJ99Gq2N5mtNdCKINJ11XxIWQdcE-aZMTWd4qgFH5ZOTdu_ZAN-039Agf0ARGaenNBKHfkyCR48vbsz5FQg-hCkDJ35ZgN6KifcwElOZrBifwqhAjw6xpJ7YyEOt5pvPzssPzJoRZKvNsOY/s72-w640-h130-c/Screenshot%202022-06-19%20at%2016.22.58.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-5587780599743263670</id><published>2021-12-31T18:43:00.002+01:00</published><updated>2021-12-31T18:43:17.383+01:00</updated><title type='text'>Goodbye 2021st...</title><content type='html'>&lt;p&gt;&amp;nbsp;As usual, 31st December is a good time to look back on the year behind.&amp;nbsp; This year flew by very fast, filled with many things both professional 
&amp;amp; personal.&lt;br /&gt;&lt;br /&gt;On the professional side there was a lot of activity - many different clients from small to huge, and interesting projects around very different things - architecture &amp;amp; implementation, security, automation/infrastructure, data quality, scaling the data processing (from an organisational perspective), improving development processes, etc.&amp;nbsp; I&#39;ll try to find time to write about lessons in some of these areas.&amp;nbsp; It&#39;s interesting that &lt;a href=&quot;https://github.com/alexott/databricks-nutter-projects-demo&quot;&gt;one demo&lt;/a&gt; that demonstrates how to do automated testing of Databricks notebooks (I developed it for my CI/CD workshop) is 2nd most popular of my Github repositories.&lt;br /&gt;&lt;br /&gt;As time allowed, I tried to continue to contribute to OSS.&amp;nbsp; The most significant OSS contribution was to &lt;a href=&quot;https://github.com/databrickslabs/terraform-provider-databricks&quot;&gt;Databricks Terraform Provider&lt;/a&gt; - around 10k lines of Go code.&amp;nbsp; Another big part of activity was to the &lt;a href=&quot;https://github.com/databrickslabs/overwatch&quot;&gt;project Overwatch&lt;/a&gt; that simplifies collection &amp;amp; analysis of data from Databricks workspaces to find problematic usage of resources, analyse costs, etc.&amp;nbsp; And on top of that, quite many small activities (PRs, issues, etc.) 
to fix bugs in documentation, port some components to Spark 3, etc.&amp;nbsp; Hopefully, I&#39;ll continue to work on OSS stuff at the same scale.&lt;br /&gt;&lt;br /&gt;From a personal side, the second pandemic year didn&#39;t allow a return back to &quot;normal life&quot;.&amp;nbsp; But we still managed to travel two times (it was quite a relief after almost 1.5 year since a &quot;normal&quot; vacation).&amp;nbsp; This year, I finally managed to complete my &lt;a href=&quot;https://www.goodreads.com/user_challenges/25194547&quot;&gt;reading challenge&lt;/a&gt; - primarily because of travelling just with Kindle, without distraction from iPad/laptop.&lt;br /&gt;&lt;br /&gt;I wish Happy New Year to everyone! And be safe!&lt;/p&gt;</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/5587780599743263670/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/5587780599743263670' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5587780599743263670'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5587780599743263670'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2021/12/goodbye-2021st.html' title='Goodbye 2021st...'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' 
src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-5998033149993842880</id><published>2020-12-31T17:47:00.000+01:00</published><updated>2020-12-31T17:47:14.574+01:00</updated><title type='text'>Looking back to 2020th</title><content type='html'>&lt;p&gt;
The last day of the year is a good opportunity to look back on what happened during the current year.
&lt;/p&gt;

&lt;p&gt;
The pandemic changed our life significantly, and I&#39;m not the exception, although maybe not so cardinally as for others - I&#39;m working mostly remotely for the last three years, with periodic trips to customers.  And this was the most significant change for me this year - everything became virtual in a very short period of time, without visits to customers onsite.  Although, in the first two months of the year I traveled a lot - almost half of the mileage for the 2019th.  The biggest effect of this switch to virtual was on trainings that I did for customers - if you can collaborate with people remotely when investigating some problems, discussing implementations, etc., with trainings that&#39;s different - it&#39;s harder to see if people understand what you&#39;re teaching, as you don&#39;t see reactions - this required changing the approach to teaching, including materials that are presented…
&lt;/p&gt;

&lt;p&gt;
The biggest change that happened this year is the new job - after interesting years at DataStax, I went to Databricks, to a similar position - helping customers to build solutions on the top of the Databricks platform.  Databricks is well known as a company behind Spark, but it&#39;s not only Spark - MLflow &amp;amp; Delta Lake are very popular &amp;amp; powerful technologies for building data processing &amp;amp; machine learning solutions.  And inside Databricks, all of them are getting new functionality faster, before release to the open source.  And being a cloud platform, Databricks made it easier working with customers during pandemic - you aren&#39;t required to be onsite to help people.  Overall, it&#39;s very interesting to be in a fast growing company, with a lot of really smart people around, so you can learn a lot.  Plus, I got much more exposure to the Azure &amp;amp; AWS services that I didn&#39;t touch much before.  One of the interesting things to observe is that Spark is traditionally associated with Scala, but in practice I&#39;m writing much more code in Python/PySpark :-)
&lt;/p&gt;

&lt;p&gt;
This year I again didn&#39;t make my reading challenge - I set it to 55 books, like the last year, but read only 40 (it was 46 in 2019th) - this also was a side effect of the not traveling so much, plus a job change (but I read a lot of documentation :-).  One of the book-related activities was a technical proofreading of several books by O&#39;Reilly &amp;amp; Manning (I worked with Manning before on several books):
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Cassandra. The Definitive Guide, 3rd edition&lt;/li&gt;
&lt;li&gt;The Practitioner&#39;s Guide to Graph Data&lt;/li&gt;
&lt;li&gt;Graph Databases in Action&lt;/li&gt;
&lt;li&gt;Graph-Powered Machine Learning (not released yet)&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
Programming related activity was spread between open source &amp;amp; internal projects.  On the open source front, I became the committer for &lt;a href=&quot;https://zeppelin.apache.org/&quot;&gt;Apache Zeppelin&lt;/a&gt;, primarily improving support for Cassandra (more details are &lt;a href=&quot;https://alexott.blogspot.com/2020/07/new-functionality-of-cassandra.html&quot;&gt;in this blog post&lt;/a&gt;), but with the job change that was put on hold.  But at the new job I suddenly started to write in Go again, contributing to the &lt;a href=&quot;https://github.com/databrickslabs/terraform-provider-databricks&quot;&gt;Terraform provider for Databricks&lt;/a&gt;.  Besides that, there were a lot of small contributions to multiple OSS projects, including the several &lt;a href=&quot;https://github.com/DataStax-Toolkit/&quot;&gt;open sourced projects&lt;/a&gt; at DataStax that just make life of administrators easier.
&lt;/p&gt;

&lt;p&gt;
This year also was more productive than previous around writing.  I wrote (with co-authors) two blog posts for DataStax&#39;s blog (&lt;a href=&quot;https://www.datastax.com/blog/advanced-apache-cassandra-analytics-now-open-all&quot;&gt;1&lt;/a&gt;, &lt;a href=&quot;https://www.datastax.com/blog/migrate-cassandra-apps-cloud-20-lines-code&quot;&gt;2&lt;/a&gt;), and seven blog posts for my &lt;a href=&quot;https://alexott.blogspot.com/&quot;&gt;own blog&lt;/a&gt; around &lt;a href=&quot;https://alexott.blogspot.com/search/label/cassandra&quot;&gt;Cassandra&lt;/a&gt;, &lt;a href=&quot;https://alexott.blogspot.com/search/label/spark&quot;&gt;Spark&lt;/a&gt;, &lt;a href=&quot;https://alexott.blogspot.com/search/label/zeppelin&quot;&gt;Zeppelin&lt;/a&gt;, &lt;a href=&quot;https://alexott.blogspot.com/search/label/datastax&quot;&gt;DataStax&lt;/a&gt;, and &lt;a href=&quot;https://alexott.blogspot.com/search/label/databricks&quot;&gt;Databricks&lt;/a&gt;.  And I have drafts for several blog posts that I&#39;m planning to publish early next year.
&lt;/p&gt;

&lt;p&gt;
And many other things happened during the year, but I don&#39;t want to list everything here :-)
&lt;/p&gt;

&lt;p&gt;
I wish you everyone a happy &amp;amp; prosperous New Year! And stay healthy - this is the main thing right now. 
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/5998033149993842880/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/5998033149993842880' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5998033149993842880'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5998033149993842880'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/12/looking-back-to-2020th.html' title='Looking back to 2020th'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-3094407075655152998</id><published>2020-07-31T12:50:00.005+02:00</published><updated>2020-07-31T12:51:18.243+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="datastax"/><category scheme="http://www.blogger.com/atom/ns#" term="spark"/><category scheme="http://www.blogger.com/atom/ns#" term="zeppelin"/><title type='text'>Running Apache Zeppelin on DSE Analytics</title><content type='html'>&lt;p&gt;
DataStax Enterprise (DSE) includes the modified version of Apache Spark branded as DSE Analytics.  This version has increased fault tolerance, doesn&#39;t rely on the Zookeeper, and has many additional optimizations &amp;amp; enhancements for work with Cassandra. It also includes a Hadoop-compatible distributed file system - DSEFS.  And I already wrote about &lt;a href=&quot;https://alexott.blogspot.com/2020/07/using-zeppelin-to-work-with-data-in-dse.html&quot;&gt;using Zeppelin with another component of DSE Analytics - AlwaysOn SQL Service&lt;/a&gt;.
&lt;/p&gt;

&lt;p&gt;
In this post I want to discuss how we can use Apache Zeppelin to run on the DSE Analytics, allowing us to use the Spark resources that we already have in the DSE cluster.  If we just need to access data in DSE from Spark, without running Spark code in DSE Analytics, then we just need to configure Zeppelin to use the &lt;a href=&quot;https://www.datastax.com/blog/2020/05/advanced-apache-cassandra-analytics-now-open-all&quot;&gt;recently released Spark Cassandra Connector 2.5&lt;/a&gt; as it has better compatibility with DSE.  For this post I used Zeppelin 0.9-preview2 with DSE 6.8.1 that includes Spark 2.4.
&lt;/p&gt;

&lt;p&gt;
To run Zeppelin on DSE Analytics we have two options that are described in corresponding sections:
&lt;/p&gt;
&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Execute Zeppelin directly on the node of DSE cluster - this is the easiest way, but not very good from a security standpoint, adding more load to the DSE node, etc.&lt;/li&gt;
&lt;li&gt;Execute Zeppelin on some other node that has access to the DSE cluster - this solves security and other problems, but requires more work to set up&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;
In both cases we&#39;re relying on the code shipped in DSE, and we don&#39;t need to explicitly install Spark Cassandra Connector.
&lt;/p&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org60055c6&quot;&gt;
&lt;h4 id=&quot;org60055c6&quot;&gt;Running Zeppelin on DSE node(s)&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-5-1&quot;&gt;
&lt;p&gt;
This is the most straightforward way to run Zeppelin &amp;amp; get access to DSE Analytics, DSEFS, etc.  The procedure is simple:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Start Zeppelin as &lt;code&gt;dse exec path_to_zeppelin.sh&lt;/code&gt; on one of the nodes inside DSE Analytics data center. &lt;a href=&quot;https://docs.datastax.com/en/dse/5.1/dse-dev/datastax_enterprise/spark/thirdPartyToolsIntegrationSpark.html#Zeppelinintegration&quot;&gt;dse exec&lt;/a&gt; will setup all necessary parameters - &lt;code&gt;CLASSPATH&lt;/code&gt;, etc., so Zeppelin will pick up all necessary jars that are necessary to submit jobs to the DSE Analytics&lt;/li&gt;
&lt;li&gt;In Zeppelin UI change the Spark interpreter settings. Change the &lt;code&gt;spark.master&lt;/code&gt; (&lt;code&gt;master&lt;/code&gt; in the Zeppelin 0.8)  parameter to &lt;code&gt;dse://?&lt;/code&gt; instead of default &lt;code&gt;local[*]&lt;/code&gt; - this will force Zeppelin to execute jobs on DSE Analytics, with all its advantages, like, automatic registration of DSE tables in Spark SQL, access to DSEFS, etc.&lt;/li&gt;
&lt;/ul&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgi1rXv-CW9ho-Q0E1m8ArPbngvS6ZadRwmKnv7RoVQ_ro6tiz3rxrZX1v-boopH1O_f8-P3mxaW7JYIJgeGEL6GwQtaGAQK83Wk6aeK4JRTPCZ1RBNA7aDD7ILi5CxQkr3fDf0gg/s1132/zeppelin-dse-analytics-1-configure.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;351&quot; data-original-width=&quot;1132&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgi1rXv-CW9ho-Q0E1m8ArPbngvS6ZadRwmKnv7RoVQ_ro6tiz3rxrZX1v-boopH1O_f8-P3mxaW7JYIJgeGEL6GwQtaGAQK83Wk6aeK4JRTPCZ1RBNA7aDD7ILi5CxQkr3fDf0gg/d/zeppelin-dse-analytics-1-configure.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
After configuration is changed, we can execute Spark code to read data from DSE, write to DSEFS, execute Spark SQL queries (and we don&#39;t need to explicitly register Cassandra tables!), etc.:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTgqGI335OhhL1wIEaJhYEQ8wORmIH9IImh4hyphenhyphenSIrPjsbKZKF1lFsAQAJjYimbB_LXnLVWNN0LyrWv10Ba4CLCCx1s3hCKmC4cYGRRmjL4thvY_sL1mnIPJMX6Zo80tWU9DgPMBQ/s1115/zeppelin-dse-analytics-1-notebook.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;833&quot; data-original-width=&quot;1115&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiTgqGI335OhhL1wIEaJhYEQ8wORmIH9IImh4hyphenhyphenSIrPjsbKZKF1lFsAQAJjYimbB_LXnLVWNN0LyrWv10Ba4CLCCx1s3hCKmC4cYGRRmjL4thvY_sL1mnIPJMX6Zo80tWU9DgPMBQ/d/zeppelin-dse-analytics-1-notebook.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
And we can see in the Spark Master of DSE Analytics that Zeppelin is really executed there:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjXdyUTAzuamsnaXtxVDClaOPXya-h3vusAZA6Hhjg02aRnzEX_dnaWHA6pOTgjDntgyERKZuZBEqM75hUQM3GAw696XlpWp35BBF1m874AWINmmxiIlfjRv3A5DS8AfDO5OouzQg/s1116/zeppelin-dse-analytics-1-spark-master.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;756&quot; data-original-width=&quot;1116&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjXdyUTAzuamsnaXtxVDClaOPXya-h3vusAZA6Hhjg02aRnzEX_dnaWHA6pOTgjDntgyERKZuZBEqM75hUQM3GAw696XlpWp35BBF1m874AWINmmxiIlfjRv3A5DS8AfDO5OouzQg/d/zeppelin-dse-analytics-1-spark-master.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-orga9caecc&quot;&gt;
&lt;h4 id=&quot;orga9caecc&quot;&gt;&lt;span class=&quot;section-number-4&quot;&gt;&lt;/span&gt;Running Zeppelin with DSE Analytics outside of DSE Cluster&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-5-2&quot;&gt;
&lt;p&gt;
Sometimes, it&#39;s undesirable to run Zeppelin on the DSE node directly due to many reasons - resource consumption, security concerns (for example, people may get access to files via shell interpreter or other means), etc.  In this case we can still have benefits of running Zeppelin via &lt;code&gt;dse exec&lt;/code&gt; - we just need to do the following:
&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;download DSE distribution and unpack it on the machine where we want to run Zeppelin - you don&#39;t need to configure or start anything.  We just need DSE-specific jar files to be able to run it&lt;/li&gt;
&lt;li&gt;start Zeppelin via &lt;code&gt;dse exec&lt;/code&gt; as before&lt;/li&gt;
&lt;li&gt;configure it to run on DSE Analytics, but we&#39;ll need to make more operations to achieve this:
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;we need to obtain an IP address of Spark master - this could be done either by looking for Spark master IP in output of &lt;code&gt;dsetool status&lt;/code&gt;, or we can use &lt;a href=&quot;https://docs.datastax.com/en/dse/6.8/dse-dev/datastax_enterprise/tools/dseClientTool/dseClientToolcommands/dseClient-toolSpark.html&quot;&gt;dse client-tool spark master-address&lt;/a&gt; - this option would be even easier for automatic configuration of the Zeppelin, because it will print complete URI of Spark master&lt;/li&gt;
&lt;li&gt;change &lt;code&gt;spark.master&lt;/code&gt; parameter to value obtained via &lt;code&gt;dse client-tool spark master-address&lt;/code&gt; - it should be at least &lt;code&gt;dse://&amp;lt;Master-IP&amp;gt;?&lt;/code&gt;, or with more parameters like &lt;code&gt;connection.host&lt;/code&gt; and &lt;code&gt;local_dc&lt;/code&gt;. For example: &lt;code&gt;dse://10.121.34.176:9042?connection.local_dc=SearchAnalytics;connection.host=10.121.34.94,10.121.33.133&lt;/code&gt;; &lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjmf8P67F3qh4A2PvR8hYzR5tI01Aat6JQroCJGBKw0FKuXrAKAaxwIrkHCSEjskmCpoFVQIBtmD5YNc1rIHeJUrH3oziDOjvbmKvgHAq25KOLWtal95NbWgNkVcQXXoOtkQs_KNw/s1094/zeppelin-dse-analytics-2-configure-1.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;381&quot; data-original-width=&quot;1094&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjmf8P67F3qh4A2PvR8hYzR5tI01Aat6JQroCJGBKw0FKuXrAKAaxwIrkHCSEjskmCpoFVQIBtmD5YNc1rIHeJUrH3oziDOjvbmKvgHAq25KOLWtal95NbWgNkVcQXXoOtkQs_KNw/d/zeppelin-dse-analytics-2-configure-1.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;if there is no &lt;code&gt;connection.host&lt;/code&gt; in the Spark master URI, then you need to add the &lt;code&gt;spark.cassandra.connection.host&lt;/code&gt; property, and put there a comma-separated list of DSE nodes&lt;/li&gt;
&lt;li&gt;if necessary, add other properties specific for Spark Cassandra Connector and DSE Analytics.  We can obtain them by executing &lt;a href=&quot;https://docs.datastax.com/en/dse/6.8/dse-dev/datastax_enterprise/tools/dseClientTool/dseClientToolcommands/dseClient-toolConfigurationByos-export.html&quot;&gt;dse client-tool configuration byos-export&lt;/a&gt; command. Usually these are properties related to security, but we can specify any additional property specific for the Spark Cassandra Connector, like, username and passwords, or performance tuning options&lt;/li&gt;
&lt;li&gt;to work with DSEFS as the default file system we can specify the Hadoop option &lt;code&gt;spark.hadoop.fs.defaultFS&lt;/code&gt; with value of &lt;code&gt;dsefs://&amp;lt;DSE_NODE_IP&amp;gt;&lt;/code&gt;.  This is not strictly required, we still can use DSEFS but we&#39;ll need to specify node address in the path, like, &lt;code&gt;dsefs://192.168.0.10/file.csv&lt;/code&gt; (see screenshot below)&lt;/li&gt;&lt;/ul&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEin_T5WCVsAr4EbBQypa18QvmTDCkD6i74i1C3SYHs9cjzLEBAA2CQarEThtK1JEuYC-yvogdd1LsQr23vg0MEFX8GfKZI24lA-TDVNsrnknKwjpjncEnCLf1AiB6rYPhdKZCeX4g/s1082/zeppelin-dse-analytics-2-configure-2.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;91&quot; data-original-width=&quot;1082&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEin_T5WCVsAr4EbBQypa18QvmTDCkD6i74i1C3SYHs9cjzLEBAA2CQarEThtK1JEuYC-yvogdd1LsQr23vg0MEFX8GfKZI24lA-TDVNsrnknKwjpjncEnCLf1AiB6rYPhdKZCeX4g/d/zeppelin-dse-analytics-2-configure-2.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;/div&gt;&lt;ul class=&quot;org-ul&quot;&gt;
&lt;/ul&gt;

&lt;p&gt;
After everything is configured, we can execute our code. The result will be the same: we&#39;ll get the Zeppelin process running on DSE Analytics, and we&#39;ll have full access to data. And we can use DSEFS as well - we can write data to DSEFS using explicit or implicit filesystem:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZjdDFdBYj93H3qwVDMUjUBCBrv3hPjTTgC7RijYl8WZ-faj5RexR4-zq8jPb-0dm1AX4ud6va19UdEJJMb-63hcLdiTJfVzIZTlK2liJuk86NYyqt3r6Rxt5LLbopU7iaw5aVBQ/s1095/zeppelin-dse-analytics-2-dsefs-results.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;173&quot; data-original-width=&quot;1095&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZjdDFdBYj93H3qwVDMUjUBCBrv3hPjTTgC7RijYl8WZ-faj5RexR4-zq8jPb-0dm1AX4ud6va19UdEJJMb-63hcLdiTJfVzIZTlK2liJuk86NYyqt3r6Rxt5LLbopU7iaw5aVBQ/d/zeppelin-dse-analytics-2-dsefs-results.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
and see that data on DSEFS:
&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEik7JAe3oBxD3Vu0LyGh6AzIpCnQxmhtQdb0_denF5CJfkW662G4SSFhftnEvsGQx5cWtCSyl1pu2zhDPQ1q9r2JTtznPXlktg2Z5N8nYTWMS3hopu9rH-F7705S0wUpzTOObhqBQ/s1009/zeppelin-dse-analytics-2-dsefs-write.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;573&quot; data-original-width=&quot;1009&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEik7JAe3oBxD3Vu0LyGh6AzIpCnQxmhtQdb0_denF5CJfkW662G4SSFhftnEvsGQx5cWtCSyl1pu2zhDPQ1q9r2JTtznPXlktg2Z5N8nYTWMS3hopu9rH-F7705S0wUpzTOObhqBQ/s640/zeppelin-dse-analytics-2-dsefs-write.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-orgc7ebf51&quot;&gt;
&lt;h4 id=&quot;orgc7ebf51&quot;&gt;Conclusion&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-5-3&quot;&gt;
&lt;p&gt;
This post shows that it&#39;s quite easy to run Apache Zeppelin on DSE Analytics, either directly on the cluster&#39;s nodes, or outside of the DSE cluster.  For the second option, the setup process could be simplified by packing both DSE &amp;amp; Zeppelin into a Docker image (&lt;a href=&quot;https://gist.github.com/alexott/246e9ab5e50416d83c080f53529cecf6&quot;&gt;example&lt;/a&gt;), and configuring Zeppelin using its &lt;a href=&quot;https://zeppelin.apache.org/docs/0.9.0-preview2/usage/rest_api/configuration.html&quot;&gt;configuration REST API&lt;/a&gt;.
&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/3094407075655152998/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/3094407075655152998' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/3094407075655152998'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/3094407075655152998'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/07/running-apache-zeppelin-on-dse-analytics.html' title='Running Apache Zeppelin on DSE Analytics'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgi1rXv-CW9ho-Q0E1m8ArPbngvS6ZadRwmKnv7RoVQ_ro6tiz3rxrZX1v-boopH1O_f8-P3mxaW7JYIJgeGEL6GwQtaGAQK83Wk6aeK4JRTPCZ1RBNA7aDD7ILi5CxQkr3fDf0gg/s72-c-d/zeppelin-dse-analytics-1-configure.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-1312506369334664265</id><published>2020-07-30T11:59:00.004+02:00</published><updated>2020-07-30T12:10:20.088+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="cassandra"/><category scheme="http://www.blogger.com/atom/ns#" term="zeppelin"/><title type='text'>What&#39;s new in Apache Zeppelin&#39;s Cassandra interpreter</title><content type='html'>&lt;p&gt;
The upcoming Zeppelin 0.9 is a very big release for Apache Zeppelin (the 0.9.0-preview2 was just released).  A lot has happened since release of the 0.8.x series - better support for Spark &amp;amp; Flink, new interpreters (Influxdb, KSQL, MongoDB, SPARQL, …), a lot of bug fixes and improvements in the existing interpreters.  In this blog post I want to specifically discuss improvements in the Cassandra interpreter that exists since Zeppelin 0.5.5, released almost 5 years ago.  
&lt;/p&gt;

&lt;p&gt;
The two most notable changes in the new release (already available in the &lt;code&gt;0.9.0-preview2&lt;/code&gt;) are:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Upgrade of the driver to DataStax Java driver 4.x (&lt;a href=&quot;https://issues.apache.org/jira/browse/ZEPPELIN-4378&quot;&gt;ZEPPELIN-4378&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;Control of formatting for results of SELECT queries (&lt;a href=&quot;https://issues.apache.org/jira/browse/ZEPPELIN-4796&quot;&gt;ZEPPELIN-4796&lt;/a&gt;)&lt;/li&gt;
&lt;/ul&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org1198230&quot;&gt;
&lt;h4 id=&quot;org1198230&quot;&gt;Upgrade to the DataStax Java driver 4.x&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-1&quot;&gt;
&lt;p&gt;
Prior releases of the Cassandra interpreter were based on the open source &lt;a href=&quot;https://github.com/datastax/java-driver/tree/3.x&quot;&gt;DataStax Java Driver for Apache Cassandra 3.x&lt;/a&gt;.  It worked fine with Apache Cassandra, but was not always usable with DataStax Enterprise (DSE), for example, you couldn&#39;t use it with DSE-specific data types, like, &lt;code&gt;Point&lt;/code&gt;, when you get data back as &lt;code&gt;ByteBuffer&lt;/code&gt; instead of &lt;code&gt;Point&lt;/code&gt;:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiVgF_4na0zQkMMAUwIhYc36s4eVJvkcA0IVYhUQ83aygcEm8F7vyVEdN04CR5hHIAQVM1P2KUCmLb36_kl3qZdxxYZ_3SdwakGTu6ZUIbgwcY1Vz5xeLFT8xhXozD3kvO85xvM0w/s1082/zeppelin-driver3-geo-points.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;222&quot; data-original-width=&quot;1082&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiVgF_4na0zQkMMAUwIhYc36s4eVJvkcA0IVYhUQ83aygcEm8F7vyVEdN04CR5hHIAQVM1P2KUCmLb36_kl3qZdxxYZ_3SdwakGTu6ZUIbgwcY1Vz5xeLFT8xhXozD3kvO85xvM0w/d/zeppelin-driver3-geo-points.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
DataStax Java driver 4.0, released in March 2019th, was a complete rewrite of the Cassandra driver to make it more scalable and fault-tolerant.  To achieve these goals, the architecture of the driver has changed significantly, making it binary incompatible with previous versions.  Also since Java driver 4.4.0, released in January 2020th, all DSE-specific functionality is &lt;a href=&quot;https://www.datastax.com/blog/2020/01/better-drivers-for-cassandra&quot;&gt;available in the single (unified) driver&lt;/a&gt;, instead of traditional separation on OSS &amp;amp; DSE drivers.  With release of the unified driver 4, the 3.x series of the driver was put into the maintenance mode, receiving only critical bug-fixes, but no new features.  
&lt;/p&gt;

&lt;p&gt;
To get access to the new features of the driver, internals of Cassandra interpreter were rewritten.  Because of the architectural changes of the new driver, the changes in the interpreter were quite significant.  But in result we&#39;re getting more functionality:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;Access to all improvements and new functions provided by the driver itself - better load balancing policy, fault tolerance, performance, etc.&lt;/li&gt;
&lt;li&gt;Allow to configure all parameters of the Java driver.  In previous versions of interpreter, every configuration option of the driver should be explicitly exposed in the interpreter&#39;s configuration, and addition of the new option required change in the interpreter&#39;s code, and release of the new version together with Zeppelin release.  In the new version of interpreter, we can set any driver configuration option, even if it&#39;s not explicitly exposed by interpreter.  This is possible because of the &lt;a href=&quot;https://docs.datastax.com/en/developer/java-driver/4.7/manual/core/configuration/&quot;&gt;way the new Java driver is configured&lt;/a&gt; - configuration could be specified in the config file, set programmatically, or even via Java system properties.   This flexibility was already demonstrated in the &lt;a href=&quot;https://alexott.blogspot.com/2020/06/working-with-datastax-astra-from-apache.html&quot;&gt;blog post on connecting Zeppelin to the DataStax&#39;s Astra&lt;/a&gt; (Cassandra as a Service)&lt;/li&gt;
&lt;li&gt;Support for DSE-specific features, for example, now it&#39;s possible to execute commands of DSE Search, or work with geospatial data types:&lt;/li&gt;
&lt;/ul&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgCNt01nHY2Rj0VZWV0azUpY54OH3DpIVTJ1GJREXPfpEA7idGS0dG15vt_-EEUWSx1QRJfl7pmchZ5UDDkcGf30cZxAjmHlf67BoALxl3wz03kK_LdVFPWRhWnSQRO3bTdgUiVlA/s1090/zeppelin-driver4-geo-points.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;201&quot; data-original-width=&quot;1090&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgCNt01nHY2Rj0VZWV0azUpY54OH3DpIVTJ1GJREXPfpEA7idGS0dG15vt_-EEUWSx1QRJfl7pmchZ5UDDkcGf30cZxAjmHlf67BoALxl3wz03kK_LdVFPWRhWnSQRO3bTdgUiVlA/d/zeppelin-driver4-geo-points.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
Because of the changes in driver itself, there are some breaking changes in interpreter:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;the new driver &lt;a href=&quot;https://docs.datastax.com/en/developer/java-driver/4.7/manual/core/native_protocol/#compatibility-matrix&quot;&gt;supports only Cassandra versions&lt;/a&gt; that implement native protocol V3 and higher (Cassandra 2.1+, and DSE 4.7+).  As result, support for Cassandra 1.2 and 2.0 is dropped (but you shouldn&#39;t use them in 2020th anyway)&lt;/li&gt;
&lt;li&gt;there is only &lt;a href=&quot;https://docs.datastax.com/en/developer/java-driver/4.7/manual/core/retries/&quot;&gt;one retry policy provided by the new driver&lt;/a&gt;, and support for other retry policies (&lt;code&gt;LoggingRetryPolicy&lt;/code&gt;, &lt;code&gt;FallthroughRetryPolicy&lt;/code&gt;, etc.) are removed.  As result of this, support for query parameter &lt;code&gt;@retryPolicy&lt;/code&gt; was dropped, so existing notebooks that are using this parameter need to be modified&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-orgef5a268&quot;&gt;
&lt;h4 id=&quot;orgef5a268&quot;&gt;Control of the results&#39; formatting&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-2&quot;&gt;
&lt;p&gt;
The previous version of the interpreter always used the predefined formatting for numbers, and date/time related data types. Also, the content of the collections (maps, sets &amp;amp; lists), tuples, and user-defined types was always formatted using the CQL syntax.  This wasn&#39;t always flexible, especially for building graphs, or exporting data into a file for importing into external system that may expect data in some specific format.  
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgmjOwoWfoH96kMx4zxIKy0j928fIkGozdhk6LVP9cSblFP7AKsdugCMCPwjyFsaUavDbnVBfDrA7vrj-u22OcIqvD2i7E6mC3aonni0scAlqBkAJ0uc3HiZOpvQVQDHctGBz2wgg/s1218/zeppelin-formatting-old.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;217&quot; data-original-width=&quot;1218&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgmjOwoWfoH96kMx4zxIKy0j928fIkGozdhk6LVP9cSblFP7AKsdugCMCPwjyFsaUavDbnVBfDrA7vrj-u22OcIqvD2i7E6mC3aonni0scAlqBkAJ0uc3HiZOpvQVQDHctGBz2wgg/d/zeppelin-formatting-old.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
In a new interpreter users can control formatting of results - you can configure this on interpreter and even on the cell level.  This includes:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;selection between output in the human-readable or strict CQL format.  In the human-readable format, users can have more control on the formatting, like, specification of precision, formatting of date/time results, etc.&lt;/li&gt;
&lt;li&gt;control of precision for &lt;code&gt;float&lt;/code&gt;, &lt;code&gt;double&lt;/code&gt;, and &lt;code&gt;decimal&lt;/code&gt; types&lt;/li&gt;
&lt;li&gt;specification of locale that will be used for formatting - this affects date/time &amp;amp; numeric types&lt;/li&gt;
&lt;li&gt;specification of format for date/time types for each of &lt;code&gt;date&lt;/code&gt;, &lt;code&gt;time&lt;/code&gt;, and &lt;code&gt;timestamp&lt;/code&gt; types.  You can use any option of &lt;a href=&quot;https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html&quot; target=&quot;_blank&quot;&gt;DateTimeFormatter&lt;/a&gt; class&lt;/li&gt;
&lt;li&gt;specification of timezone for &lt;code&gt;timestamp&lt;/code&gt; type&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
All of this is applied to all data, including the content of collections, tuples, and user-defined types.
&lt;/p&gt;

&lt;p&gt;
Formatting options could be set on the interpreter level by changing new configuration options (see &lt;a href=&quot;https://zeppelin.apache.org/docs/0.9.0-preview2/interpreter/cassandra.html&quot; target=&quot;_blank&quot;&gt;documentation for details&lt;/a&gt;) - if you change them, this will affect all notebooks:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhxNitlBvLqM2VISkX8qOF4FATWylU0l_NVbZwRjAjtZyluQ6DY_ZaZBJjoTmTYEO3Aaa1KJ6rSQrdca2I2lCfhJFYKpG1oxzRXqYDaLGDRDNj2oQ0xhzuZFqsD9NVm17NWnhs6SA/s1335/zeppelin-formatting-config-options.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;305&quot; data-original-width=&quot;1335&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhxNitlBvLqM2VISkX8qOF4FATWylU0l_NVbZwRjAjtZyluQ6DY_ZaZBJjoTmTYEO3Aaa1KJ6rSQrdca2I2lCfhJFYKpG1oxzRXqYDaLGDRDNj2oQ0xhzuZFqsD9NVm17NWnhs6SA/d/zeppelin-formatting-config-options.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;

&lt;p&gt;
With default options, user will get data in human-readable format, like this:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh4-XpNA5RgTfbTG4M81eqQT2z8IcTyPCQSTw7P7jn4P0wuussGD9yMxFBle89tQgPwWcGu3LAJaI_PbHN7KzKBFJZi_gnDp29-BQafAjo42gfKan0OaM3BEO9DV0cujzLMakkCVA/s1226/zeppelin-formatting-new-default.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;216&quot; data-original-width=&quot;1226&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh4-XpNA5RgTfbTG4M81eqQT2z8IcTyPCQSTw7P7jn4P0wuussGD9yMxFBle89tQgPwWcGu3LAJaI_PbHN7KzKBFJZi_gnDp29-BQafAjo42gfKan0OaM3BEO9DV0cujzLMakkCVA/d/zeppelin-formatting-new-default.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
But sometimes it&#39;s useful to change formatting only in specific cells.  This is now possible by specifying options in the list after the interpreter name, like &lt;code&gt;%cassandra(option=value, ...)&lt;/code&gt; (please note, that if option includes &lt;code&gt;=&lt;/code&gt; or &lt;code&gt;,&lt;/code&gt; characters, it should be put into double quotes, or escaped with &lt;code&gt;\&lt;/code&gt;). There are multiple options available, that are described in the &lt;a href=&quot;https://zeppelin.apache.org/docs/0.9.0-preview2/interpreter/cassandra.html&quot;&gt;documentation&lt;/a&gt; and built-in help. For example, we can change formatting to CQL:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhMiHS0LLtVMEs7tgqn47NgTyCr0SW7QTLT91g3Jyp5mCovKemnFElzXJAWyHhWkuIfhVCcshwQfBsMOos9oWrk8RVDArYw-uWfKP-zmcMBnJKtMaMw_CvWsTVmz4qaSZz4BrbLBw/s1227/zeppelin-formatting-new-cql.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;197&quot; data-original-width=&quot;1227&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhMiHS0LLtVMEs7tgqn47NgTyCr0SW7QTLT91g3Jyp5mCovKemnFElzXJAWyHhWkuIfhVCcshwQfBsMOos9oWrk8RVDArYw-uWfKP-zmcMBnJKtMaMw_CvWsTVmz4qaSZz4BrbLBw/d/zeppelin-formatting-new-cql.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
Or we can set multiple options at the same time - locale (see that it affects formatting of numbers and date/time), timezone, format of timestamp, date, etc.:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhrwgNaA0wPMSvtRnHT_X9hk4FFO3GERshUV637iE64zzmmd7alPcNUHExVzhnzXiRMOBRzqF74OHLG9YzllYsrhGRjTVyqpCNu1ZpJ5hO26HwxdqFglVKWQi1J5_08rWdwAiIHVQ/s1345/zeppelin-formatting-new-with-options.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;250&quot; data-original-width=&quot;1345&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhrwgNaA0wPMSvtRnHT_X9hk4FFO3GERshUV637iE64zzmmd7alPcNUHExVzhnzXiRMOBRzqF74OHLG9YzllYsrhGRjTVyqpCNu1ZpJ5hO26HwxdqFglVKWQi1J5_08rWdwAiIHVQ/d/zeppelin-formatting-new-with-options.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org4a27d51&quot;&gt;
&lt;h4 id=&quot;org4a27d51&quot;&gt;Other changes&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-3&quot;&gt;
&lt;p&gt;
There are also smaller changes available in the new release - they are making the interpreter more stable, or add a new functionality.  This includes:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;(&lt;a href=&quot;https://issues.apache.org/jira/browse/ZEPPELIN-4444&quot;&gt;ZEPPELIN-4444&lt;/a&gt;) explicitly check for schema disagreement when executing the DDL statements (&lt;code&gt;CREATE/ALTER/DROP&lt;/code&gt;).  This is very important for stability of the Cassandra cluster, especially when executing many of them from the same cell.  Because Cassandra is a distributed system, they could be executed on the different nodes in almost the same time, and such uncoordinated execution may lead to a state of the cluster called &quot;schema disagreement&quot; when different nodes have different versions of the database schema.  Fixing this state usually &lt;a href=&quot;https://support.datastax.com/hc/en-us/articles/360001375763-Handling-schema-disagreements-and-Schema-version-mismatch-detected-on-node-restart&quot;&gt;requires manual intervention of database administrators&lt;/a&gt;, and restarting of the affected nodes&lt;/li&gt;
&lt;li&gt;(&lt;a href=&quot;https://issues.apache.org/jira/browse/ZEPPELIN-4393&quot;&gt;ZEPPELIN-4393&lt;/a&gt;) added support for &lt;code&gt;--&lt;/code&gt; comment style, in addition to already supported &lt;span style=&quot;font-family: courier;&quot;&gt;&lt;code&gt;//&lt;/code&gt;&lt;/span&gt; and &lt;span style=&quot;font-family: courier;&quot;&gt;&lt;code&gt;/* .. */&lt;/code&gt;&lt;/span&gt; styles&lt;/li&gt;
&lt;li&gt;(&lt;a href=&quot;https://issues.apache.org/jira/browse/ZEPPELIN-4756&quot;&gt;ZEPPELIN-4756&lt;/a&gt;) make &quot;No results&quot; messages foldable &amp;amp; folded by default.  In previous versions, when we didn&#39;t get any results from Cassandra, for example, by executing &lt;code&gt;INSERT/DELETE/UPDATE&lt;/code&gt;, or DDL queries, interpreter output a table with statement itself, and information about execution (what hosts were used for execution, etc.).  This table occupied quite significant space on the screen, but usually didn&#39;t bring much useful information for a user.  In the new version, this information is still produced, but it&#39;s folded, so it doesn&#39;t occupy screen space, and still available if necessary.&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org3fad7a0&quot;&gt;
&lt;h4 id=&quot;org3fad7a0&quot;&gt;Conclusion&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-4&quot;&gt;
&lt;p&gt;
I hope that all described changes will make use of the Cassandra from Zeppelin easier.  If you have ideas for a new functionality in Cassandra interpreter, or found a bug, feel free to create an issue at &lt;a href=&quot;https://issues.apache.org/jira/browse/ZEPPELIN&quot;&gt;Apache Zeppelin&#39;s Jira&lt;/a&gt;, or drop an email to &lt;a href=&quot;https://zeppelin.apache.org/community.html&quot;&gt;Zeppelin user mailing list&lt;/a&gt;.
&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/1312506369334664265/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/1312506369334664265' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/1312506369334664265'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/1312506369334664265'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/07/new-functionality-of-cassandra.html' title='What&#39;s new in Apache Zeppelin&#39;s Cassandra interpreter'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiVgF_4na0zQkMMAUwIhYc36s4eVJvkcA0IVYhUQ83aygcEm8F7vyVEdN04CR5hHIAQVM1P2KUCmLb36_kl3qZdxxYZ_3SdwakGTu6ZUIbgwcY1Vz5xeLFT8xhXozD3kvO85xvM0w/s72-c-d/zeppelin-driver3-geo-points.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-234608211915646986</id><published>2020-07-28T18:00:00.005+02:00</published><updated>2020-10-16T10:26:04.273+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="astra"/><category scheme="http://www.blogger.com/atom/ns#" term="cassandra"/><category scheme="http://www.blogger.com/atom/ns#" term="databricks"/><category scheme="http://www.blogger.com/atom/ns#" term="datastax"/><category 
scheme="http://www.blogger.com/atom/ns#" term="spark"/><title type='text'>Working with DataStax Astra from Databricks platform</title><content type='html'>&lt;p&gt;
One of the notable changes in the &lt;a href=&quot;https://www.datastax.com/blog/2020/05/advanced-apache-cassandra-analytics-now-open-all&quot;&gt;Spark Cassandra Connector (SCC) 2.5.0&lt;/a&gt; is the support for &lt;a href=&quot;https://astra.datastax.com/&quot;&gt;Astra&lt;/a&gt; - DataStax&#39;s Cassandra as a Service offering.  Having managed Cassandra makes it very easy to start development of the applications - you can create a new database in a couple of minutes.  &lt;a href=&quot;https://databricks.com/&quot;&gt;Databricks&lt;/a&gt; is also well known for its Spark-based unified cloud data processing platform.  Both Databricks &amp;amp; DataStax offer the free tier, and this combination is an ideal ground for prototypes.  This short blog describes how to work with Astra from the Databricks, using free tiers in both cases. 
&lt;/p&gt;

&lt;p&gt;
To get access to Astra from Databricks, we need the following:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;running instance of Astra database - if you don&#39;t have it, it&#39;s easy to create - just &lt;a href=&quot;https://astra.datastax.com/&quot; target=&quot;_blank&quot;&gt;login to Astra&lt;/a&gt;, and press &quot;Create New Database&quot;&lt;br /&gt;&lt;/li&gt;
&lt;li&gt;credentials (username &amp;amp; password) specified when creating database&lt;/li&gt;
&lt;li&gt;secure connect bundle that will be used to establish connection to Astra - this bundle could be downloaded from the database&#39;s main page, and need to be uploaded to DBFS (Databricks File System), so it will be available to Spark&lt;/li&gt;
&lt;li&gt;Spark cluster configured to use secure connect bundle, together with other parameters - credentials, etc.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
First we need to upload the secure connect bundle to DBFS.  The easiest way to do it is to go to &quot;Data&quot;, click the &quot;Add Data&quot; button, and use the &quot;Upload File&quot; form.  After the file is uploaded, remember the path to the uploaded file (like, &lt;code&gt;/FileStore/tables/secure_connect_aott.zip&lt;/code&gt;) - we&#39;ll need it in the next steps.
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhvhwvEXkWFIjT5Fy13O7Jy9Na6ZFkrAKqs-yOO4qDFIvqTpVixu158hYoQZrrEeIl2Hn8CVIq3Uq8cbErfBnkaWSgR_17I7PU0jWfdLnQBmvJYFlkpRJXNisX2QXhiaXeFBTwTKA/s721/astra-databricks-upload-bundle.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;497&quot; data-original-width=&quot;721&quot; height=&quot;431&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhvhwvEXkWFIjT5Fy13O7Jy9Na6ZFkrAKqs-yOO4qDFIvqTpVixu158hYoQZrrEeIl2Hn8CVIq3Uq8cbErfBnkaWSgR_17I7PU0jWfdLnQBmvJYFlkpRJXNisX2QXhiaXeFBTwTKA/w625-h431/astra-databricks-upload-bundle.png&quot; width=&quot;625&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
Then we need to create a Spark cluster.  Go to &quot;Clusters&quot; and click &quot;Create Cluster&quot;. Select runtime, either Spark 2.4, or Spark 3.0 - depending on the version selected we&#39;ll need to use different versions of Spark Cassandra Connector.  Click &quot;Spark&quot; link and enter configuration parameters there:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;&lt;code&gt;spark.cassandra.auth.username&lt;/code&gt; - user name to connect to database (&lt;code&gt;test&lt;/code&gt; in my case)&lt;/li&gt;
&lt;li&gt;&lt;code&gt;spark.cassandra.auth.password&lt;/code&gt; - password for user (&lt;code&gt;123456&lt;/code&gt; in my case)&lt;/li&gt;
&lt;li&gt;&lt;code&gt;spark.cassandra.connection.config.cloud.path&lt;/code&gt; - path to the uploaded file with secure connect bundle (&lt;code&gt;dbfs:/FileStore/tables/secure_connect_aott.zip&lt;/code&gt;)&lt;/li&gt;
&lt;li&gt;&lt;code&gt;spark.dse.continuousPagingEnabled&lt;/code&gt; with value &lt;code&gt;false&lt;/code&gt; - this is a workaround for &lt;a href=&quot;https://datastax-oss.atlassian.net/browse/SPARKC-602&quot;&gt;SPARKC-602&lt;/a&gt; that we need to apply right now to avoid errors when reading data from Astra&lt;/li&gt;
&lt;/ul&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiDjWTi_JrJTfe16M0wKdfsxP0-j03yH_Agwx5zpAIRkXizG5pL7PE3hx-rHr4DBlYIYtARZ6znXQayeNUutOd8Rrah5kb65R3S7kK8iZzQ4WNe6oSIFKb1QjZy88HSwcoQlLv0_A/s847/astra-databricks-create-cluster.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;607&quot; data-original-width=&quot;847&quot; height=&quot;458&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiDjWTi_JrJTfe16M0wKdfsxP0-j03yH_Agwx5zpAIRkXizG5pL7PE3hx-rHr4DBlYIYtARZ6znXQayeNUutOd8Rrah5kb65R3S7kK8iZzQ4WNe6oSIFKb1QjZy88HSwcoQlLv0_A/w640-h458/astra-databricks-create-cluster.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
After entering all this data, press &quot;Create cluster&quot; - this will take you to the list of your running clusters.  If you point a mouse to the instance that is creating right now, you can see several links, like &quot;Libraries / Spark UI / Logs&quot; - we need to select &quot;Libraries&quot; to add Spark Cassandra Connector to a cluster.   In the opened page, click &quot;Install New&quot; - this will open the dialog for addition of the library.  Select the &quot;Maven&quot; tab.   Because we have dependency conflict between SCC and Databricks runtime we must not use the &lt;code&gt;spark-cassandra-connector&lt;/code&gt; artifact, but the assembly version of it: &lt;code&gt;spark-cassandra-connector-assembly&lt;/code&gt; (see &lt;a href=&quot;https://datastax-oss.atlassian.net/browse/SPARKC-601&quot;&gt;SPARKC-601&lt;/a&gt; for details).   For runtime version 6.x we need to use &lt;code&gt;com.datastax.spark:spark-cassandra-connector-assembly_2.11:2.5.1&lt;/code&gt;, and for runtime 7.0 we need to take &lt;code&gt;com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.0.0-beta&lt;/code&gt; (or released version when it&#39;s done).  Click &quot;Install&quot; to add the library to a cluster. (Please note that to use &lt;code&gt;SparkCassandraExtensions&lt;/code&gt;, for DirectJoin, for example, you need to have a &lt;a href=&quot;https://docs.databricks.com/clusters/init-scripts.html&quot;&gt;cluster init script&lt;/a&gt; in place, that should copy the assembly before the driver &amp;amp; executor will start...)&lt;br /&gt;&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjOZ7H7FwkBos4LoXJ6tOAXOWiDbuR_FKNL6-bAbQbXkY82smEH1cQpjRCnaIKsKLvxtJOzhWVixA4dBSeSNV_IO9mi5sUhlrpq_I-k22gDJoawUW0eUSTeDeDiPoA3r5MHFPnISg/s1122/astra-databricks-add-library.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;515&quot; data-original-width=&quot;1122&quot; height=&quot;294&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjOZ7H7FwkBos4LoXJ6tOAXOWiDbuR_FKNL6-bAbQbXkY82smEH1cQpjRCnaIKsKLvxtJOzhWVixA4dBSeSNV_IO9mi5sUhlrpq_I-k22gDJoawUW0eUSTeDeDiPoA3r5MHFPnISg/w640-h294/astra-databricks-add-library.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
To make this blog post self contained and not dependent on the previously created tables &amp;amp; loaded data, let&#39;s generate test data, create a table using SCC, and write data into that table:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; org.apache.spark.sql.cassandra.&lt;span style=&quot;color: #a020f0;&quot;&gt;_&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; com.datastax.spark.connector.&lt;span style=&quot;color: #a020f0;&quot;&gt;_&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; com.datastax.spark.connector.cql.&lt;span style=&quot;color: darkcyan;&quot;&gt;ClusteringColumn&lt;/span&gt;

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;newData&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;1000&lt;/span&gt;)
  .select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;), $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;), 
          $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;), $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;string&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;str&quot;&lt;/span&gt;))

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;ksName&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;tableName&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;newdata&quot;&lt;/span&gt;
newData.createCassandraTableEx(ksName, tableName, 
                               partitionKeyColumns &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;Seq&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;), 
                               clusteringKeyColumns &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;Seq&lt;/span&gt;(
                                  (&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;ClusteringColumn&lt;/span&gt;.&lt;span style=&quot;color: darkcyan;&quot;&gt;Ascending&lt;/span&gt;), 
                                  (&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;ClusteringColumn&lt;/span&gt;.&lt;span style=&quot;color: darkcyan;&quot;&gt;Descending&lt;/span&gt;)), 
                               ifNotExists &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;true&lt;/span&gt;)
newData.write.cassandraFormat(tableName, ksName).mode(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;append&quot;&lt;/span&gt;).save
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
This code will generate a dataframe with 999 rows, create a table with the following structure, and write data into it:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-cql&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;CREATE TABLE&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;test.newdata&lt;/span&gt; (
    pk &lt;span style=&quot;color: forestgreen;&quot;&gt;bigint&lt;/span&gt;,
    c1 &lt;span style=&quot;color: forestgreen;&quot;&gt;int&lt;/span&gt;,
    c2 &lt;span style=&quot;color: forestgreen;&quot;&gt;int&lt;/span&gt;,
    str &lt;span style=&quot;color: forestgreen;&quot;&gt;text&lt;/span&gt;,
    &lt;span style=&quot;color: #a020f0;&quot;&gt;PRIMARY KEY&lt;/span&gt; (pk, c1, c2)
) &lt;span style=&quot;color: #a020f0;&quot;&gt;WITH CLUSTERING ORDER BY&lt;/span&gt; (c1 &lt;span style=&quot;color: #a020f0;&quot;&gt;ASC&lt;/span&gt;, c2 &lt;span style=&quot;color: #a020f0;&quot;&gt;DESC&lt;/span&gt;);
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
To check that the data was written correctly, let&#39;s read it into another dataframe, print its schema, and count the number of rows:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;data&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.read.cassandraFormat(tableName, ksName).load
data.printSchema
data.count
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
as expected, this will print schema as:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;root
 |-- pk: long (nullable = false)
 |-- c1: integer (nullable = false)
 |-- c2: integer (nullable = false)
 |-- str: string (nullable = true)
&lt;/pre&gt;

&lt;p&gt;
and output the number of rows - as expected, it&#39;s 999.
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgQ6QwpIUF3TswLfAeAnr5ABPa6r0nMAnD1LJJag3duro0vV4ikZ5vuc4KegcOFsR13WNjakqY7Nd2KOcQAiy4jKivQIRX1pDZ0kwA8cnvOpFanomuIsAZMRNFIsNHctn5VZTBEUQ/s969/astra-databricks-notebook.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;938&quot; data-original-width=&quot;969&quot; height=&quot;620&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgQ6QwpIUF3TswLfAeAnr5ABPa6r0nMAnD1LJJag3duro0vV4ikZ5vuc4KegcOFsR13WNjakqY7Nd2KOcQAiy4jKivQIRX1pDZ0kwA8cnvOpFanomuIsAZMRNFIsNHctn5VZTBEUQ/w640-h620/astra-databricks-notebook.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
That&#39;s all!  You can continue to use Spark Cassandra Connector to work with data in Astra using either Dataframe, or RDD APIs - all functionality is the same, including &lt;a href=&quot;https://alexott.blogspot.com/2020/07/spark-effective-joins-with-cassandra.html&quot;&gt;joins with Cassandra&lt;/a&gt;, writing streaming data into it, etc.  See &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/tree/b2.5/doc&quot;&gt;Spark Cassandra Connector documentation&lt;/a&gt; for more information.
&lt;/p&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/234608211915646986/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/234608211915646986' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/234608211915646986'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/234608211915646986'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/07/working-with-datastax-astra-from.html' title='Working with DataStax Astra from Databricks platform'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhvhwvEXkWFIjT5Fy13O7Jy9Na6ZFkrAKqs-yOO4qDFIvqTpVixu158hYoQZrrEeIl2Hn8CVIq3Uq8cbErfBnkaWSgR_17I7PU0jWfdLnQBmvJYFlkpRJXNisX2QXhiaXeFBTwTKA/s72-w625-h431-c/astra-databricks-upload-bundle.png" height="72" width="72"/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-29732542745204182</id><published>2020-07-27T15:17:00.005+02:00</published><updated>2020-08-24T22:48:44.554+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="cassandra"/><category scheme="http://www.blogger.com/atom/ns#" term="spark"/><title type='text'>Spark &amp; efficient joins with Cassandra</title><content type='html'>&lt;p&gt;
In modern data processing, especially when handling streaming data, quite often there is a need for enrichment of data coming from external sources.  High-level diagram for such data processing usually looks as following:
&lt;/p&gt;

&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjY6cF3M0n8ZteH4EextITAHEN8kGvn6rfEkRvoL3zy-4oguOIQh-o3DzZZJnKtu1eLMcBOI2JqGzSJOY9ZGB8DTVC7ygHNdxyzaT04KDVC5CY3f9Pq6L7au7ibs_4DQfj40H6O8g/s579/data-enrichment.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;176&quot; data-original-width=&quot;579&quot; height=&quot;189&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjY6cF3M0n8ZteH4EextITAHEN8kGvn6rfEkRvoL3zy-4oguOIQh-o3DzZZJnKtu1eLMcBOI2JqGzSJOY9ZGB8DTVC7ygHNdxyzaT04KDVC5CY3f9Pq6L7au7ibs_4DQfj40H6O8g/w625-h189/data-enrichment.png&quot; width=&quot;625&quot; /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;&lt;/p&gt;

&lt;p&gt;
For effective enrichment of the data, the database that holds that additional information should provide low-latency, high throughput access to the data.  And Apache Cassandra with its very fast reads by primary key, is the ideal candidate for storing the data that will be used for enrichment (maybe in addition to being a storage of processed data).
&lt;/p&gt;

&lt;p&gt;
Apache Spark is often a base for implementation of the data processing pipelines, for both batch &amp;amp; streaming data.  And it has very good support for Cassandra provided by the &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector&quot;&gt;Spark Cassandra Connector (SCC)&lt;/a&gt;.  Connector provides access to Cassandra via both RDD &amp;amp; Dataframe APIs, and recently released SCC 2.5 &lt;a href=&quot;https://www.datastax.com/blog/2020/05/advanced-apache-cassandra-analytics-now-open-all&quot;&gt;added a lot of the new functionality&lt;/a&gt; that earlier was available only as a part of &lt;a href=&quot;https://www.datastax.com/products/datastax-enterprise&quot;&gt;DataStax Enterprise&lt;/a&gt;, including support for effective joins with Cassandra for dataframes.
&lt;/p&gt;

&lt;p&gt;
Spark Cassandra Connector has optimizations for executing join of dataframe or RDD with data in Cassandra - data that is used to join are converted into requests to individual partitions or rows that are executed in parallel, avoiding the full scan of the data in Cassandra (there are settings that define the thresholds when SCC will decide to do a full scan vs individual requests). Russell Spitzer has a &lt;a href=&quot;http://www.russellspitzer.com/2018/05/23/DSEDirectJoin/&quot;&gt;very good blog post&lt;/a&gt; about joins in dataframes, including information about its performance.  SCC allows to perform either inner join, or left join between RDD/dataframe and Cassandra.  One of the useful things when performing joins is that it reflects changes done in Cassandra, so you can always have access to the latest data. You can also use caching on data in Cassandra to avoid hitting Cassandra on every join, and periodically refresh the cached version to pull the latest changes.
&lt;/p&gt;

&lt;p&gt;
We&#39;ll use the following table definition and data in most of the examples shown below:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-cql&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;create table&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;test.jtest1&lt;/span&gt; (
  pk &lt;span style=&quot;color: forestgreen;&quot;&gt;int&lt;/span&gt;,
  c1 &lt;span style=&quot;color: forestgreen;&quot;&gt;int&lt;/span&gt;,
  c2 &lt;span style=&quot;color: forestgreen;&quot;&gt;int&lt;/span&gt;,
  v  &lt;span style=&quot;color: forestgreen;&quot;&gt;text&lt;/span&gt;,
  &lt;span style=&quot;color: #a020f0;&quot;&gt;primary key&lt;/span&gt;(pk, c1, c2));
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (1, 1, 1, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t1-1-1&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (1, 1, 2, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t1-1-2&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (1, 2, 1, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t1-2-1&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (1, 2, 2, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t1-2-2&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (2, 1, 1, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t2-1-1&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (2, 1, 2, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t2-1-2&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (2, 2, 1, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t2-2-1&#39;&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.jtest1(pk, c1, c2, v) &lt;span style=&quot;color: #a020f0;&quot;&gt;values&lt;/span&gt; (2, 2, 2, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;t2-2-2&#39;&lt;/span&gt;);
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
The join condition could be on:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;full partition key (&lt;code&gt;pk&lt;/code&gt; column) - in this case, SCC will pull all rows from that partition and create N rows for each input row&lt;/li&gt;
&lt;li&gt;partial primary key - all preceding clustering columns must be specified, for example (&lt;code&gt;pk&lt;/code&gt; + &lt;code&gt;c1&lt;/code&gt; columns) - SCC will pull all rows that match the given range query, and create that many rows for each input row&lt;/li&gt;
&lt;li&gt;full primary key (&lt;code&gt;pk&lt;/code&gt; + &lt;code&gt;c1&lt;/code&gt; + &lt;code&gt;c2&lt;/code&gt;) - in this case SCC will pull only one row, if it exists, and use that data for joining&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
The join isn&#39;t supported on following:
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;partial partition key in case of composite partition key&lt;/li&gt;
&lt;li&gt;on the clustering columns that are not preceded by previous clustering columns, for example, &lt;code&gt;pk&lt;/code&gt; + &lt;code&gt;c2&lt;/code&gt; without &lt;code&gt;c1&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;other join types, like, &lt;code&gt;right&lt;/code&gt;, or &lt;code&gt;full&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;
In such cases, depending on the API, SCC either throws an error, or falls back to performing a full scan of the Cassandra table and executing the join on the Spark side.
&lt;/p&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-orgcb57400&quot;&gt;
&lt;h4 id=&quot;orgcb57400&quot;&gt;&lt;span class=&quot;section-number-4&quot;&gt;&lt;/span&gt;Joins in Dataframe API&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-1&quot;&gt;
&lt;p&gt;
Let&#39;s start with the &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/14_data_frames.md&quot;&gt;Dataframe API&lt;/a&gt; that is recommended to use in modern Spark.  Support for effective joins of dataframes with data in Cassandra for a long time was only available in DSE Analytics - commercial distribution of Cassandra and Spark developed by DataStax, and open source version of SCC had support for joins only in RDD API.  But with release of &lt;a href=&quot;https://www.datastax.com/blog/2020/05/advanced-apache-cassandra-analytics-now-open-all&quot;&gt;SCC 2.5&lt;/a&gt;, join of dataframes also became available for all users of the open source version of SCC.
&lt;/p&gt;

&lt;p&gt;
Please note that this functionality is not enabled by default (together with some other, like, support for &lt;code&gt;ttl&lt;/code&gt; and &lt;code&gt;writetime&lt;/code&gt; functions).  You need to &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/14_data_frames.md#special-cassandra-catalyst-rules-since-scc-25&quot;&gt;enable special Catalyst rules&lt;/a&gt; by setting configuration parameter &lt;code&gt;spark.sql.extensions&lt;/code&gt; to a value &lt;code&gt;com.datastax.spark.connector.CassandraSparkExtensions&lt;/code&gt; when starting Spark shell, or submitting a job.  Something like this:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-shell&quot;&gt;bin/spark-shell --packages com.datastax.spark:spark-cassandra-connector_2.11:2.5.1 &lt;span style=&quot;color: #8b2252;&quot;&gt;\&lt;/span&gt;
   --conf spark.sql.extensions=com.datastax.spark.connector.CassandraSparkExtensions
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
If you forget to do this, SCC won&#39;t optimize your join, and it will be performed as usual - by reading all data from Cassandra, and executing join in Spark (with shuffle!).  You can always check that this optimization is applied by running &lt;code&gt;dataframe.explain&lt;/code&gt;, and looking for a string &quot;Cassandra Direct Join&quot; in the physical plan - we&#39;ll see that in the examples below (if you&#39;re running code on the DSE Analytics, it will be &quot;DSE Direct Join&quot;).
&lt;/p&gt;

&lt;p&gt;
Let&#39;s look at specific examples of performing joins of dataframe with data in Cassandra.  We&#39;re using the &lt;code&gt;test.jtest1&lt;/code&gt; defined above to demonstrate the behaviour when we&#39;re using only partition key, and complete or partial primary key.  All dataframe examples have following code in common:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; spark.implicits.&lt;span style=&quot;color: #a020f0;&quot;&gt;_&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; org.apache.spark.sql.cassandra.&lt;span style=&quot;color: #a020f0;&quot;&gt;_&lt;/span&gt;

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;cassdata&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.read.cassandraFormat(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;).load
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
We&#39;re starting with a partition key only. For that we generate the dataframe with one column with name &lt;code&gt;id&lt;/code&gt; and values from one to four:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
After its execution, we can check that SCC optimized this join by executing &lt;code&gt;explain&lt;/code&gt;:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
Cassandra Direct Join [pk = id#2] test.jtest1 - Reading (pk, c1, c2, v) Pushed {}
+- *(1) Project [cast(id#0L as int) AS id#2]
   +- *(1) Range (1, 5, step=1, splits=8)
&lt;/pre&gt;

&lt;p&gt;
and we check that we have correct data pulled from Cassandra.  We can see that SCC pulled all rows from partitions &lt;code&gt;1&lt;/code&gt; and &lt;code&gt;2&lt;/code&gt; and converted them into individual rows in the resulting dataframe:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.count
res1: Long = 8

scala&amp;gt; joined.show
+---+---+---+---+------+
| id| pk| c1| c2|     v|
+---+---+---+---+------+
|  1|  1|  1|  1|t1-1-1|
|  1|  1|  1|  2|t1-1-2|
|  1|  1|  2|  1|t1-2-1|
|  1|  1|  2|  2|t1-2-2|
|  2|  2|  1|  1|t2-1-1|
|  2|  2|  1|  2|t2-1-2|
|  2|  2|  2|  1|t2-2-1|
|  2|  2|  2|  2|t2-2-2|
+---+---+---+---+------+
&lt;/pre&gt;

&lt;p&gt;
Handling of the partial primary key is similar - we&#39;re generating a similar dataframe but with 2 columns, and joining it:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
  .select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;, $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;) 
   &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
Again we see that SCC optimized that query:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
Cassandra Direct Join [pk = id#30, c1 = cc1#32] test.jtest1 - Reading (pk, c1, c2, v) Pushed {}
+- *(1) Project [cast(id#28L as int) AS id#30, cast(id#28L as int) AS cc1#32]
   +- *(1) Range (1, 5, step=1, splits=8)

scala&amp;gt; joined.count
res8: Long = 4

scala&amp;gt; joined.show
+---+---+---+---+---+------+
| id|cc1| pk| c1| c2|     v|
+---+---+---+---+---+------+
|  1|  1|  1|  1|  1|t1-1-1|
|  1|  1|  1|  1|  2|t1-1-2|
|  2|  2|  2|  2|  1|t2-2-1|
|  2|  2|  2|  2|  2|t2-2-2|
+---+---+---+---+---+------+
&lt;/pre&gt;

&lt;p&gt;
And with full primary key behaviour is the same:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
  .select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;, $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;), $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;) 
   &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;) 
   &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
We have a one-to-one correspondence of rows from the generated dataframe with rows in Cassandra:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
Cassandra Direct Join [pk = id#318, c1 = cc1#320, c2 = cc2#321] test.jtest1 - Reading (pk, c1, c2, v) Pushed {}
+- *(1) Project [cast(id#316L as int) AS id#318, cast(id#316L as int) AS cc1#320, cast(id#316L as int) AS cc2#321]
   +- *(1) Range (1, 5, step=1, splits=8)

scala&amp;gt; joined.count
res13: Long = 2

scala&amp;gt; joined.show
+---+---+---+---+---+---+------+
| id|cc1|cc2| pk| c1| c2|     v|
+---+---+---+---+---+---+------+
|  1|  1|  1|  1|  1|  1|t1-1-1|
|  2|  2|  2|  2|  2|  2|t2-2-2|
+---+---+---+---+---+---+------+
&lt;/pre&gt;

&lt;p&gt;
Left join isn&#39;t much different - we only need to explicitly specify it with the &quot;left&quot; or &quot;leftouter&quot; argument:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
  .select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;, $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;), $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;) 
       &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;) 
       &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;), 
   &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;left&quot;&lt;/span&gt;)
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And again, we see that SCC optimized the query.  The only difference is that it retains the rows for which we didn&#39;t find matching rows in Cassandra:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
Cassandra Direct Join [pk = id#349, c1 = cc1#351, c2 = cc2#352] test.jtest1 - Reading (pk, c1, c2, v) Pushed {}
+- *(1) Project [cast(id#347L as int) AS id#349, cast(id#347L as int) AS cc1#351, cast(id#347L as int) AS cc2#352]
   +- *(1) Range (1, 5, step=1, splits=8)

scala&amp;gt; joined.count
res5: Long = 4

scala&amp;gt; joined.show
+---+---+---+----+----+----+------+
| id|cc1|cc2|  pk|  c1|  c2|     v|
+---+---+---+----+----+----+------+
|  1|  1|  1|   1|   1|   1|t1-1-1|
|  2|  2|  2|   2|   2|   2|t2-2-2|
|  3|  3|  3|null|null|null|  null|
|  4|  4|  4|null|null|null|  null|
+---+---+---+----+----+----+------+
&lt;/pre&gt;

&lt;p&gt;
But if we try to perform right or full join:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;) 
       &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc1&quot;&lt;/span&gt;) 
       &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;), 
   &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;right&quot;&lt;/span&gt;)
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
then we&#39;ll see that it&#39;s executed by reading the data from the whole table, and performing the join on the Spark level (this is an example for the &quot;right&quot; join; the plan for the &quot;full&quot; join looks slightly different):
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
*(2) BroadcastHashJoin [id#56, cc1#58, cc2#59], [pk#4, c1#5, c2#6], RightOuter, BuildLeft
:- BroadcastExchange HashedRelationBroadcastMode(List(input[0, int, false], input[1, int, false], input[2, int, false]))
:  +- *(1) Project [cast(id#54L as int) AS id#56, cast(id#54L as int) AS cc1#58, cast(id#54L as int) AS cc2#59]
:     +- *(1) Range (1, 5, step=1, splits=8)
+- *(2) Scan org.apache.spark.sql.cassandra.CassandraSourceRelation [pk#4,c1#5,c2#6,v#7] PushedFilters: [], ReadSchema: struct&amp;lt;pk:int,c1:int,c2:int,v:string&amp;gt;

scala&amp;gt; joined.show
+----+----+----+---+---+---+------+
|  id| cc1| cc2| pk| c1| c2|     v|
+----+----+----+---+---+---+------+
|   1|   1|   1|  1|  1|  1|t1-1-1|
|null|null|null|  1|  1|  2|t1-1-2|
|null|null|null|  1|  2|  1|t1-2-1|
|null|null|null|  1|  2|  2|t1-2-2|
|null|null|null|  2|  1|  1|t2-1-1|
|null|null|null|  2|  1|  2|t2-1-2|
|null|null|null|  2|  2|  1|t2-2-1|
|   2|   2|   2|  2|  2|  2|t2-2-2|
+----+----+----+---+---+---+------+
&lt;/pre&gt;

&lt;p&gt;
As mentioned above, in case of a partial primary key, all preceding clustering columns need to be specified in the joining condition as well.  If we don&#39;t do that, like in this example that joins on the partition key &amp;amp; the second clustering column:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
  .select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;, $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;) 
  &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
then we&#39;ll get an error saying that we can&#39;t do that (please note that the error will be thrown only when reading the data, not when you&#39;re declaring the join):
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.show
java.lang.IllegalArgumentException: Can&#39;t pushdown join on column ColumnDef(c2,ClusteringColumn(1,ASC),IntType) without also specifying [ Set(ColumnDef(c1,ClusteringColumn(0,ASC),IntType)) ]
  at com.datastax.spark.connector.rdd.AbstractCassandraJoin$class.checkValidJoin(AbstractCassandraJoin.scala:114)
  at com.datastax.spark.connector.rdd.CassandraJoinRDD.checkValidJoin(CassandraJoinRDD.scala:26)
  at com.datastax.spark.connector.rdd.AbstractCassandraJoin$class.getPartitions(AbstractCassandraJoin.scala:210)
  at com.datastax.spark.connector.rdd.CassandraJoinRDD.getPartitions(CassandraJoinRDD.scala:26)
&lt;/pre&gt;

&lt;p&gt;
If you still want to do it, then you need to set &lt;code&gt;directJoinSetting&lt;/code&gt; to &lt;code&gt;off&lt;/code&gt; when reading the data, like this:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;cassdata&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.read.cassandraFormat(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;)
  .option(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;directJoinSetting&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;off&quot;&lt;/span&gt;).load
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
  .select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;, $&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.join(cassdata, cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;) 
  &amp;amp;&amp;amp; cassdata(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;) === toJoin(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;cc2&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
and this will force SCC to perform full table scan:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
*(2) BroadcastHashJoin [id#195, cc2#197], [pk#185, c2#187], Inner, BuildLeft
:- BroadcastExchange HashedRelationBroadcastMode(List((shiftleft(cast(input[0, int, false] as bigint), 32) | (cast(input[1, int, false] as bigint) &amp;amp; 4294967295))))
:  +- *(1) Project [cast(id#193L as int) AS id#195, cast(id#193L as int) AS cc2#197]
:     +- *(1) Range (1, 5, step=1, splits=8)
+- *(2) Scan org.apache.spark.sql.cassandra.CassandraSourceRelation [pk#185,c1#186,c2#187,v#188] PushedFilters: [], ReadSchema: struct&amp;lt;pk:int,c1:int,c2:int,v:string&amp;gt;

scala&amp;gt; joined.show
+---+---+---+---+---+------+
| id|cc2| pk| c1| c2|     v|
+---+---+---+---+---+------+
|  1|  1|  1|  1|  1|t1-1-1|
|  1|  1|  1|  2|  1|t1-2-1|
|  2|  2|  2|  1|  2|t2-1-2|
|  2|  2|  2|  2|  2|t2-2-2|
+---+---+---+---+---+------+
&lt;/pre&gt;

&lt;p&gt;
Theoretically direct join should also work for Spark SQL, like this:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.range(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).select($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;.cast(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;int&quot;&lt;/span&gt;).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;id&quot;&lt;/span&gt;))
toJoin.createOrReplaceTempView(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;tojoin&quot;&lt;/span&gt;)

spark.sql(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;&quot;&quot;CREATE OR REPLACE TEMPORARY VIEW cassdata&lt;/span&gt;
&lt;span style=&quot;color: #8b2252;&quot;&gt;  USING org.apache.spark.sql.cassandra&lt;/span&gt;
&lt;span style=&quot;color: #8b2252;&quot;&gt;  OPTIONS (table &quot;jtest1&quot;, keyspace &quot;test&quot;, pushdown &quot;true&quot;, directJoinSetting &quot;auto&quot;)&quot;&quot;&quot;&lt;/span&gt;)
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.sql(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;select * from tojoin tj inner join cassdata cd on tj.id = cd.pk&quot;&lt;/span&gt;)
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
but if we look into execution plan, we can see that it doesn&#39;t happen:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.explain
== Physical Plan ==
*(2) BroadcastHashJoin [id#552], [pk#554], Inner, BuildLeft
:- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)))
:  +- *(1) Project [cast(id#550L as int) AS id#552]
:     +- *(1) Range (1, 5, step=1, splits=8)
+- *(2) Scan org.apache.spark.sql.cassandra.CassandraSourceRelation [pk#554,c1#555,c2#556,v#557] PushedFilters: [], ReadSchema: struct&amp;lt;pk:int,c1:int,c2:int,v:string&amp;gt;
&lt;/pre&gt;

&lt;p&gt;
This is investigated as &lt;a href=&quot;https://datastax-oss.atlassian.net/browse/SPARKC-613&quot;&gt;SPARKC-613&lt;/a&gt;, and hopefully will be fixed.
&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-orgc69cb7b&quot;&gt;
&lt;h4 id=&quot;orgc69cb7b&quot;&gt;&lt;span class=&quot;section-number-4&quot;&gt;&lt;/span&gt;Joins in RDD API&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-2&quot;&gt;
&lt;p&gt;
For a long time, the RDD API was the only way to perform effective joins with data in Cassandra.   Spark Cassandra Connector &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/2_loading.md#using-joinwithcassandratable&quot;&gt;provides two functions for performing joins&lt;/a&gt;: &lt;code&gt;joinWithCassandraTable&lt;/code&gt; and &lt;code&gt;leftJoinWithCassandraTable&lt;/code&gt; - they have existed in SCC for a long time (since version 1.2, released more than 5 years ago).  When executed, both functions return an instance of a special RDD type: &lt;code&gt;CassandraJoinRDD&lt;/code&gt; - it has all functions of the &lt;code&gt;CassandraRDD&lt;/code&gt; API, plus one function (&lt;code&gt;.on&lt;/code&gt;) that specifies the list of columns on which the join should be performed.
&lt;/p&gt;

&lt;p&gt;
Let&#39;s re-implement the same examples as above but with RDD API.  We&#39;re starting with partition key only:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;import&lt;/span&gt; com.datastax.spark.connector.&lt;span style=&quot;color: #a020f0;&quot;&gt;_&lt;/span&gt;

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt; until &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;Tuple1&lt;/span&gt;(x.toInt))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.joinWithCassandraTable(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
Please note that we need to explicitly create &lt;code&gt;Tuple1&lt;/code&gt; objects, as &lt;code&gt;joinWithCassandraTable&lt;/code&gt; expects an RDD of tuples.  We can check that we got the correct RDD type as the result of execution:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.toDebugString
res21: String =
(8) CassandraJoinRDD[150] at RDD at CassandraRDD.scala:18 []
 |  ParallelCollectionRDD[147] at parallelize at &amp;lt;console&amp;gt;:33 []
&lt;/pre&gt;

&lt;p&gt;
The type of &lt;code&gt;joined&lt;/code&gt; is &lt;code&gt;CassandraJoinRDD[(Int,),CassandraRow]&lt;/code&gt;, and we can check that with &lt;code&gt;collect&lt;/code&gt;:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res22: Array[((Int,), com.datastax.spark.connector.CassandraRow)] = Array(
 ((1,),CassandraRow{pk: 1, c1: 1, c2: 1, v: t1-1-1}), 
 ((1,),CassandraRow{pk: 1, c1: 1, c2: 2, v: t1-1-2}), 
 ((1,),CassandraRow{pk: 1, c1: 2, c2: 1, v: t1-2-1}), 
 ((1,),CassandraRow{pk: 1, c1: 2, c2: 2, v: t1-2-2}), 
 ((2,),CassandraRow{pk: 2, c1: 1, c2: 1, v: t2-1-1}), 
 ((2,),CassandraRow{pk: 2, c1: 1, c2: 2, v: t2-1-2}), 
 ((2,),CassandraRow{pk: 2, c1: 2, c2: 1, v: t2-2-1}), 
 ((2,),CassandraRow{pk: 2, c1: 2, c2: 2, v: t2-2-2})
)
&lt;/pre&gt;

&lt;p&gt;
We can access data in &lt;code&gt;CassandraRow&lt;/code&gt; using the standard functions, like, &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/2_loading.md#reading-primitive-column-values&quot;&gt;getInt, getString, get, …&lt;/a&gt;, but it&#39;s not always handy.  To simplify work with results, both functions &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/4_mapper.md&quot;&gt;support mapping of the rows into tuples or into instances of specific (case) classes&lt;/a&gt;, that are much easier to use when doing processing of obtained data:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;CassData&lt;/span&gt;(pk&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;, c1&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;, c2&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;, v&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;)
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.joinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;CassData&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
and we&#39;ll get data in Cassandra mapped into our case class:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res23: Array[((Int,), CassData)] = Array(
 ((1,),CassData(1,1,1,t1-1-1)),
 ((1,),CassData(1,1,2,t1-1-2)),
 ((1,),CassData(1,2,1,t1-2-1)),
 ((1,),CassData(1,2,2,t1-2-2)),
 ((2,),CassData(2,1,1,t2-1-1)),
 ((2,),CassData(2,1,2,t2-1-2)),
 ((2,),CassData(2,2,1,t2-2-1)),
 ((2,),CassData(2,2,2,t2-2-2))
)
&lt;/pre&gt;

&lt;p&gt;
With a partial primary key, the behaviour is similar - create a two-element tuple, and call &lt;code&gt;joinWithCassandraTable&lt;/code&gt;:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt; until &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; (x.toInt, x.toInt))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.joinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;CassData&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
and as expected, we&#39;re getting back four rows:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res28: Array[((Int, Int), CassData)] = Array(
 ((1,1),CassData(1,1,1,t1-1-1)),
 ((1,1),CassData(1,1,2,t1-1-2)),
 ((2,2),CassData(2,2,1,t2-2-1)),
 ((2,2),CassData(2,2,2,t2-2-2))
)
&lt;/pre&gt;

&lt;p&gt;
Similarly, for full primary key:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt; until &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; (x.toInt, x.toInt, x.toInt))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.joinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;CassData&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
that gives us two rows:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res29: Array[((Int, Int, Int), CassData)] = Array(
 ((1,1,1),CassData(1,1,1,t1-1-1)),
 ((2,2,2),CassData(2,2,2,t2-2-2))
)
&lt;/pre&gt;

&lt;p&gt;
Left join is done the same way as inner join, only another function is used:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt; until &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; (x.toInt, x.toInt))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.leftJoinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;CassData&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
But the type of &lt;code&gt;joined&lt;/code&gt; is &lt;code&gt;CassandraLeftJoinRDD&lt;/code&gt; instead of &lt;code&gt;CassandraJoinRDD&lt;/code&gt;, and instead of a &lt;code&gt;CassandraRow&lt;/code&gt; or an instance of our class, we&#39;re getting an &lt;code&gt;Option&lt;/code&gt;, as the data for a given key may be absent in Cassandra:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res38: Array[((Int, Int, Int), Option[CassData])] = Array(
 ((1,1,1),Some(CassData(1,1,1,t1-1-1))),
 ((1,1,1),Some(CassData(1,1,2,t1-1-2))),
 ((2,2,2),Some(CassData(2,2,1,t2-2-1))),
 ((2,2,2),Some(CassData(2,2,2,t2-2-2))),
 ((3,3,3),None),
 ((4,4,4),None)
)
&lt;/pre&gt;

&lt;p&gt;
All examples above used tuples to represent the data to join with.  But we can also use case classes here - we only need to have field names matching the columns in the table, like here (please note that we need to explicitly specify the list of columns, otherwise the functions will use just the partition key - it&#39;s the same as for tuples):
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;ToJoin&lt;/span&gt;(pk&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;, c1&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;)
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt; until &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;ToJoin&lt;/span&gt;(x.toInt, x.toInt))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.leftJoinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;CassData&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;))
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
In this case, it&#39;s easier to work with an instance of the case class instead of a tuple:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res48: Array[(ToJoin, Option[CassData])] = Array(
 (ToJoin(1,1),Some(CassData(1,1,1,t1-1-1))),
 (ToJoin(1,1),Some(CassData(1,1,2,t1-1-2))),
 (ToJoin(2,2),Some(CassData(2,2,1,t2-2-1))),
 (ToJoin(2,2),Some(CassData(2,2,2,t2-2-2))),
 (ToJoin(3,3),None),
 (ToJoin(4,4),None))
&lt;/pre&gt;

&lt;p&gt;
Besides simple usage that was shown above, SCC provides more capabilities in the RDD API.  For example, we can &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/2_loading.md#performing-efficient-joins-with-cassandra-tables-since-12&quot;&gt;repartition RDD that we join with data in Cassandra&lt;/a&gt; to match partitioning of the data in Cassandra, so we can avoid non-local reads from Cassandra.
&lt;/p&gt;

&lt;p&gt;
Also, we can use &lt;a href=&quot;https://github.com/datastax/spark-cassandra-connector/blob/b2.5/doc/2_loading.md#cassandra-operations-on-a-cassandrajoinrdd&quot;&gt;any function of the &lt;code&gt;CassandraRDD&lt;/code&gt; API&lt;/a&gt;, such as, &lt;code&gt;select&lt;/code&gt;, &lt;code&gt;where&lt;/code&gt;, &lt;code&gt;limit&lt;/code&gt;, etc.  For example, we can limit the number of the returned rows by putting the condition onto the &lt;code&gt;c2&lt;/code&gt; column that will be applied to every partition (please note that it should be valid CQL expression!):
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;ToJoin&lt;/span&gt;(pk&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;, c1&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Int&lt;/span&gt;)
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;toJoin&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt; until &lt;span style=&quot;color: darkcyan;&quot;&gt;5&lt;/span&gt;).map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;ToJoin&lt;/span&gt;(x.toInt, x.toInt))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; toJoin.leftJoinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;CassData&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;jtest1&quot;&lt;/span&gt;)
  .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;pk&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c1&quot;&lt;/span&gt;)).where(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;c2 &amp;gt; 1&quot;&lt;/span&gt;)
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
so we get less data than in the previous example:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res55: Array[(ToJoin, Option[CassData])] = Array(
 (ToJoin(1,1),Some(CassData(1,1,2,t1-1-2))),
 (ToJoin(2,2),Some(CassData(2,2,2,t2-2-2))),
 (ToJoin(3,3),None),
 (ToJoin(4,4),None)
)
&lt;/pre&gt;

&lt;p&gt;
The &lt;code&gt;limit(N)&lt;/code&gt; call will return max N rows per Spark partition.  While &lt;code&gt;perPartitionLimit(N)&lt;/code&gt; will return max N rows per Cassandra partition.  This is quite useful, for example, when we&#39;re doing joins with some &quot;historical&quot; data, where we have multiple rows per partition, but usually need to join with the latest entry for a given partition.  For example, we may have a table that contains information about historical stock prices (sorted by timestamp in descending order):
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-cql&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;create table&lt;/span&gt; &lt;span style=&quot;color: blue;&quot;&gt;test.stock_price&lt;/span&gt; (
  ticker &lt;span style=&quot;color: forestgreen;&quot;&gt;text&lt;/span&gt;,
  tm &lt;span style=&quot;color: forestgreen;&quot;&gt;timestamp&lt;/span&gt;,
  price &lt;span style=&quot;color: forestgreen;&quot;&gt;double&lt;/span&gt;,
  &lt;span style=&quot;color: #a020f0;&quot;&gt;primary key&lt;/span&gt;(ticker, tm)
) &lt;span style=&quot;color: #a020f0;&quot;&gt;with clustering order by&lt;/span&gt; (tm &lt;span style=&quot;color: #a020f0;&quot;&gt;desc&lt;/span&gt;);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.stock_price (ticker, tm, price) &lt;span style=&quot;color: #a020f0;&quot;&gt;&lt;br /&gt;  values&lt;/span&gt; (&lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;MSFT&#39;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;2020-07-25T10:00:00Z&#39;&lt;/span&gt;, 100.0);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.stock_price (ticker, tm, price) &lt;span style=&quot;color: #a020f0;&quot;&gt;&lt;br /&gt;  values&lt;/span&gt; (&lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;MSFT&#39;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;2020-07-25T11:00:00Z&#39;&lt;/span&gt;, 101.0);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.stock_price (ticker, tm, price) &lt;span style=&quot;color: #a020f0;&quot;&gt;&lt;br /&gt;  values&lt;/span&gt; (&lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;MSFT&#39;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;2020-07-25T12:00:00Z&#39;&lt;/span&gt;, 99.0);
&lt;span style=&quot;color: #a020f0;&quot;&gt;insert into&lt;/span&gt; test.stock_price (ticker, tm, price) &lt;span style=&quot;color: #a020f0;&quot;&gt;&lt;br /&gt;  values&lt;/span&gt; (&lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;MSFT&#39;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&#39;2020-07-25T13:00:00Z&#39;&lt;/span&gt;, 97.0);
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
For example, we may have data about stocks coming from some source.  In this case we can join the incoming data with the latest prices for the given shares, and perform some calculation:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;StockData&lt;/span&gt;(ticker&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, currentPrice&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Double&lt;/span&gt;)
&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;StockPrice&lt;/span&gt;(ticker&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, tm&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;java&lt;/span&gt;.time.&lt;span style=&quot;color: darkcyan;&quot;&gt;Instant&lt;/span&gt;, price&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Double&lt;/span&gt;)

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;stocks&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; sc.parallelize(&lt;span style=&quot;color: darkcyan;&quot;&gt;Seq&lt;/span&gt;(&lt;span style=&quot;color: darkcyan;&quot;&gt;StockData&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;MSFT&quot;&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;100&lt;/span&gt;), &lt;span style=&quot;color: darkcyan;&quot;&gt;StockData&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;INTC&quot;&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;200&lt;/span&gt;)))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; stocks.leftJoinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;StockPrice&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;stock_price&quot;&lt;/span&gt;)
   .on(&lt;span style=&quot;color: darkcyan;&quot;&gt;SomeColumns&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;ticker&quot;&lt;/span&gt;)).perPartitionLimit(&lt;span style=&quot;color: darkcyan;&quot;&gt;1&lt;/span&gt;)
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
After execution, we can see that we pulled the latest price for Microsoft shares:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;scala&amp;gt; joined.collect
res37: Array[(StockData, Option[StockPrice])] = Array(
  (StockData(MSFT,100.0),Some(StockPrice(MSFT,2020-07-25T13:00:00Z,97.0))), 
  (StockData(INTC,200.0),None)
)
&lt;/pre&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-orgf0c36f0&quot;&gt;
&lt;h4 id=&quot;orgf0c36f0&quot;&gt;Configuration options, optimizations, etc.&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-3&quot;&gt;
&lt;p&gt;
Spark Cassandra Connector has a number of configuration parameters that may affect execution of the joins.  Some of the configuration parameters could be specified globally, via an instance of the &lt;code&gt;ReadConf&lt;/code&gt; class or via &lt;code&gt;option&lt;/code&gt;, while others could be specified only as a table option.  
&lt;/p&gt;

&lt;p&gt;
With the &lt;code&gt;spark.cassandra.concurrent.reads&lt;/code&gt; parameter we can control how many parallel requests will be sent per core when executing a join (default: 512).  For example, we can change it to a lower value if we want to decrease the load on the cluster caused by joins, although this will increase processing time.
&lt;/p&gt;

&lt;p&gt;
Table-only options include (only for Dataframe API!):
&lt;/p&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;&lt;code&gt;directJoinSetting&lt;/code&gt; with possible values &lt;code&gt;on&lt;/code&gt; (always perform direct join), &lt;code&gt;off&lt;/code&gt; (disable direct join), and &lt;code&gt;auto&lt;/code&gt; - when SCC decides about use of direct join based on the specified threshold between size of the data to join, and data in Cassandra (default: &lt;code&gt;auto&lt;/code&gt;)&lt;/li&gt;
&lt;li&gt;&lt;code&gt;directJoinSizeRatio&lt;/code&gt; defines a threshold for switching to full scan (default: &lt;code&gt;0.9&lt;/code&gt;)&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org7ff1a77&quot;&gt;
&lt;h4 id=&quot;org7ff1a77&quot;&gt;&lt;span class=&quot;section-number-4&quot;&gt;&lt;/span&gt;More practical example&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-4&quot;&gt;
&lt;p&gt;
The previous sections showed the basic usage of the joins with data in Cassandra.  This section is trying to show how to perform joins when processing streaming data.  Following project (&lt;a href=&quot;https://github.com/alexott/cassandra-dse-playground/tree/master/cassandra-join-spark&quot;&gt;full source code&lt;/a&gt;) demonstrates the use of joins with Dataframe &amp;amp; RDD APIs to perform enrichment of data coming from Kafka with data from Cassandra.  In our case, we&#39;re getting from Kafka the information about stocks (stock ticker, timestamp, and price), and enrich that data with more information about specific stock, like, full company name, type company, stock exchange, etc.  After enrichment, we just output data to the console, but the code could be adjusted to do something more useful with enriched data.  To run the code, just follow instructions in &lt;a href=&quot;https://github.com/alexott/cassandra-dse-playground/blob/master/cassandra-join-spark/README.md&quot;&gt;README&lt;/a&gt;.  
&lt;/p&gt;

&lt;p&gt;
The &lt;a href=&quot;https://github.com/alexott/cassandra-dse-playground/blob/master/cassandra-join-spark/src/main/scala/com/datastax/alexott/demos/streaming/StockTickersJoinDataFrames.scala&quot;&gt;implementation that uses Spark Structured Streaming&lt;/a&gt; is very straightforward: 
&lt;/p&gt;
&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;get data from Kafka&lt;/li&gt;
&lt;li&gt;decode JSON payload&lt;/li&gt;
&lt;li&gt;create a dataframe for data in Cassandra (if we have a &quot;static&quot; dataset in Cassandra, then we can cache the data so that it will be read only once)&lt;/li&gt;
&lt;li&gt;perform join (we use &lt;code&gt;joined.explain&lt;/code&gt; to check that we got &quot;Cassandra Direct Join&quot;)&lt;/li&gt;
&lt;li&gt;output data to console&lt;/li&gt;
&lt;/ol&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;// &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;1.&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;streamingInputDF&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.readStream
  .format(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka&quot;&lt;/span&gt;)
  .option(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;kafka.bootstrap.servers&quot;&lt;/span&gt;, kafkaServes)
  .option(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;subscribe&quot;&lt;/span&gt;, topicName)
  .load()

&lt;span style=&quot;color: #7f7f7f;&quot;&gt;// &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;2.&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;parsed&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; streamingInputDF.selectExpr(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;CAST(value AS STRING)&quot;&lt;/span&gt;)
  .select(from_json($&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;value&quot;&lt;/span&gt;, schema).as(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;stock&quot;&lt;/span&gt;))
  .select(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;stock.*&quot;&lt;/span&gt;)
  .withColumnRenamed(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;symbol&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;ticker&quot;&lt;/span&gt;)

&lt;span style=&quot;color: #7f7f7f;&quot;&gt;// &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;3.&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;cassandra&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; spark.read
  .format(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;org.apache.spark.sql.cassandra&quot;&lt;/span&gt;)
  .options(&lt;span style=&quot;color: darkcyan;&quot;&gt;Map&lt;/span&gt;(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;table&quot;&lt;/span&gt; -&amp;gt; &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;stock_info&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;keyspace&quot;&lt;/span&gt; -&amp;gt; &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;))
  .load

&lt;span style=&quot;color: #7f7f7f;&quot;&gt;// &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;4.&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; parsed.join(cassandra, cassandra(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;symbol&quot;&lt;/span&gt;) === parsed(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;ticker&quot;&lt;/span&gt;), &lt;span style=&quot;color: #8b2252;&quot;&gt;&lt;br /&gt;  &quot;left&quot;&lt;/span&gt;).drop(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;ticker&quot;&lt;/span&gt;)
joined.explain

&lt;span style=&quot;color: #7f7f7f;&quot;&gt;// &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;5.&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;query&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; joined.writeStream
      .outputMode(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;update&quot;&lt;/span&gt;)
      .format(&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;console&quot;&lt;/span&gt;)
      .start()
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
And when we execute it, we can see the data printed to the console, like this:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;+------------------+--------------------+------+----------+--------+--------------+--------------------+
|             value|            datetime|symbol|base_price|exchange|      industry|                name|
+------------------+--------------------+------+----------+--------+--------------+--------------------+
| 254.5442902345344|2020-07-14 14:03:...|  ADBE|     253.0|  NASDAQ|          TECH|       ADOBE SYSTEMS|
| 66.13761365408801|2020-07-14 14:03:...|   LNC|      66.0|    NYSE|    FINANCIALS|    LINCOLN NATIONAL|
| 37.18736354960266|2020-07-14 14:04:...|   AAL|      37.0|  NASDAQ|TRANSPORTATION|AMERICAN TRANSPOR...|
+------------------+--------------------+------+----------+--------+--------------+--------------------+
&lt;/pre&gt;


&lt;p&gt;
The &lt;a href=&quot;https://github.com/alexott/cassandra-dse-playground/blob/master/cassandra-join-spark/src/main/scala/com/datastax/alexott/demos/streaming/StockTickersJoinRDD.scala&quot;&gt;implementation that uses RDD-based Spark Streaming&lt;/a&gt; follows the same steps as previous example, although it is slightly more complicated, because it&#39;s doing more than dataframe-based implementation - it filters out entries for which we didn&#39;t find data in Cassandra, and prints only entries for which we have data in Cassandra:
&lt;/p&gt;

&lt;div class=&quot;org-src-container&quot;&gt;
&lt;pre class=&quot;src src-scala&quot;&gt;&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;StockData&lt;/span&gt;(symbol&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, timestamp&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Instant&lt;/span&gt;, price&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Double&lt;/span&gt;) &lt;span style=&quot;color: #a020f0;&quot;&gt;&lt;br /&gt;   extends&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Serializable&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;StockInfo&lt;/span&gt;(symbol&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, exchange&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, name&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, &lt;br /&gt;   industry&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, base_price&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Double&lt;/span&gt;) &lt;span style=&quot;color: #a020f0;&quot;&gt;extends&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Serializable&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;case&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;class&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;JoinedData&lt;/span&gt;(symbol&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, exchange&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, name&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, &lt;br /&gt;   industry&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;String&lt;/span&gt;, base_price&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Double&lt;/span&gt;, timestamp&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Instant&lt;/span&gt;, price&lt;span style=&quot;color: #a020f0;&quot;&gt;:&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Double&lt;/span&gt;) &lt;br /&gt;   &lt;span style=&quot;color: #a020f0;&quot;&gt;extends&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;Serializable&lt;/span&gt;

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;ssc&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;new&lt;/span&gt; &lt;span style=&quot;color: forestgreen;&quot;&gt;StreamingContext&lt;/span&gt;(sc, &lt;span style=&quot;color: darkcyan;&quot;&gt;Seconds&lt;/span&gt;(&lt;span style=&quot;color: darkcyan;&quot;&gt;10&lt;/span&gt;))
&lt;span style=&quot;color: #7f7f7f;&quot;&gt;// &lt;/span&gt;&lt;span style=&quot;color: #7f7f7f;&quot;&gt;....&lt;/span&gt;
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;stream&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; &lt;span style=&quot;color: darkcyan;&quot;&gt;KafkaUtils&lt;/span&gt;.createDirectStream[&lt;span style=&quot;color: darkcyan;&quot;&gt;String&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;String&lt;/span&gt;](
  ssc, &lt;span style=&quot;color: darkcyan;&quot;&gt;PreferConsistent&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;Subscribe&lt;/span&gt;[&lt;span style=&quot;color: darkcyan;&quot;&gt;String&lt;/span&gt;, &lt;span style=&quot;color: darkcyan;&quot;&gt;String&lt;/span&gt;](topics, kafkaParams)
)

&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;parsedData&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; stream.flatMap(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; parseJson(x.value()))
&lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;transformedData&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; parsedData.transform(rdd &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; {
  &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;joined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; rdd.leftJoinWithCassandraTable[&lt;span style=&quot;color: darkcyan;&quot;&gt;StockInfo&lt;/span&gt;](&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;test&quot;&lt;/span&gt;, &lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;stock_info&quot;&lt;/span&gt;)
  joined.persist()
  &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;missingInfoCount&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; joined.filter(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; x._2.isEmpty).count()
  &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;stocksWithInfo&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; joined.filter(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; x._2.isDefined)
  &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;existingInfoCount&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; stocksWithInfo.count()
  println(s&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;There are &lt;/span&gt;&lt;span style=&quot;color: sienna;&quot;&gt;$missingInfoCount&lt;/span&gt;&lt;span style=&quot;color: #8b2252;&quot;&gt; stock tickers without information in Cassandra&quot;&lt;/span&gt;)
  println(s&lt;span style=&quot;color: #8b2252;&quot;&gt;&quot;There are &lt;/span&gt;&lt;span style=&quot;color: sienna;&quot;&gt;$existingInfoCount&lt;/span&gt;&lt;span style=&quot;color: #8b2252;&quot;&gt; stock tickers with information in Cassandra&quot;&lt;/span&gt;)
  &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;combined&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; stocksWithInfo.map(x &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; {
    &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;i&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; x._2.get
    &lt;span style=&quot;color: #a020f0;&quot;&gt;val&lt;/span&gt; &lt;span style=&quot;color: sienna;&quot;&gt;d&lt;/span&gt; &lt;span style=&quot;color: #a020f0;&quot;&gt;=&lt;/span&gt; x._1
    &lt;span style=&quot;color: darkcyan;&quot;&gt;JoinedData&lt;/span&gt;(i.symbol, i.exchange, i.name, i.industry, i.base_price, &lt;br /&gt;       d.timestamp, d.price)
  })
  joined.unpersist()
  combined
})
transformedData.foreachRDD(rdd &lt;span style=&quot;color: #a020f0;&quot;&gt;=&amp;gt;&lt;/span&gt; rdd.foreach(println))
ssc.start()
&lt;/pre&gt;
&lt;/div&gt;

&lt;p&gt;
and when it&#39;s running, we&#39;ll see the following messages on the console:
&lt;/p&gt;

&lt;pre class=&quot;example&quot;&gt;There are 0 stock tickers without information in Cassandra
There are 20 stock tickers with information in Cassandra
...
JoinedData(ESND,NASDAQ,ESSENDANT,WHOLESALERS,13.0,2020-07-14T16:19:19.588Z,13.483634952551117)
JoinedData(SWK,NYSE,STANLEY BLACK &amp;amp; DECKER,HOUSEHOLD PRODUCTS,128.0,2020-07-14T16:19:23.588Z,121.58327281753643)
JoinedData(BLK,NYSE,BLACKROCK,FINANCIALS,424.0,2020-07-14T16:19:24.588Z,394.7030616365362)
&lt;/pre&gt;
&lt;/div&gt;
&lt;/div&gt;

&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org2c657c2&quot;&gt;
&lt;h4 id=&quot;org2c657c2&quot;&gt;&lt;span class=&quot;section-number-4&quot;&gt;&lt;/span&gt;Conclusion&lt;/h4&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-3-5&quot;&gt;
&lt;p&gt;
Joining with data in Cassandra is a very convenient and fast method for data enrichment - with a small amount of code we can quickly pull necessary data from the database, and perform data processing based on the retrieved data.
&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/29732542745204182/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/29732542745204182' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/29732542745204182'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/29732542745204182'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/07/spark-effective-joins-with-cassandra.html' title='Spark &amp; efficient joins with Cassandra'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjY6cF3M0n8ZteH4EextITAHEN8kGvn6rfEkRvoL3zy-4oguOIQh-o3DzZZJnKtu1eLMcBOI2JqGzSJOY9ZGB8DTVC7ygHNdxyzaT04KDVC5CY3f9Pq6L7au7ibs_4DQfj40H6O8g/s72-w625-h189-c/data-enrichment.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-5936357249627989908</id><published>2020-07-23T17:08:00.002+02:00</published><updated>2020-07-23T17:11:28.237+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="datastax"/><category scheme="http://www.blogger.com/atom/ns#" term="dse"/><category scheme="http://www.blogger.com/atom/ns#" term="spark"/><category scheme="http://www.blogger.com/atom/ns#" term="zeppelin"/><title type='text'>Using Apache Zeppelin to work with data in DSE via 
AlwaysOn SQL Service</title><content type='html'>Since release 6.0
DataStax Enterprise (DSE) includes &lt;a href=&quot;https://www.datastax.com/blog/2018/05/introducing-alwayson-sql-dse-analytics&quot;&gt;AlwaysOn SQL Service (AOSS)&lt;/a&gt; that allows to connect to DSE Analytics via JDBC or ODBC drivers and execute Spark SQL queries against data in DSE, or external sources, such as, data on DSEFS.  AOSS is built on the top of the Spark Thrift Server, but has a number of improvements, such as, improved fault tolerance, support for advanced security features of DSE (for example, row-level access control),  better caching of the data to improve response time on restarts, etc.  Using AOSS people can use their favorite BI tools to access data stored in DSE, and this greatly simplifies work with that data.
&lt;br /&gt;
&lt;a href=&quot;https://zeppelin.apache.org/&quot;&gt;Apache Zeppelin&lt;/a&gt; has a &lt;a href=&quot;https://zeppelin.apache.org/docs/0.9.0-preview1/interpreter/jdbc.html&quot;&gt;dedicated interpreter&lt;/a&gt; for accessing databases via JDBC and documentation contains all information on how to configure and use this interpreter, with examples for many popular databases, such as, PostgreSQL, MySQL, etc.  JDBC interpreter also supports dynamic forms, and interpolation of variables to simplify creation of interactive &amp;amp; dynamic queries.  So we can also use Apache Zeppelin to work with data in DSE via JDBC interpreter. 
&lt;br /&gt;
&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org407dbc9&quot;&gt;
&lt;h3 id=&quot;org407dbc9&quot; style=&quot;text-align: left;&quot;&gt;
Configuring Zeppelin to work with AOSS&lt;/h3&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-2-1&quot;&gt;
To access data via AOSS we need to get a special version of JDBC driver that supports AOSS enhancements, such as, auto-discovery of AOSS instance, or reconnection to another server if AOSS fails.  We need to get &quot;Simba JDBC Driver for Apache Spark&quot; from the &lt;a href=&quot;https://downloads.datastax.com/#odbc-jdbc-drivers&quot;&gt;corresponding section of DataStax download site&lt;/a&gt;. (besides driver it makes sense to download the driver manual as well, as it describes all driver options).  After the driver is downloaded, we need to unpack the archive to a place accessible by Zeppelin.  Archive should contain a file with the name &lt;code&gt;&quot;SparkJDBC41.jar&lt;/code&gt;&lt;code&gt;&quot;&lt;/code&gt;.  
&lt;br /&gt;
Now we can configure Zeppelin to connect to AOSS by going to &lt;code&gt;&quot;&lt;/code&gt;Interpreters&lt;code&gt;&quot;&lt;/code&gt; section in the top right drop-down that shows the user name. We can configure existing instance of the JDBC interpreter, but it&#39;s usually recommended to create a new interpreter based on the JDBC interpreter template for each type of the used database.  Click &lt;code&gt;&quot;&lt;/code&gt;+Create&lt;code&gt;&quot;&lt;/code&gt; button, enter interpreter name, like &lt;code&gt;aoss&lt;/code&gt; (it will be used to specify interpreter on the cell level, like,&amp;nbsp;&lt;code&gt;&lt;code&gt;&quot;&lt;/code&gt;%aoss&lt;/code&gt;&lt;code&gt;&quot;&lt;/code&gt;), and select &lt;code&gt;&quot;&lt;/code&gt;&lt;code&gt;jdbc&lt;/code&gt;&lt;code&gt;&quot;&lt;/code&gt; in &quot;Interpreter group&quot; drop-down - this will load all existing properties of JDBC interpreter, that we can fill with information specific for AOSS:
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg-XnS1NJWzpXQ9cueysxp5uJH0z3I46GVDEODJ4QVwR1mSgkjKLbKmsf6fK4Emj_L1MyQT__ni-7cjZV_RBbMU6KCUqOQpP67YNwo5wxo-IZvYsGRgyJQzUI40jtETbKR5QdMZDg/s1149/zeppelin-aoss-create-interpreter.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;378&quot; data-original-width=&quot;1149&quot; height=&quot;211&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg-XnS1NJWzpXQ9cueysxp5uJH0z3I46GVDEODJ4QVwR1mSgkjKLbKmsf6fK4Emj_L1MyQT__ni-7cjZV_RBbMU6KCUqOQpP67YNwo5wxo-IZvYsGRgyJQzUI40jtETbKR5QdMZDg/w640-h211/zeppelin-aoss-create-interpreter.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
We need to configure several things that are common for all AOSS installations:
&lt;br /&gt;
&lt;ul class=&quot;org-ul&quot;&gt;
&lt;li&gt;we need to put full path to Simba JDBC driver into &quot;artifact&quot; field of &quot;Dependencies&quot; section (like, &lt;code&gt;/Users/ott/work/zeppelin/SparkJDBC41.jar&lt;/code&gt;)&lt;/li&gt;
&lt;li&gt;we need to put driver&#39;s class name (&lt;code&gt;com.simba.spark.jdbc41.Driver&lt;/code&gt;) for configuration parameter &lt;code&gt;default.driver&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
We also must specify a value for the configuration parameter &lt;code&gt;default.url&lt;/code&gt;.  For AOSS, there are two ways to do it:
&lt;br /&gt;
&lt;ol class=&quot;org-ol&quot;&gt;
&lt;li&gt;Explicitly specify host name or IP-address with port configured for AOSS (&lt;code&gt;10000&lt;/code&gt; by default, configured by &lt;code&gt;alwayson_sql_options:thrift_port&lt;/code&gt; setting in &lt;code&gt;dse.yaml&lt;/code&gt;), like, &lt;code&gt;jdbc:spark://server:10000&lt;/code&gt; - although this method works, but it&#39;s not optimal as it requires to know which of servers is running AOSS right now, and no connection to another server will happen in case of failover&lt;/li&gt;
&lt;li&gt;Use auto-discovery functionality of the driver that relies on the meta-information published by every node of DSE Analytics (by default on the port &lt;code&gt;9077&lt;/code&gt;, configured by &lt;code&gt;alwayson_sql_options:web_ui_port&lt;/code&gt; setting in &lt;code&gt;dse.yaml&lt;/code&gt;).  In this case, the driver will automatically discover where the instance of AOSS is running, and also perform connection to the new node if the current node fails.   For this case, URL looks as following: &lt;code&gt;jdbc:spark://AOSSStatusEndpoints=server1:9077,server2:9077;&lt;/code&gt; (we can specify any number of nodes as parameter)&lt;/li&gt;
&lt;/ol&gt;
We can pass additional driver options by adding them to the URL.  Refer to the driver documentation for a list of the available options.  We can also configure other Zeppelin parameters, but we can leave them with default values.  After everything is configured, press &quot;Save&quot; to save the changes (I removed unnecessary parameters to make the screenshot smaller):
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiAynb9RptHNGj5_20K4tHjwuyfTjFDnLm3aObpu8JZU6Ik-LkKCiPVm42-ANHVWFTtMXmCUB_tPH6enlrzNM4hbOrqmYONEXuygaU_3Yr-Q4yNSTjsGBiWPyQiUhMNgF4fVjH4xA/s1135/zeppelin-aoss-configure.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;411&quot; data-original-width=&quot;1135&quot; height=&quot;232&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiAynb9RptHNGj5_20K4tHjwuyfTjFDnLm3aObpu8JZU6Ik-LkKCiPVm42-ANHVWFTtMXmCUB_tPH6enlrzNM4hbOrqmYONEXuygaU_3Yr-Q4yNSTjsGBiWPyQiUhMNgF4fVjH4xA/w640-h232/zeppelin-aoss-configure.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div class=&quot;outline-4&quot; id=&quot;outline-container-org0d15d39&quot;&gt;
&lt;h3 id=&quot;org0d15d39&quot; style=&quot;text-align: left;&quot;&gt;
Usage&lt;/h3&gt;
&lt;div class=&quot;outline-text-4&quot; id=&quot;text-1-2-2&quot;&gt;
After we create the interpreter, we can start to use it either in the new notebooks, or in existing ones.  We can configure interpreter on the notebook level when creating it, or we can put &lt;code&gt;%interpreter_name&lt;/code&gt; at the beginning of the cell, to indicate that we&#39;re using a specific interpreter.
&lt;br /&gt;
Everything we need to do now is just issue Spark SQL queries and wait for results, like this:
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh6O8ShXlLn6Vw4dqtWSYGIhib8gLBSgzVubCnomJQ6x2NULyqw_feBZBvVgJlmobHetIs0Rb1SAclUXJCLOXrG3reCsRSqGOXlvj0nMGmXOCPfxvbiV14I8nBqDgyNuoWPjmgETg/s1153/zeppelin-aoss-query.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;546&quot; data-original-width=&quot;1153&quot; height=&quot;304&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh6O8ShXlLn6Vw4dqtWSYGIhib8gLBSgzVubCnomJQ6x2NULyqw_feBZBvVgJlmobHetIs0Rb1SAclUXJCLOXrG3reCsRSqGOXlvj0nMGmXOCPfxvbiV14I8nBqDgyNuoWPjmgETg/w640-h304/zeppelin-aoss-query.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
We can check that the same data is available via CQL (don&#39;t wonder about syntax - this table has DSE Search index):
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjcX3GByGHTcIgvZNZ2wvtzkOTqASo2D0vQpg-ca4l6-J2x9qKI5nhd1Pe6ua4rke31n5ptlpyAYBcxKaegdcKe_jKq-d9H887KxqP0eupRJlZx8zF4IMKhFls0cRh1mcm4K4afBQ/s1108/zeppelin-aoss-query-cql.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;297&quot; data-original-width=&quot;1108&quot; height=&quot;172&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjcX3GByGHTcIgvZNZ2wvtzkOTqASo2D0vQpg-ca4l6-J2x9qKI5nhd1Pe6ua4rke31n5ptlpyAYBcxKaegdcKe_jKq-d9H887KxqP0eupRJlZx8zF4IMKhFls0cRh1mcm4K4afBQ/w640-h172/zeppelin-aoss-query-cql.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
We can also use all available visualizations, including additional, like &lt;i&gt;geospark-zeppelin&lt;/i&gt;, that is installed from the Helium registry:
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgy_bX-TTrOA2iWaA8M-qDE_I1t77or8NMqDUOq7sKfXvVrMOYu42EldBG57h8Iy-dzgUsw5nX5YWiLB8AhPB0FTv_4s-EM7JAUUXX-lGJhGiPHNa6kB3fLLE1dUC6Ffc59fHDVtA/s1111/zeppelin-aoss-query-geomap.png&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;449&quot; data-original-width=&quot;1111&quot; height=&quot;258&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgy_bX-TTrOA2iWaA8M-qDE_I1t77or8NMqDUOq7sKfXvVrMOYu42EldBG57h8Iy-dzgUsw5nX5YWiLB8AhPB0FTv_4s-EM7JAUUXX-lGJhGiPHNa6kB3fLLE1dUC6Ffc59fHDVtA/w640-h258/zeppelin-aoss-query-geomap.png&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;h3 style=&quot;text-align: left;&quot;&gt;
Conclusion&lt;/h3&gt;
This post demonstrates flexibility and ease of use of the Apache Zeppelin when working with different technologies, such as databases, etc.&lt;br /&gt;
P.S. this post was written using Zeppelin 0.9.0-preview1&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/5936357249627989908/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/5936357249627989908' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5936357249627989908'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/5936357249627989908'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/07/using-zeppelin-to-work-with-data-in-dse.html' title='Using Apache Zeppelin to work with data in DSE via AlwaysOn SQL Service'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg-XnS1NJWzpXQ9cueysxp5uJH0z3I46GVDEODJ4QVwR1mSgkjKLbKmsf6fK4Emj_L1MyQT__ni-7cjZV_RBbMU6KCUqOQpP67YNwo5wxo-IZvYsGRgyJQzUI40jtETbKR5QdMZDg/s72-w640-h211-c/zeppelin-aoss-create-interpreter.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-6862508.post-6966808613791304468</id><published>2020-06-23T12:01:00.001+02:00</published><updated>2020-06-23T13:02:06.897+02:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="astra"/><category scheme="http://www.blogger.com/atom/ns#" term="cassandra"/><category scheme="http://www.blogger.com/atom/ns#" term="datastax"/><category scheme="http://www.blogger.com/atom/ns#" term="zeppelin"/><title 
type='text'>Working with DataStax Astra from Apache Zeppelin</title><content type='html'>&lt;a href=&quot;https://zeppelin.apache.org/&quot;&gt;Apache Zeppelin&lt;/a&gt; is a very powerful web-based environment for collaborative work with very good support for the big data technologies and databases, such as, Spark, Flink, Cassandra, and many others.  Apache Zeppelin 0.9.0 will include a lot of changes for Cassandra interpreter:&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;migration to the &lt;a href=&quot;https://docs.datastax.com/en/developer/java-driver/latest/&quot;&gt;new, unified DataStax Java driver&lt;/a&gt; that brings more performance &amp;amp; stability, and also support for DSE-specific functionality, such as, geospatial types&lt;/li&gt;
&lt;li&gt;flexible formatting of results - we can output data in CQL, or human-readable formats, format time/date-related columns using custom patterns, control formatting of floating point numbers, etc. All of this could be configured on interpreter and/or cell level&lt;/li&gt;
&lt;li&gt;ability to change any configuration parameter of the Java driver&lt;/li&gt;
&lt;/ul&gt;
The last item is the most important one for connecting to &lt;a href=&quot;https://astra.datastax.com/&quot;&gt;DataStax Astra&lt;/a&gt; (Cassandra as a Service from DataStax) - we can specify the path to secure connect bundle, and get access to our Astra instance.&lt;br /&gt;
&lt;br /&gt;
Right now, there is no precompiled version of Zeppelin with these changes available, so you will need to compile Zeppelin from sources. After compilation is done, start Zeppelin, and open in web browser default Zeppelin address: http://localhost:8080/.&lt;br /&gt;&lt;br /&gt;
We can configure Cassandra interpreter directly to work with Astra, but often it&#39;s better to create a separate interpreter (as shown in the picture below):&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;go to the &quot;Interpreter&quot; menu (in drop down in the top right corner), and there click &quot;Create&quot;&lt;/li&gt;
&lt;li&gt;enter the name of the interpreter (&lt;i&gt;astra&lt;/i&gt;), and select the &lt;i&gt;cassandra&lt;/i&gt; in the interpreter group dropdown&lt;/li&gt;
&lt;li&gt;enter the username/password in the &lt;tt&gt;cassandra.credentials.username&lt;/tt&gt; &amp;amp; &lt;tt&gt;cassandra.credentials.password properties&lt;/tt&gt;&lt;/li&gt;
&lt;li&gt;clear the value of the &lt;tt&gt;cassandra.hosts&lt;/tt&gt; property (this is temporary workaround until it&#39;s fixed on the driver level)&lt;/li&gt;
&lt;li&gt;change the value of &lt;tt&gt;cassandra.query.default.consistency&lt;/tt&gt; to &lt;tt&gt;LOCAL_QUORUM&lt;/tt&gt;, and &lt;tt&gt;cassandra.query.default.serial.consistency&lt;/tt&gt; to &lt;tt&gt;LOCAL_SERIAL&lt;/tt&gt; as &lt;a href=&quot;https://docs.datastax.com/en/astra/aws/doc/dscloud/astra/dscloudDatabaseConditions.html&quot;&gt;Astra requires this to perform write operations&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;optionally change the value of &lt;tt&gt;cassandra.keyspace&lt;/tt&gt; to the name of keyspace that was created in Astra&lt;/li&gt;
&lt;li&gt;add a property with name &lt;tt&gt;datastax-java-driver.basic.cloud.secure-connect-bundle&lt;/tt&gt; and value of the path to the secure bundle&lt;/li&gt;
&lt;li&gt;save the interpreter - this enables connection to Astra&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;/div&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhl-DNGbzzbfZ08w0JpgkW-80xaglBILdXMuCUmnN1tu_hSSGrE9fjmpKASE7vFdPXgaJbQkSwUmeSiIcEovkB9-c3IhY8AyliLXbFGclCrzfGbqowfruxyCXewqBHpN8-U65mFbQ/s1600/Screen+Shot+2020-06-23+at+09.06.19.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;888&quot; data-original-width=&quot;1233&quot; height=&quot;576&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhl-DNGbzzbfZ08w0JpgkW-80xaglBILdXMuCUmnN1tu_hSSGrE9fjmpKASE7vFdPXgaJbQkSwUmeSiIcEovkB9-c3IhY8AyliLXbFGclCrzfGbqowfruxyCXewqBHpN8-U65mFbQ/s640/Screen+Shot+2020-06-23+at+09.06.19.png&quot; width=&quot;800&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
Using the new interpreter is easy:
&lt;ul&gt;
&lt;li&gt;Click &quot;Create new note&quot;&lt;/li&gt;
&lt;li&gt;Enter the note name, and select &lt;i&gt;astra&lt;/i&gt; as the default interpreter.&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEibNjhyphenhyphentBOeOTlKCH5xBGk7XZc8-EeesMD8hKP7bKE_nbuLYJEQrbp6K9GS_C88zhkA4AhjalOMrJC8HYLzTtC_QmS3FkBaRrh4oiMGhX2hy4Qc_ullcGqw6j1Boi_BT8hJxQiicg/s1600/Screen+Shot+2020-06-23+at+09.08.39.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;458&quot; data-original-width=&quot;746&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEibNjhyphenhyphentBOeOTlKCH5xBGk7XZc8-EeesMD8hKP7bKE_nbuLYJEQrbp6K9GS_C88zhkA4AhjalOMrJC8HYLzTtC_QmS3FkBaRrh4oiMGhX2hy4Qc_ullcGqw6j1Boi_BT8hJxQiicg/s400/Screen+Shot+2020-06-23+at+09.08.39.png&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;Start to execute commands, for example, execute &lt;tt&gt;describe cluster;&lt;/tt&gt; that should show something like this:&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhdMi7IDnUCBBpZLD7Dl_X_11i1Ct1OeKf67Oi3G_kjwAdghI54aeB0VyLn9lPdGxXv8iA3zE2v5sIkMgt4OUjZA6rGlhdYTsONDPNrJyp1jaNFAO2ZSMIPq0etFkjKegr9T-ARng/s1600/Screen+Shot+2020-06-23+at+09.14.54.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;508&quot; data-original-width=&quot;1390&quot; height=&quot;292&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhdMi7IDnUCBBpZLD7Dl_X_11i1Ct1OeKf67Oi3G_kjwAdghI54aeB0VyLn9lPdGxXv8iA3zE2v5sIkMgt4OUjZA6rGlhdYTsONDPNrJyp1jaNFAO2ZSMIPq0etFkjKegr9T-ARng/s640/Screen+Shot+2020-06-23+at+09.14.54.png&quot; width=&quot;800&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;Create table, insert data, and select them:&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjaK_JBvAnImhTLqnOmIlHfRm6TAx_Oi-Opc8appgyyXhyphenhyphenqZfFtT49X-OF2yUYB0FPAWnf13uZykS91z4BlPFk5g3JpJcINSZzOScR3ylDpjbFTTKR53w-neenmQ-GkBs_igdxtyg/s1600/Screen+Shot+2020-06-23+at+11.50.28.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;699&quot; data-original-width=&quot;1138&quot; height=&quot;491&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjaK_JBvAnImhTLqnOmIlHfRm6TAx_Oi-Opc8appgyyXhyphenhyphenqZfFtT49X-OF2yUYB0FPAWnf13uZykS91z4BlPFk5g3JpJcINSZzOScR3ylDpjbFTTKR53w-neenmQ-GkBs_igdxtyg/s640/Screen+Shot+2020-06-23+at+11.50.28.png&quot; width=&quot;800&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
Please note that the new interpreter inherits all functionality of the base interpreter, for example, it&#39;s possible to specify formatting options, like this (formatting using German locale, for German timezone):&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh1YTPsXBENQ3Ly1TM2HB3tq0PQSiqvG5mPcGuQx8VTWQt8A0VF7xnaH7pFxAN6xJE6krn-rnxjdUu_dVdqFa6EsZgmRFsRZ6Ltcndf-XRUssh_mnbXgDsfZu6hAGw4NfNiYd_7Tg/s1600/Screen+Shot+2020-06-23+at+10.41.28.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;448&quot; data-original-width=&quot;1070&quot; height=&quot;335&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh1YTPsXBENQ3Ly1TM2HB3tq0PQSiqvG5mPcGuQx8VTWQt8A0VF7xnaH7pFxAN6xJE6krn-rnxjdUu_dVdqFa6EsZgmRFsRZ6Ltcndf-XRUssh_mnbXgDsfZu6hAGw4NfNiYd_7Tg/s640/Screen+Shot+2020-06-23+at+10.41.28.png&quot; width=&quot;800&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://alexott.blogspot.com/feeds/6966808613791304468/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment/fullpage/post/6862508/6966808613791304468' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/6966808613791304468'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/6862508/posts/default/6966808613791304468'/><link rel='alternate' type='text/html' href='http://alexott.blogspot.com/2020/06/working-with-datastax-astra-from-apache.html' title='Working with DataStax Astra from Apache Zeppelin'/><author><name>Alex Ott</name><uri>http://www.blogger.com/profile/13001951608173211050</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='25' height='32' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjZbs-KeNgwHzuMovddm11TJ8k6o1XXIwsYnJtZEwXDTWXAr9ZX1YH5Z8Dq5mCu9soZ2sY2S2BtA-6IMhv1F6uZtzooMPuaHx7h6wpEHz9Qdk8aechVbR5wE3WPfvZxHA/s220/avatar2.jpg'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhl-DNGbzzbfZ08w0JpgkW-80xaglBILdXMuCUmnN1tu_hSSGrE9fjmpKASE7vFdPXgaJbQkSwUmeSiIcEovkB9-c3IhY8AyliLXbFGclCrzfGbqowfruxyCXewqBHpN8-U65mFbQ/s72-c/Screen+Shot+2020-06-23+at+09.06.19.png" height="72" width="72"/><thr:total>0</thr:total></entry></feed>