
<!DOCTYPE html>


<html lang="en" data-content_root="./" >

  <head>
    <meta charset="utf-8" />
    <!-- The viewport is declared exactly once; repeated declarations add nothing. -->
    <meta name="viewport" content="width=device-width, initial-scale=1" />

    <title>Philip May - Data Science and IT &#8212; Philip May</title>

    <!--
      Copies the "mode" and "theme" values stored in localStorage onto data
      attributes of the root element (empty string when nothing is stored).
      This script sits ahead of the stylesheets linked below.
    -->
    <script data-cfasync="false">
      document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
      document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
    </script>
    <!--
      This gives us a CSS class (pst-js-only) whose elements are hidden only
      when JavaScript is disabled.
    -->
    <noscript>
      <style>
        .pst-js-only { display: none !important; }
      </style>
    </noscript>

    <!-- Loaded before other Sphinx assets -->
    <link href="_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
    <link href="_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />

    <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=8f2a1f02" />
    <link rel="stylesheet" type="text/css" href="_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
    <link rel="stylesheet" type="text/css" href="_static/copybutton.css?v=76b2166b" />
    <link rel="stylesheet" type="text/css" href="_static/sphinx-design.min.css?v=95c83b7e" />

    <!-- So that users can add custom icons -->
    <script src="_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
    <!-- Pre-loaded scripts that we'll load fully later (see the deferred script tags at the end of body) -->
    <link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
    <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />

    <!-- documentation_options.js must stay ahead of the inline script that sets DOCUMENTATION_OPTIONS.pagename -->
    <script src="_static/documentation_options.js?v=7f41d439"></script>
    <script src="_static/doctools.js?v=9bcbadda"></script>
    <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="_static/clipboard.min.js?v=a7894cd8"></script>
    <script src="_static/copybutton.js?v=f281be69"></script>
    <script src="_static/design-tabs.js?v=f930bc37"></script>
    <script>DOCUMENTATION_OPTIONS.pagename = 'index';</script>
    <script src="_static/js/matomo.js?v=dbb1f055"></script>

    <!-- Document relations -->
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="Machine Learning" href="machine-learning.html" />

    <meta name="docsearch:language" content="en"/>
    <meta name="docsearch:version" content="" />

    <!-- Atom feed for the blog -->
    <link
      rel="alternate"
      type="application/atom+xml"
      href="blog/atom.xml"
      title="Philip May - Blog"
    />
  </head>
  
  
  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">

  
  
  <!-- Skip link: targets the element with id="main-content" -->
  <div class="skip-link d-print-none" id="pst-skip-link">
    <a href="#main-content">Skip to main content</a>
  </div>

  <!-- Empty helper element; presumably used by the theme script for scroll detection (TODO confirm) -->
  <div id="pst-scroll-pixel-helper"></div>

  <!-- Back-to-top control; the icon and the label are intentionally adjacent with no whitespace between them -->
  <button class="btn rounded-pill" id="pst-back-to-top" type="button">
    <i class="fa-solid fa-arrow-up"></i>Back to top</button>

  
  <dialog id="pst-search-dialog">
    <!-- Site search form: sends the query as the "q" parameter to search.html using GET -->
    <form class="bd-search d-flex align-items-center" action="search.html" method="get">
      <i class="fa-solid fa-magnifying-glass"></i>
      <!-- The field has no visible label; aria-label supplies its accessible name -->
      <input
        class="form-control"
        name="q"
        type="search"
        placeholder="Search this site..."
        autocomplete="off"
        autocorrect="off"
        autocapitalize="off"
        spellcheck="false"
        aria-label="Search this site..."/>
      <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
    </form>
  </dialog>

  <!-- Version-warning placeholder; both the wrapper and the aside start hidden via d-none -->
  <div class="pst-async-banner-revealer d-none">
    <aside
      class="d-none d-print-none"
      id="bd-header-version-warning"
      aria-label="Version warning"></aside>
  </div>

  
    <!--
      Top navigation bar: sidebar toggles, site title, main navigation,
      search button and colour-mode switch.
      Every button declares type="button" explicitly so it can never act as a
      submit button.
    -->
    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
      <div class="bd-header__inner bd-page-width">
        <!-- Toggle for the primary (site navigation) sidebar -->
        <button type="button" class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
          <span class="fa-solid fa-bars"></span>
        </button>

        <div class="col-lg-3 navbar-header-items__start">
          <div class="navbar-item">
            <a class="navbar-brand logo" href="#">
              <p class="title logo__title">Philip May</p>
            </a>
          </div>
        </div>

        <div class="col-lg-9 navbar-header-items">
          <div class="me-auto navbar-header-items__center">
            <div class="navbar-item">
              <!-- Labelled because the page contains several nav landmarks; the same links are repeated in the primary sidebar -->
              <nav aria-label="Main">
                <ul class="bd-navbar-elements navbar-nav">
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="machine-learning.html">
                      Machine Learning
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="python.html">
                      Python
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="it.html">
                      IT
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="linux.html">
                      Linux
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="blog.html">
                      Blog
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="about-me.html">
                      About Me
                    </a>
                  </li>
                </ul>
              </nav>
            </div>
          </div>

          <div class="navbar-header-items__end">
            <div class="navbar-item navbar-persistent--container">
              <!-- pst-js-only: hidden by the noscript rule in head when JavaScript is disabled -->
              <button type="button" class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
                <i class="fa-solid fa-magnifying-glass"></i>
                <span class="search-button__default-text">Search</span>
                <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
              </button>
            </div>

            <div class="navbar-item">
              <!-- Colour-mode switch: one icon per mode (light, dark, auto) -->
              <button type="button" class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
                <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
                <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
                <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
              </button>
            </div>
          </div>
        </div>

        <!-- Second copy of the search button, in the mobile container -->
        <div class="navbar-persistent--mobile">
          <button type="button" class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
            <i class="fa-solid fa-magnifying-glass"></i>
            <span class="search-button__default-text">Search</span>
            <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
          </button>
        </div>

        <!-- Toggle for the secondary ("On this page") sidebar -->
        <button type="button" class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
          <span class="fa-solid fa-outdent"></span>
        </button>
      </div>
    </header>
  

  <div class="bd-container">
    <div class="bd-container__inner bd-page-width">
      
      
      
        
      
      <dialog id="pst-primary-sidebar-modal"></dialog>
      <!--
        Primary sidebar: repeats the header's navigation links and colour-mode
        switch, followed by an ad placement container.
      -->
      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar hide-on-wide">
        <div class="sidebar-header-items sidebar-primary__section">
          <div class="sidebar-header-items__center">
            <div class="navbar-item">
              <!-- Same link set as the header navigation, so it carries the same landmark label -->
              <nav aria-label="Main">
                <ul class="bd-navbar-elements navbar-nav">
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="machine-learning.html">
                      Machine Learning
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="python.html">
                      Python
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="it.html">
                      IT
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="linux.html">
                      Linux
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="blog.html">
                      Blog
                    </a>
                  </li>
                  <li class="nav-item">
                    <a class="nav-link nav-internal" href="about-me.html">
                      About Me
                    </a>
                  </li>
                </ul>
              </nav>
            </div>
          </div>

          <div class="sidebar-header-items__end">
            <div class="navbar-item">
              <!-- type="button" is explicit so the switch can never act as a submit button -->
              <button type="button" class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
                <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
                <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
                <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
              </button>
            </div>
          </div>
        </div>

        <div class="sidebar-primary-items__end sidebar-primary__section">
          <div class="sidebar-primary-item">
            <!-- Ad placement container; its content is left exactly as generated -->
            <div id="ethical-ad-placement"
                 class="flat"
                 data-ea-publisher="readthedocs"
                 data-ea-type="readthedocs-sidebar"
                 data-ea-manual="true">
</div>
          </div>
        </div>
      </div>
      
      <main id="main-content" class="bd-main" role="main">
        
        
          <div class="bd-content">
            <div class="bd-article-container">
              
              <div class="bd-header-article d-print-none"></div>
              
              
              
                
<div id="searchbox"></div>
                <article class="bd-article">
                   <section id="philip-may-data-science-and-it">
<h1>Philip May - Data Science and IT<a class="headerlink" href="#philip-may-data-science-and-it" title="Link to this heading">#</a></h1>
<p>I’m Philip May, a data science expert and open source enthusiast with an NLP focus.<br />
I come from Germany and work for Deutsche Telekom.</p>
<p>This website is a mixture of documentation, blog and personal notes.</p>
<p><a class="reference internal" href="machine-learning.html#ml-main"><span class="std std-ref">Machine Learning</span></a> ·
<a class="reference internal" href="python.html#python-main"><span class="std std-ref">Python</span></a> ·
<a class="reference internal" href="it.html#it-main"><span class="std std-ref">IT</span></a> ·
<a class="reference internal" href="linux.html#linux-main"><span class="std std-ref">Linux</span></a><br />
<a class="reference internal" href="blog.html#blog-main"><span class="std std-ref">Blog</span></a> ·
<a class="reference internal" href="about-me.html#about-me-main"><span class="std std-ref">About Me</span></a></p>
<div class="toctree-wrapper compound">
</div>
<section id="blog">
<h2>Blog<a class="headerlink" href="#blog" title="Link to this heading">#</a></h2>
<!-- Post list, newest first. Each visible date is wrapped in <time> with an ISO 8601 datetime so it is machine-readable. -->
<ul class="postlist-style-none postlist simple">
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2024/topic-specific-texts-selection.html">The selection of topic-specific texts from Wikipedia</a> - <time datetime="2024-08-09">August 09, 2024</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2024/pandas-data-format-and-compression.html">Pandas Data Format and Compression</a> - <time datetime="2024-07-02">July 02, 2024</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2024/importance-of-chat-templates.html">The importance of chat templates</a> - <time datetime="2024-04-11">April 11, 2024</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2023/pandas-apply.html">Pandas apply</a> - <time datetime="2023-11-18">November 18, 2023</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2022/date-encoding.html">Options for Date Encoding</a> - <time datetime="2022-10-12">October 12, 2022</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2022/python-conda-pip.html">Python Installation and Package Management with conda and pip</a> - <time datetime="2022-07-23">July 23, 2022</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2022/mlsum-anomalies.html">Anomalies in the MLSUM Dataset</a> - <time datetime="2022-02-23">February 23, 2022</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2022/german-wikipedia-corpus-released.html">Clean German Wikipedia Text Corpus released</a> - <time datetime="2022-02-22">February 22, 2022</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2022/lightgbm-optuna-demo.html">LightGBM with Optuna: Demo released</a> - <time datetime="2022-02-20">February 20, 2022</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2021/gc4-release.html">German colossal, cleaned Common Crawl corpus (GC4) released</a> - <time datetime="2021-04-10">April 10, 2021</time></p></li>
<li class="ablog-post"><p class="ablog-post-title"><a class="reference internal" href="blog/2020/electra-train-eval.html">Training and Evaluation of our German Electra Language Model Talk</a> - <time datetime="2020-12-01">December 01, 2020</time></p></li>
</ul>
</section>
</section>

<div class="section ablog__blog_comments">
   
</div>

                </article>
              
              
              
              
              
            </div>
            
            
              
                <dialog id="pst-secondary-sidebar-modal"></dialog>
                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc">
                  <div class="sidebar-secondary-items sidebar-secondary__inner">

                    <!-- In-page table of contents; the heading div below names the nav via aria-labelledby -->
                    <div class="sidebar-secondary-item">
                      <div class="page-toc tocsection onthispage" id="pst-page-navigation-heading-2">
                        <i class="fa-solid fa-list"></i> On this page
                      </div>
                      <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
                        <ul class="visible nav section-nav flex-column">
                          <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#blog">Blog</a></li>
                        </ul>
                      </nav>
                    </div>

                    <!-- Link to edit this page's source file on GitHub -->
                    <div class="sidebar-secondary-item">
                      <div class="tocsection editthispage">
                        <a href="https://github.com/PhilipMay/may-la-myst/edit/main/source/index.md">
                          <i class="fa-solid fa-pencil"></i>
                          Edit on GitHub
                        </a>
                      </div>
                    </div>

                  </div>
                </div>
              
            
          </div>
          <footer class="bd-footer-content">
            
          </footer>
        
      </main>
    </div>
  </div>
  
  <!-- Theme scripts placed at the end of body and marked defer; these are the same URLs preloaded in head -->
  <script src="_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" defer></script>
  <script src="_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" defer></script>

  <!-- Page footer: a single start-aligned item holding the copyright notice -->
  <footer class="bd-footer">
    <div class="bd-footer__inner bd-page-width">
      <div class="footer-items__start">
        <div class="footer-item">
          <p class="copyright">
            © Copyright 2020-2025 Philip May.
            <br/>
          </p>
        </div>
      </div>
    </div>
  </footer>
  </body>
</html>