<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:blogger='http://schemas.google.com/blogger/2008' xmlns:georss='http://www.georss.org/georss' xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-5789291509148224079</id><updated>2026-03-08T14:11:39.974-07:00</updated><category term="linux"/><category term="kernel"/><category term="linux kernel"/><category term="performance"/><category term="scalability"/><category term="performance-goodies"/><category term="operating systems"/><category term="development"/><category term="x86"/><category term="kvm"/><category term="partition tables"/><category term="architecture"/><category term="block devices"/><category term="concurrency"/><category term="conference"/><category term="cpu"/><category term="efi"/><category term="gpt"/><category term="labels"/><category term="locks"/><category term="memory management"/><category term="plumbers"/><category term="virtualization"/><category term="C"/><category term="C programming"/><category term="Intel"/><category term="LPC 2015"/><category term="SMP"/><category term="TLB"/><category term="VMX"/><category term="algorithms"/><category term="assembler"/><category term="associateve"/><category term="attributes"/><category term="auditing"/><category term="barriers"/><category term="books"/><category term="caches"/><category term="caching"/><category term="compute express link"/><category term="computer science"/><category term="contention"/><category term="cpuid"/><category term="critique"/><category term="cxl"/><category term="data structures"/><category term="disks"/><category term="dos"/><category term="ept"/><category term="fdisk"/><category term="foss.in"/><category term="futex"/><category term="fuzzy testing"/><category term="google summer 
of code"/><category term="hardware"/><category term="hash tables"/><category term="india"/><category term="limits"/><category term="linux inode filename filesystem symlinks"/><category term="load acquire"/><category term="lpc"/><category term="lslk"/><category term="lslocks"/><category term="master boot record"/><category term="mbr"/><category term="memory"/><category term="memory model"/><category term="mmu"/><category term="numa"/><category term="paging"/><category term="partx"/><category term="prlimit"/><category term="process"/><category term="processor"/><category term="research"/><category term="resources"/><category term="security"/><category term="shadow pages"/><category term="store release"/><category term="stressing software"/><category term="sun"/><category term="synchronization"/><category term="system call"/><category term="systems"/><category term="tags"/><category term="target fuzzing"/><category term="tasks"/><category term="translations"/><category term="trinity"/><category term="ulimit"/><category term="unix"/><category term="userspace mutexes"/><category term="util-linux"/><category term="v4.14"/><category term="v4.15"/><category term="v4.16"/><category term="v4.17"/><category term="v4.18"/><category term="v4.19"/><category term="v4.20"/><category term="v5.0"/><category term="v5.1"/><category term="v5.2"/><title type='text'>Davidlohr Bueso</title><subtitle type='html'></subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default?redirect=false'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><link rel='next' type='application/atom+xml' 
href='http://www.blogger.com/feeds/5789291509148224079/posts/default?start-index=26&amp;max-results=25&amp;redirect=false'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>27</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-3709268265882614643</id><published>2023-12-01T11:14:00.000-08:00</published><updated>2023-12-01T11:14:50.019-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="compute express link"/><category scheme="http://www.blogger.com/atom/ns#" term="cxl"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="lpc"/><category scheme="http://www.blogger.com/atom/ns#" term="memory"/><category scheme="http://www.blogger.com/atom/ns#" term="plumbers"/><title type='text'>LPC 2023: CXL Microconference</title><content type='html'>&lt;p style=&quot;text-align: justify;&quot;&gt;The&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/sessions/160/#20231113&quot;&gt;Compute Express Link (CXL) microconference&lt;/a&gt;&amp;nbsp;was held, for a second straight time, at this year&#39;s Linux Plumbers Conference. 
The goals for the track were to openly discuss current on-going development efforts around the core driver, as well as experimental memory management topics which lead to accommodating kernel infrastructure for new technology and use cases.&lt;/p&gt;&lt;table align=&quot;center&quot; cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;margin-left: auto; margin-right: auto;&quot;&gt;&lt;tbody&gt;&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhLnHpM3hQsrkhKM7R4pj0vDJUZKJhfwPNPO1vpFtHIteNAyqUKmvRg8V9vSi1s0vTxZpCWdOkkx3FpuWcULEtfosEyfoDyKeWzwi44a4FZ_XzRMTiq4XMz9ZREeGRECbJm158ljiJVt4DgZZ04W8cMzSvO6wO_K5WFPDaZ4BDxzsSWg75Y8rogzoivgzz6/s4032/IMG_1436.jpg&quot; style=&quot;margin-left: auto; margin-right: auto;&quot;&gt;&lt;img border=&quot;0&quot; data-original-height=&quot;3024&quot; data-original-width=&quot;4032&quot; height=&quot;197&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhLnHpM3hQsrkhKM7R4pj0vDJUZKJhfwPNPO1vpFtHIteNAyqUKmvRg8V9vSi1s0vTxZpCWdOkkx3FpuWcULEtfosEyfoDyKeWzwi44a4FZ_XzRMTiq4XMz9ZREeGRECbJm158ljiJVt4DgZZ04W8cMzSvO6wO_K5WFPDaZ4BDxzsSWg75Y8rogzoivgzz6/w320-h197/IMG_1436.jpg&quot; title=&quot;CXL session at LPC23&quot; width=&quot;320&quot; /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;CXL session at LPC23&lt;/td&gt;&lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(i)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1454/&quot; style=&quot;text-align: left;&quot;&gt;CXL Emulation in QEMU - Progress, status and most importantly what next?&lt;/a&gt;&lt;span style=&quot;text-align: 
left;&quot;&gt;&amp;nbsp;The cxl qemu maintainers presented the current state of the emulation, for which significant progress has been made, extending support beyond basic enablement. During this year, features such as volatile devices,&amp;nbsp;CDAT, poison and injection infrastructure have been added upstream qemu, while several others are in the process, such as CCI/mailbox, Scan Media and dynamic capacity. There was also further highlighting of the latter, for which DCD support was presented along with extent management&amp;nbsp;issues found in the 3.0 spec. Similarly, Fabric Management was another important topic, continuing the debate about qemu&#39;s role in FM development, which is still quite early. Concerns about the production (beyond testing) use cases for CCI kernel support were discussed, as well as semantics and interfaces that constrain qemu, such as host and switch coupling and differences with BMC behavior.&lt;/span&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(ii)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1453/&quot; style=&quot;text-align: left;&quot;&gt;CXL Type-2 core support&lt;/a&gt;&lt;span style=&quot;text-align: left;&quot;&gt;. The state and purpose of existing experimental support for type 2 (accelerators) devices was presented, for both the kernel and qemu sides. The kernel support led to preliminary&amp;nbsp;abstraction improvement work being upstreamed, facilitating actual accelerator&amp;nbsp;integration with the cxl core driver. However, the rest is merely guess work and the floor is open for an actual hardware backed proposal. 
In addition, HDM-DB support would also be welcomed as a step forward.&amp;nbsp;&lt;/span&gt;The qemu side is very basic and designed to just exercise&amp;nbsp;core checks, for which it&#39;s emulation should be limited, specially in light of cxl_test.&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(iii)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1457/&quot;&gt;Plumbing challenges in Dynamic capacity device&lt;/a&gt;. An in-depth coverage and discussion, from a kernel side, of the state of DCD support and considerations around corner cases. Semantics of releasing DC for full partial extents (ranges) are two different beasts altogether. Releasing all the already given&amp;nbsp; memory can simply require memory being offline and be done, avoiding unnecessary complexity in the kernel. Therefore the kernel can perfectly well reject the request, and FM design should keep that into consideration. Partial extents, on the other hand, are unsupported for the sake of simplicity, at least until a solid industry use case comes along. Forced DC removal of online memory semantics were also discussed, emphasizing that such DC memory is not guaranteed to ever be given back by the kernel, mapped or not. Forcing the event, the hardware does not care and the kernel has most likely crashed anyway. Support for extent tagging was another topic, establishing the need for supporting it, coupling a device to a tag domain, being a sensible use case. 
For now at least, the implementation can be kept to to simply enumerate tags and the necessary attributes to leave the memory matching to userspace, instead of more complex surgeries to create DAX devices on specific extents, dealing with sparse regions.&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(iv)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1452/&quot; style=&quot;text-align: left;&quot;&gt;Adding RAS Support for CXL Port Devices&lt;/a&gt;&lt;span style=&quot;text-align: left;&quot;&gt;. Starting with a general overview of RAS, t&lt;/span&gt;&lt;span style=&quot;text-align: left;&quot;&gt;his touched on the current state for support in CXL 1.1 and 2.0.&amp;nbsp; Special handling is required for RCH:&amp;nbsp;&lt;/span&gt;&lt;span&gt;d&lt;/span&gt;&lt;span&gt;ue to the RCRB implementation, the RCH downstream port does not have a BDF&lt;/span&gt;&lt;span&gt;, needed for AER error handling; this work was merged in v6.7. As for CXL Virtual Hierarchy implementation, it is left still open, potentially things could move away from the PCIe port service driver model, which is not entirely liked. There are however, clear requirements: not-CXL specific (AER is a PCIe protocol, used by CXL.io); implement driver callback logic specific to that technology or device, giving flexibility to handle that specific need; and allow enable/disable on a per-device granularity. 
There were discussions around the order for which a registration handler is added in the PCI port driver, noting that it made sense to go top-down from the port and searching children, instead of written from a lower level.&lt;/span&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;&lt;span style=&quot;text-align: left;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(v)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1455/&quot; style=&quot;text-align: left;&quot;&gt;Shared CXL 3 memory: what will be required?&lt;/a&gt;&amp;nbsp;Overview of the state, semantics and requirements for supporting shared fabric attached memory (FAM). A strong enablement use case is leveraging applications that already handle data sets in files. In addition appropriate workload candidates will fit the &quot;master writer, multiple readers&quot; read-only model for which this sort of machinery would make sense. Early results show that the benefits can out-weigh costly remote CXL memory access such as fitting larger data sets in FAM that would otherwise be possible in a single host. Similarly this avoids cache-coherency costs by simply never modifying the memory. A number of concrete data science and AI usecases were presented. 
Shared FAM is meant to be mmap-able, file-backed, special purpose memory, for which a FAMFS prototype is described, overcoming limitations of just using DAX device/FSDAX, such as distributing metadata in a shareable way.&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(vi)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1458/&quot; style=&quot;text-align: left;&quot;&gt;CXL Memory Tiering for heterogenous computing&lt;/a&gt;&lt;span style=&quot;text-align: left;&quot;&gt;. Discusses the pros and cons of interleaving heterogeneous (ie: DRAM and CXL) memory through hardware and/or software for bandwidth optimization. Hardware interleaving is simple to configure through the BIOS, but limited by not allowing the OS to manage allocations, otherwise hiding the NUMA topology (single node) as well as being a static configuration. The software interleaving solves these limitations with hardware and relies on weighted nodes for allocation distribution when doing the initial mapping (vma). Several interfaces have been posted, which incrementally are converging into a NUMA node based interface. The caveat is to have a single (configurable) system-wide set of weights, or to allow more flexibility, such as hierarchically through cgroups - something which has not been particularly sold yet. 
Combining both hardware and software models relies on within a socket, splitting channels among respective DDR and CXL NUMA nodes for which software can explicitly (numactl) set the interleaving - it is still restrained however by being static as the BIOS is in charge of setting the number of NUMA nodes.&amp;nbsp;&lt;/span&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;&lt;span style=&quot;text-align: left;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;(vii)&amp;nbsp;&lt;a href=&quot;https://lpc.events/event/17/contributions/1456/&quot; style=&quot;text-align: left;&quot;&gt;A move_pages() equivalent for physical memory&lt;/a&gt;&lt;span style=&quot;text-align: left;&quot;&gt;. Through an experimental interface, this focused on the semantics of tiering and device driven page movement. There are currently various mechanisms for access detection, such as PMU-based, fault hinting for page promotion and idle bit page monitoring; each with its set of limitations, while runtime overhead is a universal concern. Hardware mechanisms could help with the burden but the problem is that devices only know physical memory and must therefore do expensive reverse mapping lookups; nor are there any interfaces for this, and it is difficult to with out hardware standardization. 
A good starting point would be to keep the suggested move_phys_pages as an interface, but not have it be an actual syscall.&lt;/span&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/3709268265882614643/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2023/12/lpc-2023-cxl-microconference.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3709268265882614643'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3709268265882614643'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2023/12/lpc-2023-cxl-microconference.html' title='LPC 2023: CXL Microconference'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhLnHpM3hQsrkhKM7R4pj0vDJUZKJhfwPNPO1vpFtHIteNAyqUKmvRg8V9vSi1s0vTxZpCWdOkkx3FpuWcULEtfosEyfoDyKeWzwi44a4FZ_XzRMTiq4XMz9ZREeGRECbJm158ljiJVt4DgZZ04W8cMzSvO6wO_K5WFPDaZ4BDxzsSWg75Y8rogzoivgzz6/s72-w320-h197-c/IMG_1436.jpg" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-1793401371820330574</id><published>2019-09-10T12:26:00.002-07:00</published><updated>2019-09-10T12:26:59.854-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="operating systems"/><category scheme="http://www.blogger.com/atom/ns#" 
term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="performance-goodies"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v5.2"/><title type='text'>Linux v5.2: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
locking/rwsem: optimize trylocking for the uncontended case&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This applies the idea that in most cases, a rwsem will be uncontended (single threaded). For example, experimentation showed that page fault paths really expect this. The change itself makes the code basically not read in a cacheline in a tight loop over and over. Note however that this can be a double edged sword, as microbenchmarks have shown performance deterioration upon high amounts of tasks, albeit mainly pathological workloads.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ddb20d1d3aed8f130519c0a29cd5392efcc067b8&quot;&gt;ddb20d1d3aed&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a338ecb07a338c9a8b0ca0010e862ebe598b1551&quot;&gt;a338ecb07a33&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
lib/lockref: limit number of cmpxchg loop retries&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Unbounded loops are rather frowned upon, especially ones doing CAS operations. As such, Linus suggested adding an arbitrary upper bound to the loop to force the slowpath (spinlock fallback), which was seen to improve performance on an ad-hoc testcase on hardware that incurs the loop retry game.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=893a7d32e8e04ca4d6c882336b26ed660ca0a48d&quot;&gt;893a7d32e8e0&lt;/a&gt;]&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
rcu: avoid unnecessary softirqs when system is idle&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Upon an idle system with no pending callbacks, rcu softirqs to process callbacks were being triggered repeatedly. Specifically the mismatch between &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;cpu_no_qs&lt;/span&gt; and &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;&lt;i&gt;core_need_rq&lt;/i&gt;&lt;/span&gt; was addressed.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=671a63517cf983ad8eaa324167165cef245ab744&quot;&gt;671a63517cf9&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
rcu: fix potential cond_resched() slowdowns&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When using the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;jiffies_till_sched_qs&lt;/span&gt; kernel boot parameter, a bug made&lt;i&gt; &lt;/i&gt;&lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;jiffies_to_sched_qs&lt;/span&gt; become uninitialized as zero and therefore impacts negatively on &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;cond_resched()&lt;/span&gt;.&lt;i&gt;&lt;br /&gt;&lt;/i&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6973032a602ee678c98644a30d57ebf9c72dd6d3&quot;&gt;6973032a602e&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: improve vmap allocation&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Doing a vmalloc can be quite slow at times, and with it being done with preemption disabled, can affect workloads that are sensitive to this. The problem lies in the fact that a new VA area is done over a busy list iteration until a suitable hole is found between two busy areas. The changes propose the always reliable red-black tree to keep blocks sorted by their offsets along with a list keeping the free space in order of increasing addresses. &lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=68ad4a3304335358f95a417f2a2b0c909e5119c4&quot;&gt;68ad4a330433&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=68571be99f323c3c3db62a8513a43380ccefe97c&quot;&gt;68571be99f32&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/gup: safe usage of get_user_pages_fast() with DAX&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Users of &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;get_user_pages_fast()&lt;i&gt; &lt;/i&gt;&lt;/span&gt;have potential performance benefits compared to its non-fast equivalent, by avoiding mmap_sem. However drivers such as rdma can pin these pages for a significant amount of time, where a number of issues come with the filesystem as referenced pages will block a number of critical operations and is known to &lt;a href=&quot;https://lwn.net/Articles/784574/&quot;&gt;mess up DAX&lt;/a&gt;. A new &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;FOLL_LONGTERM&lt;/span&gt; flag is added and checked accordingly; which also means that other users such as xdp can now also be converted to gup_fast.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=932f4a630a695212bdc7379b05f9bd0dafc5d968&quot;&gt;932f4a630a69&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b798bec4741bdd80224214fdd004c8e52698e425&quot;&gt;b798bec4741b&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73b0140bf0fe9df90fb267c00673c4b9bf285430&quot;&gt;73b0140bf0fe&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7af75561e17132b20b5bc047d222f34b3e7a3e6e&quot;&gt;7af75561e171&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9fdf4aa156733e3f075a9d7d0b026648b3874afe&quot;&gt;9fdf4aa15673&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=664b21e717cfe4781137263f2555da335549210e&quot;&gt;664b21e717cf&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f3b4fdb18cb51bd6ca2c245fbe630ccbea95b3c9&quot;&gt;f3b4fdb18cb5&lt;/a&gt; ]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
lib/sort: faster and smaller&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Because &lt;i&gt;CONFIG_RETPOLINE&lt;/i&gt; has made indirect calls much more expensive, these changes reduce the number made by the library sort functions, &lt;i&gt;lib/sort&lt;/i&gt; and &lt;i&gt;lib/list_sort&lt;/i&gt;. A number of optimizations and clever tricks are used such as a more efficient bottom up heapsort and playing nicer with store buffers.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37d0ec34d111acfdb82b24e3de00d926c0aece4d&quot;&gt;37d0ec34d111&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=22a241ccb2c19962a0fb02c98154aa93d3fc1862&quot;&gt;22a241ccb2c1&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8fb583c4258d08f0aff105aa2ae5157b7d414ea2&quot;&gt;8fb583c4258d&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=043b3f7b6388fca6be86ca82979f66c5723a0d10&quot;&gt;043b3f7b6388&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b5c56e0cdd62979dd538e5363b06be5bdf735a09&quot;&gt;b5c56e0cdd62&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
ipc/mqueue: make msg priorities truly O(1)&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
By keeping the pointer to the tree&#39;s rightmost node, the process of consuming a message can be done in constant time, instead of logarithmic.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a5091fda4e3c202aeb1728a86d0fcd20fd0f4f5e&quot;&gt;a5091fda4e3c&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
x86/fpu: load FPU registers on return to userland&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This is a large, 27-patch, cleanup and optimization to only load fpu registers on return to userspace, instead of upon every context switch. This means that tasks that remain in kernel space do not load the registers. Accessing the fpu registers in the kernel requires disabling preemption and bottom-halves for scheduler and softirqs, accordingly. &lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2722146eb78451b30e4717a267a3a2b44e4ad317&quot;&gt;2722146eb784&lt;/a&gt; ... &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a5eff7259790d5314eff10563d6e59d358cce482&quot;&gt;a5eff7259790&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
x86/hyper-v: implement EOI optimization&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Avoid a vmexit on EOI. This was seen to slightly improve IOPS when testing nvme disks with raid and ext4.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ba696429d290690db967e5f49463df4b2c1314a4&quot;&gt;ba696429d290&lt;/a&gt;]&lt;/div&gt;
&amp;nbsp; &lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
btrfs: improve performance on fsync of files with multiple hardlinks&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
A fix to a performance regression seen in pgbench which can make fsync a full transaction commit in order to avoid losing hard links and new ancestors of the fsynced inode.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b8aa330d2acb122563be87c42d82c5c8649cf658&quot;&gt;b8aa330d2acb&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fsnotify: fix unlink performance regression&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This restores an unlink performance optimization that avoids &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;take_dentry_name_snapshot()&lt;/span&gt;.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4d8e7055a4058ee191296699803c5090e14f0dff&quot;&gt;4d8e7055a405&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
block/bfq: do not merge queues on flash storage with queuing&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Disable queue merging on non-rotational devices with internal queueing, thus boosting throughput on interleaved IO.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8cacc5ab3eacf5284bc9b0d7d5b85b748a338104&quot;&gt;8cacc5ab3eac&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/1793401371820330574/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2019/09/linux-v52-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/1793401371820330574'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/1793401371820330574'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2019/09/linux-v52-performance-goodies.html' title='Linux v5.2: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-4374100953442705541</id><published>2019-05-09T13:10:00.001-07:00</published><updated>2019-05-09T13:10:23.652-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="operating systems"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="performance-goodies"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v5.1"/><title type='text'>Linux v5.1: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched/wake_q: reduce atomic operations for special users&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Some core users of &lt;i&gt;wake_qs&lt;/i&gt;,
 futex and rwsems were incurring double task reference counting - 
which was a side effect for safety reasons. This change levels the 
call&#39;s performance with the rest of the users.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=07879c6a3740fbbf3c8891a0ab484c20a12794d8&quot;&gt;07879c6a3740&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
irq: Speedup for interrupt statistics in /proc/stat&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
On large systems with a large amount of interrupts the
readout of &lt;i&gt;/proc/stat &lt;/i&gt;takes a long time to sum up the interrupt
statistics.&amp;nbsp;
The reason for this is that interrupt statistics are accounted per cpu. So
the &lt;i&gt;/proc/stat &lt;/i&gt;logic has to sum up the interrupt stats for each interrupt. While applications shouldn&#39;t really be doing this to a point where it creates bottlenecks, the fix was fairly easy. &lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1136b0728969901a091f0471968b2b76ed14d9ad&quot;&gt;1136b0728969&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/swapoff: replace quadratic complexity with linear&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;i&gt;try_to_unuse() &lt;/i&gt;is of quadratic complexity, with a lot of wasted effort.
It unuses swap entries one by one, potentially iterating over all the
page tables for all the processes in the system for each one. With these changes, it now iterates over the system&#39;s mms once, unusing
all the affected entries as it walks each set of page tables.&lt;br /&gt;
&lt;br /&gt;
Improvements show time reductions for &lt;i&gt;swapoff&lt;/i&gt; being called on a swap partition containing about 6G of data, from 8 to 3 minutes. &lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5bf121e4350a933bd431385e6fcb72a898ecc68&quot;&gt;c5bf121e4350&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b56a2d8af9147a4efe4011b60d93779c0461ca97&quot;&gt;b56a2d8af914&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&amp;nbsp;
mm: make pinned_vm an atomic counter&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This reduces some of the bulky &lt;i&gt;mmap_sem&lt;/i&gt; games that are played when, mostly rdma, deals with the pinned pages counter. It also pivots on not relying on the lock for &lt;i&gt;get user pages&lt;/i&gt; operations.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=70f8a3ca68d3e1f3344d959981ca55d5f6ec77f7&quot;&gt;70f8a3ca68d3&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3a2a1e90564e3ad215aa5c6ddc0e741cd6208a93&quot;&gt;3a2a1e90564e&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b95df5e3e45914c679fa5d4ca08abdd1c98b9f50&quot;&gt;b95df5e3e459&lt;/a&gt;] &lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
drivers/async: NUMA aware async_schedule calls&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div class=&quot;content&quot;&gt;
Asynchronous function calls reduce, primarily, kernel boot time by safely doing out of order operations, such as device discovery. This series improves the NUMA
locality by being able to
schedule device specific init work on specific NUMA nodes in order to
improve performance of memory initialization. Significant reductions in init times for persistent
memory were seen.&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3451a495ef244a88ed6317a035299d835554d579&quot;&gt;3451a495ef24&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ed88747c6c4a2fc2f961a36d4c50cb0868c30229&quot;&gt;ed88747c6c4a&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ef0ff68351be4fd83bec2d797f0efdc0174a55a4&quot;&gt;ef0ff68351be&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8204e0c1113d6b7f599bcd7ebfbfde72e76c102f&quot;&gt;8204e0c1113d&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6be9238e5cb64741ff95c3ae440b112753ad93de&quot;&gt;6be9238e5cb6&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c37e20eaf4b21125898fd454f3ea6b212865d0a6&quot;&gt;c37e20eaf4b2&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8b9ec6b732775849f506aa6c2649e626e82a297c&quot;&gt;8b9ec6b73277&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=af87b9a7863c7bb47f8bd015c0ce4a37d70c5225&quot;&gt;af87b9a7863c&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=57ea974fb8717864e8b7ec679363c5a3298a165e&quot;&gt;57ea974fb871&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
lib/iov_iter: optimize page_copy_sane()&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div class=&quot;content&quot;&gt;
This avoids cacheline misses when dereferencing a &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;struct page&lt;/span&gt;, via &lt;i&gt;compound_head()&lt;/i&gt;, when possible. Apparently the overhead was visible on TCP doing &lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;i&gt;recvmsg() &lt;/i&gt;&lt;/span&gt;calls dealing with GRO packets.&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6daef95b8c914866a46247232a048447fff97279&quot;&gt;6daef95b8c91&lt;/a&gt;]&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fs/epoll: reduce lock contention in ep_poll_callback()&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This patch
increases the bandwidth of events which can be delivered from sources to
the poller by adding poll items in a lockless way to the ready list; via clever ways of &lt;i&gt;xchg() &lt;/i&gt;while holding a reader &lt;i&gt;rwlock&lt;/i&gt; . This improves &lt;a href=&quot;https://github.com/rouming/test-tools/blob/master/stress-epoll.c&quot;&gt;scenarios&lt;/a&gt; with multiple threads generating IO events&lt;span class=&quot;pl-c&quot;&gt; which are delivered to a single threaded &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;epoll_wait()&lt;/span&gt;er.&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c141175d011f18252abb9aa8b018c4e93c71d64b&quot;&gt;c141175d011f&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c3e320b61581ef7919269ca242ff13951ccfc763&quot;&gt;c3e320b61581&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a218cc4914209ac14476cb32769b31a556355b22&quot;&gt;a218cc491420&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fs/nfs: reduce cost of listing huge directories (readdirplus)&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When listing very large directories via NFS, clients may take a long
time to complete. Most of the culprit is in various degrees of libc&#39;s &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;readdir(2)&lt;/span&gt; reading 32k files at a time. To improve performance and reduce the number of rpc calls, the NFS
readdirplus rpc will ask for more data (more than 32k); the data can
fill more than one page, and the cached pages can be used for the next readdir
call. Benchmarks show rpc calls decreasing by 85% while listing a directory with 300k files.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be4c2d4723a4a637f0d1b4f7c66447141a4b3564&quot;&gt;be4c2d4723a4&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fs/pnfs: Avoid read/modify/write when it is not necessary&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When testing with &lt;i&gt;fio&lt;/i&gt;, Throughput of overwrite (both buffered and O_SYNC) is noticeably
improved.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=97ae91bbf3a70fc8cee3c9030564cfc892cc8cee&quot;&gt;97ae91bbf3a7&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2cde04e90d5be46b4b6655b965b496e6b6f18e49&quot;&gt;2cde04e90d5b&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/4374100953442705541/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2019/05/linux-v51-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/4374100953442705541'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/4374100953442705541'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2019/05/linux-v51-performance-goodies.html' title='Linux v5.1: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-3604346424115444678</id><published>2019-05-09T13:10:00.000-07:00</published><updated>2019-05-09T13:10:08.902-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="operating systems"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="performance-goodies"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v5.0"/><title type='text'>Linux v5.0: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/page-alloc: reduce zone-&amp;gt;lock contention&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Contention in the page allocator was seen in a network traffic report, in which order-0 allocations are being freed directly back to the buddy allocator, instead of making use of percpu-pages in the &lt;i&gt;page_frag_free()&lt;/i&gt; call. Aside from eliminating the contention, it was seen to improve some microbenchmarks.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=65895b67ad27df0f62bfaf82dd5622f95ea29196&quot;&gt;65895b67ad27&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/mremap: improve scalability on large regions&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When THP is disabled, &lt;i&gt;move_page_tables()&lt;/i&gt; can bottleneck a large &lt;i&gt;mremap()&lt;/i&gt; call, as it will copy each pte at a time. This patch speeds up the performance by copying at the PMD level when possible. Up to 20x speedups were seen when doing a 1Gb remap.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2c91bd4a4e2e530582d6fd643ea7b86b27907151&quot;&gt;2c91bd4a4e2e&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: improve anti-fragmentation&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Given sufficient time or an adverse workload, memory gets fragmented and the long-term success of high-order allocations degrades. 
Overall the series reduces external fragmentation causing events by over 94%
on 1 and 2 socket machines, which in turn impacts high-order allocation
success rates over the long term.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6bb154504f8b496780ec53ec81aba957a12981fa&quot;&gt;6bb154504f8b&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a921444382b49cc7fdeca3fba3e278bc09484a27&quot;&gt;a921444382b4&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a79cdad5eb213b3a629e624565b1b3bf9192b7c&quot;&gt;0a79cdad5eb2&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1c30844d2dfe272d58c8fc000960b835d13aa2ac&quot;&gt;1c30844d2dfe&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/hotplug: optimize clear hw_poisoned_pages()&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
During hotplug remove, the kernel will loop for the respective number of pages looking for poisoned pages. Check the atomic hint in case there are none, and optimize the function.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5eb570a8d9248e0c1358078a59916d0e337e695b&quot;&gt;5eb570a8d924&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/ksm: Replace jhash2 with xxhash&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;i&gt;xxhash&lt;/i&gt; is an extremely fast non-cryptographic hash algorithm for checksumming, making it suitable to use in kernel samepage merging. On a custom KSM benchmark, throughput was seen to improve from 1569 to 8770 MB/s.&lt;br /&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&amp;nbsp;[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0b9df58b79fa283fbedc0fb6a8e248599444bacc&quot;&gt;0b9df58b79fa&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=59e1a2f4bf83744e748636415fde7d1e9f557e05&quot;&gt;59e1a2f4bf83&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
genirq/affinity: Spread IRQs to all available NUMA nodes &lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
If the number of NUMA nodes exceeds the number of MSI/MSI-X interrupts
which are allocated for a device, the interrupt affinity spreading code
fails to spread them across all nodes. NUMA nodes above the number of interrupts are all assigned
to hardware queue 0 and therefore NUMA node 0, which results in bad
performance and has CPU hotplug implications. Fix this by assigning via round-robin.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b82592199032bf7c778f861b936287e37ebc9f62&quot;&gt;b82592199032&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fs/epoll: Optimizations for epoll_wait()&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Various performance changes oriented towards improving the waiting side, such that contention on the epoll waitqueue spinlock (previously &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;ep-&amp;gt;lock&lt;/span&gt;) is reduced. This produces pretty good results for various concurrent &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;epoll_wait(2)&lt;/span&gt; benchmarks. &lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=74bdc129850c32eaddc625ce557da560303fbf25&quot;&gt;74bdc129850c&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4e0982a00564c80cb849a892043450860ef91e14&quot;&gt;4e0982a00564&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76699a67f3041ff4c7af6d6ee9be2bfbf1ffb671&quot;&gt;76699a67f304&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=21877e1a5b520132f54515f8835c963056418b4c&quot;&gt;21877e1a5b52&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=c5a282e9635e9c7382821565083db5d260085e3e&quot;&gt;c5a282e9635e&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=abc610e01c663e25c41a3bdcbc4115cd7fbb047b&quot;&gt;abc610e01c66&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=86c051793b4c941ee4481725d57cf2a27f6b3aaf&quot;&gt;86c051793b4c&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
lib/sbitmap: Various optimizations&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Two optimizations to the sbitmap core were introduced, which is used, for example, by the block-mq tags. The first optimizes wakeup checks and adds to the core api, while the second introduces batched clearing of bits, trading 64 atomic bitops for 2 &lt;i&gt;cmpxchg&lt;/i&gt; calls.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5d2ee7122c73be6a3b6bfe90d237e8aed737cfaa&quot;&gt;5d2ee7122c73&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ea86ea2cdced20057da4d2c32965c1219c238197&quot;&gt;ea86ea2cdced&lt;/a&gt;]&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&lt;/h4&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;h4 style=&quot;text-align: justify;&quot;&gt;
fs/locks: Avoid thundering herd wakeups&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When one thread releases a lock on a given file, it wakes up all other threads that
are waiting (classic thundering-herd) - one will get the lock and the
others go to sleep.&amp;nbsp;
The overhead starts being noticeable with increasing thread counts. These changes create a tree of pending lock requests in which siblings
don&#39;t conflict and each lock request does conflict with its parent.
When a lock is released, only requests which don&#39;t conflict with each
other are woken.&lt;br /&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Testing shows that lock-acquisitions-per-second is now fairly stable even
as number of contending process goes to 1000.  Without this patch,
locks-per-second drops off steeply after a few 10s of processes. Micro-benchmarks can be found per the &lt;a href=&quot;https://github.com/mwilck/lockscale&quot;&gt;lockscale&lt;/a&gt; program, which tests &lt;code&gt;fcntl(..., F_OFD_SETLKW, ...) &lt;/code&gt;and &lt;code&gt;&lt;code&gt;flock(..., LOCK_EX) &lt;/code&gt;&lt;/code&gt;calls.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6367d6241371566597c9ab6efe4de0abf254eed&quot;&gt;d6367d624137&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5946c4319ebb39af17fb9d6a606c866ce9b88740&quot;&gt;5946c4319ebb&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16306a61d3b7c433c7a127ec6224867b88ece687&quot;&gt;16306a61d3b7&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0e15908979d269a8263b0c0a222b894b9f403e9&quot;&gt;c0e15908979d&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=fd7732e033e30b3a586923b57e338c859e17858a&quot;&gt;fd7732e033e3&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cb03f94ffb070b13bc0fa58b4ef4fdb558418d27&quot;&gt;cb03f94ffb07&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
arm64/lib: improve crc32 performance for deep pipelines&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This change replaces most branches with a branchless code path that overlaps 16 byte loads to process the first (length % 32) bytes, and processes the remainder using a loop that processes 32 bytes at a time.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=efdb25efc7645b326cd5eb82be5feeabe167c24e&quot;&gt;efdb25efc764&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/3604346424115444678/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2019/05/linux-v50-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3604346424115444678'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3604346424115444678'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2019/05/linux-v50-performance-goodies.html' title='Linux v5.0: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-810141038964555693</id><published>2019-02-24T15:53:00.001-08:00</published><updated>2019-02-24T15:53:36.200-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="performance-goodies"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.20"/><title type='text'>Linux v4.20: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With v4.20 out for almost the entire v5.0 rc-cycle, here are some of the more interesting performance related changes that made their way in.&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
signal: Use a smaller struct siginfo in the kernel&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Reduces the memory footprint of &#39;&lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;struct siginfo&lt;/span&gt;&#39;, most of which is just reserved. Ultimately this shrinks the structure from spanning two cachelines to just one.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4ce5f9c9e7546915c559ffae594e6d73f918db00&quot;&gt;4ce5f9c9e754&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched/fair: Fix cpu_util_wake() for &#39;execl&#39; type workloads&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Fix an exec() related performance regression, which was caused by incorrectly calculating load and migrating tasks on exec() when  they shouldn&#39;t be. &lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c469933e772132aad040bd6a2adc8edf9ad6f825&quot;&gt;c469933e7721&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
locking/rwsem: Exit read lock slowpath if queue empty and no writer&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This change presents a new heuristic for optimizing rw-semaphores, specifically in read-mostly scenarios. Before the patch, a reader could find itself in a situation when it was in the slowpath, due to an occasional writer thread, but the writer was then released, and only other readers are now present.&amp;nbsp; At that point the waitqueue was enlarged unnecessarily, causing other readers attempting to lock to see waiting readers. This directly improves some issues found when (ab)using &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;pread64()&lt;/span&gt; and XFS.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4b486b535c33ef354ecf02a2650919004fd7d2b0&quot;&gt;4b486b535c33&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: mmap: zap pages with read mmap_sem in munmap&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When a process unmaps a range of memory, the infamous &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;mmap_sem&lt;/span&gt; would
be held for the duration of the entire &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;munmap()&lt;/span&gt; call, which can be a long time for
big mappings (reportedly up to 18 seconds for a 320Gb mapping).&amp;nbsp; A two-phase approach was done to address this where the key is to unmap the vma first such that the semaphore can be taken exclusively at first then downgrade it such that it can be shared while doing the zapping and freeing of page tables.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dd2283f2605e3b3e9c61bcae844b34f2afa4813f&quot;&gt;dd2283f2605e&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b4cefb36051244bcb5651026d862c332a6cac7df&quot;&gt;b4cefb360512&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cb4922496ae40a775a1b17025eaa1060e8991253&quot;&gt;cb4922496ae4&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
net/tcp: optimize tcp internal pacing&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When TCP implements its own pacing (when no fq packet scheduler is used), it is arming high resolution timer after a packet is sent. But in many cases (like TCP_RR kind of workloads), this high resolution timer expires before the application attempts to write the following packet. Setup the timer only when a packet is about to be sent, and if tcp_wstamp_ns is in the future,&amp;nbsp; showing a ~10% performance increase in TCP_RR workloads.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=864e5c090749448e879e86bec06ee396aa2c19c5&quot;&gt;864e5c090749&lt;/a&gt;]&lt;br /&gt;
&amp;nbsp; &lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fs: better member layout of struct super_block&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Re-organize &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;&#39;struct super_block&#39;&lt;/span&gt; to try and keep some frequently accessed fields on the same cache line as well as grouping the rarely accessed members. This was seen to address a regression on a concurrent &lt;i&gt;unlink&lt;/i&gt; intensive workload.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=99c228a994ec8b1580c43631866fd2c5440f5bfd&quot;&gt;99c228a994ec&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
fs/fuse: improved scalability &lt;/h4&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Two changes with performance-visible effects went in. The first series changes some of the protections for background requests. This allows async reads to avoid taking the fuseconn lock. Secondly, a hash table was implemented for processing requests, which was seen to address ~20% of time spent in &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;request_find()&lt;/span&gt; under some workloads with Virtuozzo storage over rdma.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e287179afe2190faa7b97915cb89215dde5e044b&quot;&gt;e287179afe21&lt;/a&gt;  &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2a23f2b8adbe4bd584f936f7ac17a99750eed9d7&quot;&gt;2a23f2b8adbe&lt;/a&gt;  &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2b30a533148af4f3865c0dcd619ad93ab3f4ba52&quot;&gt;2b30a533148a&lt;/a&gt;  &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ae2dffa39485c6fd4f22321814c7287c274b473a&quot;&gt;ae2dffa39485&lt;/a&gt;  &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=63825b4e1da5a3cba79d835a5925e5daf7db3a77&quot;&gt;63825b4e1da5&lt;/a&gt; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c59fd85e4fd07fdf0ab523a5e9734f5338d6aa19&quot;&gt;c59fd85e4fd0&lt;/a&gt;  &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be2ff42c5d6ebc8552c82a7d1697afae30510ed9&quot;&gt;be2ff42c5d6e&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/810141038964555693/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2019/02/linux-v420-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/810141038964555693'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/810141038964555693'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2019/02/linux-v420-performance-goodies.html' title='Linux v4.20: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-7780282064355244285</id><published>2018-10-25T11:19:00.000-07:00</published><updated>2018-10-25T11:19:31.587-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="performance-goodies"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.19"/><title type='text'>Linux v4.19: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This post marks one year since I began doing these kernel performance goodies write ups,&amp;nbsp; &lt;a href=&quot;https://blog.stgolabs.net/2017/11/linux-v414-performance-goodies.html&quot;&gt;starting from v4.14&lt;/a&gt;. And this week Greg released Linux
 v4.19, so here are some of the changes related to software optimizations, performance and scalability topics across various subsystems.&lt;br /&gt;
&lt;br /&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
epoll: loosen irq safety when possible&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The epoll code uses an irq-safe spinlock to protect concurrent operations to the ready-event linked list. However, with the exception of the callback done from the wakequeues, the calls to the spinlock are never done in irq context, and therefore there is really no need to save and restore interrupts each time the lock is acquired and released. For example, on x86, a &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;POPF&lt;/span&gt; (irqrestore) instruction can be quite expensive as it changes all the flags and therefore potentially heavy on dependencies. These changes yield some measurable results on a range of &lt;i&gt;epoll_wait(2)&lt;/i&gt; microbenchmarks, around 7-20% in raw throughput. This is unsurprising as &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;PUSHF + POPF&lt;/span&gt; is&amp;nbsp; more expensive than &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;STI + CLI&lt;span style=&quot;font-family: inherit;&quot;&gt;.&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=002b343669c474151954266e7fcf727bf7faa851&quot;&gt;002b343669c4&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=304b18b8d6af796c8ece221d34c92aeb1559789b&quot;&gt;304b18b8d6af&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=92e641784055998879942d39c74d4f84fa750968&quot;&gt;92e641784055&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=679abf381a18e945457b01921f667cee9e656a7f&quot;&gt;679abf381a18&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched/numa:&amp;nbsp; migrate pages to local nodes quicker early in the lifetime of a task&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Automatic NUMA Balancing uses a multi-stage pass to decide whether a page should migrate to a local node. This filter avoids excessive ping-ponging if a page is shared or used by threads that migrate cross-node frequently. Threads inherit both page tables and the preferred node ID from the parent. This means that threads can trigger hinting faults earlier than a new task which delays scanning for a number of seconds. As it can be load balanced very early in its lifetime there can be an unnecessary delay before it starts migrating thread-local data. This patch migrates private pages faster early in the lifetime of a thread using the sequence counter as an identifier of new tasks.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37355bdc5a129899f6b245900a8eb944a092f7fd&quot;&gt;37355bdc5a12&lt;/a&gt;]&lt;br /&gt;
&amp;nbsp; &lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
rcu: check if GP already requested&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This commit makes &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;rcu_nocb_wait_gp()&lt;/span&gt; check to see if the current CPU already knows about the needed grace period having already been requested.&amp;nbsp; If so, it avoids acquiring the corresponding leaf rcu_node structure&#39;s lock, thus decreasing contention.&amp;nbsp; This optimization is intended for cases where either multiple leader rcu kthreads are running on the same CPU or these kthreads are running on a non-offloaded (e.g., housekeeping) CPU.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ab5e869c1f7aa30a1210f5e8a277758b0599609f&quot;&gt;ab5e869c1f7a&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
cpufreq/schedutil: take into account time spent in irq&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Time being spent in interrupt handlers was not being accounted for in the CPU utilization when selecting an operating performance point. This can be a significant amount of time which is reported in the normal context time window. The new CPU utilization accounting yields a 10% performance boost on &lt;i&gt;iperf&lt;/i&gt; workloads.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9033ea11889f88f243445495f72441e22256d5e9&quot;&gt;9033ea11889f&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/page_alloc: enlarge zone&#39;s batch size&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The page allocator will first try to use a percpu set of pages, then, if all are used up, ask the Buddy for a batch of pages. The size of this batch can have a number of consequences, including performance. The last time this magic number was increased was 13 years ago, and there have been numerous hardware improvements since then. As such, a recent study with allocator-intensive benchmarks shows that doubling the size of the batch can yield improvements on larger/modern machines.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8a759b5703519d37fa5b752f825cbfc06b57906&quot;&gt;d8a759b57035&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: skip invalid pages a block at a time in zero_resv_unavail()&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The role of zero_resv_unavail() is to make sure that every struct page that is allocated but is not backed by memory that is accessible by kernel is zeroed and not in some uninitialized state. Since struct pages are allocated in blocks we can skip pageblock_nr_pages at a time, when the first one is found to be invalid. This optimization may help since now on x86 every hole in e820 maps is marked as reserved in memblock, and thus will go through this function.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=720e14ebec642bc56c44e5e60a2d595900e5bbf0&quot;&gt;720e14ebec64&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
kvm, x86: implement paravirt &quot;send IPI&quot; hypercall&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Replace sending IPIs one by one for xAPIC physical mode by a single hypercall (vmexit). This patchset lets a guest send multicast IPIs, with at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode. An IPI microbenchmark shows non-trivial performance improvements for broadcast IPIs (send IPI to all online CPUs and force them to take/drop a spinlock).&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4180bf1b655a791a0a6ef93a2ffffc762722c782&quot;&gt;4180bf1b655a&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
arm64: use queued spinlocks&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Similar
 to x86, replace the old ticket spinlocks with fair qspinlocks and make 
use of MCS features as well as better performance under virtualization. 
This is particularly suitable for larger multicore machines.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c11090474d70590170cf5fa6afe85864ab494b37&quot;&gt;c11090474d70&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/7780282064355244285/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2018/10/linux-v419-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/7780282064355244285'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/7780282064355244285'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2018/10/linux-v419-performance-goodies.html' title='Linux v4.19: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-6461425334288193951</id><published>2018-10-15T13:19:00.002-07:00</published><updated>2018-10-15T13:19:36.873-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="performance-goodies"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.18"/><title type='text'>Linux v4.18: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Linux v4.18 has been out two months now, making this post a bit late, but still in time before the next release. Also so much drama in the CoC to care about performance topics :P As always, it comes with a series of performance enhancements and optimizations across subsystems.&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
locking: avoid pointless TEST instructions&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
A number of places within locking primitives have been optimized to avoid superfluous &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;test&lt;/span&gt; instructions for the CAS return by relying on &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;try_cmpxchg&lt;span style=&quot;font-family: &amp;quot;helvetica neue&amp;quot; , &amp;quot;arial&amp;quot; , &amp;quot;helvetica&amp;quot; , sans-serif;&quot;&gt;,&lt;/span&gt;&lt;span style=&quot;font-family: &amp;quot;verdana&amp;quot; , sans-serif;&quot;&gt; generating slightly better code for x86-64&lt;/span&gt;&lt;/span&gt; (for arm64 there is really no difference). Such have been the cases for mutex fastpath (uncontended case) and queued spinlocks.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c427f69564e2a844c5fcf2804042609342513da0&quot;&gt;c427f69564e2&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ae75d9089ff7095d1d1a12c3cd86b21d3eaf3b15&quot;&gt;ae75d9089ff7&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
locking/mcs: optimize cpu spinning&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Some architectures, such as arm64,&amp;nbsp; can enter low-power standby state (spin-waiting) instead of purely spinning on a condition. This is applied to the MCS spin loop, which in turn directly helps queued spinlocks. On x86, this can also be cheaper than spinning on &lt;i&gt;smp_load_acquire()&lt;/i&gt;.&lt;br /&gt;
&lt;pre id=&quot;pre_2011-10-28 14:06:00-9999999997-p&quot; style=&quot;white-space: pre;&quot;&gt;&lt;/pre&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7f56b58a92aaf2cab049f32a19af7cc57a3972f2&quot;&gt;7f56b58a92aa&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/mremap: reduce amount of TLB shootdowns&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
It was discovered that on a heavily dominated &lt;i&gt;mremap&lt;/i&gt; workload, the amount of TLB flushes was excessive, causing overall performance issues. By removing the &lt;i&gt;LATENCY_LIMIT&lt;/i&gt; magic number to handle TLB flushes on a PMD boundary instead of every 64 pages,&amp;nbsp; the amount of shootdowns can be reduced by a factor of 8 in the ideal case.&amp;nbsp; The &lt;i&gt;LATENCY_LIMIT &lt;/i&gt;was almost certainly used originally to limit the PTL hold times but the latency savings are likely shadowed by the cost of IPIs in many cases.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37a4094e828f3c7673aa9c60f8b2b9d1019db81b&quot;&gt;37a4094e828f&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: replace mmap_sem to protect cmdline and environ procfs files&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Reducing (ab)users of the mmap_sem is always good for general address space performance. Introduce a new mm-&amp;gt;arg_lock to protect against races when handling &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;/proc/$PID/{cmdline,environ}&lt;/span&gt; files, this removes (mostly) the semaphore&#39;s requirements.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=88aa7cc688d48ddd84558b41d5905a0db9535c4b&quot;&gt;88aa7cc688d4&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm/hugetlb: make better use of page clearing optimization&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Pass the fault address (the address of the sub-page being accessed) to the nopage fault handler to better use the general huge page clearing optimization. This allows the accessed sub-page to be cleared last, avoiding eviction of its cache lines while the other sub-pages are being cleared. Performance improvements were reported for the &lt;i&gt;vm-scalability.anon-w-seq&lt;/i&gt;&amp;nbsp; workload under hugetlbfs, improving throughput by ~30%.&lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=285b8dcaacfc36b0468aaa03e3c628006ae31381&quot;&gt;285b8dcaacfc&lt;/a&gt;]&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched: don&#39;t schedule threads on pre-empted vCPUs&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
It can be determined whether a vCPU is
running to prioritize CPUs when scheduling threads. If a
vCPU has been pre-empted, it will incur the extra cost of VMENTER and
the time it actually spends to be running on the host CPU. If we had
other vCPUs which were actually running on the host CPU and idle we
should schedule threads there.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=247f2f6f3c706b40b5f3886646f3eb53671258bf&quot;&gt;247f2f6f3c70&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=943d355d7feef380e15a95892be3dff1095ef54b&quot;&gt;943d355d7fee&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched/numa: Stagger NUMA balancing scan periods for new threads&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
It is redundant and counter productive for threads sharing an address space to change the protections to trap NUMA faults. Potentially
only one thread is required but that thread may be idle or it may not have
any locality concerns and pick an unsuitable scan rate.

This patch uses independent scan period but they are staggered based on
the number of address space users when the thread is created. &lt;br /&gt;
&lt;br /&gt;
The intent
is that threads will avoid scanning at the same time and have a chance
to adapt their scan rate later if necessary. This reduces the total scan
activity early in the lifetime of the threads.

The difference in headline performance across a range of machines and
workloads is marginal, but the system CPU usage is reduced, as well as overall
scan activity. &lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1378447598432513d94ce2c607c412dc4f260f31&quot;&gt;137844759843&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
block/bfq: postpone rq preparation to insert or merge&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
A lock contention point is removed (see patch for details and justification) by postponing request preparation to insertion or merging, as no lock needs to be grabbed any longer in the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;prepare_request&lt;/span&gt; hook. &lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=18e5a57d79878b205d39b2f160082d9098e9bfd6&quot;&gt;18e5a57d7987&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
btrfs: improve rmdir performance for large directories&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When checking if a directory can be deleted, instead of ensuring all its children have been processed,&amp;nbsp; this optimization keeps track of the directory index offset of the child last checked in the last call to &lt;i&gt;can_rmdir()&lt;/i&gt;, and then use it as the starting point for future calls. The changes were shown to yield massive performance benefits; for test directory with two million files being deleted the runtime is reduced from half an hour to less than two seconds. &lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0f96f517dcaa58346c32be094aecd610b7d3c008&quot;&gt;0f96f517dcaa&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
KVM: VMX: Optimize tscdeadline timer latency&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Add
 the advance tscdeadline expiration support to which the tscdeadline 
timer is emulated by VMX preemption timer to reduce the hypervisor 
latency (&lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;handle_preemption_timer -&amp;gt; vmentry&lt;/span&gt;).
 The guest can also set an expiration that is very small in that case we
 set delta_tsc to 0, leading to an immediately vmexit when delta_tsc is 
not bigger than advance ns. This patch can reduce ~63% latency for 
kvm-unit-tests/tscdeadline_latency when testing busy waits. &lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5ce8235cffa00c207e24210329094d7634bb467&quot;&gt;c5ce8235cffa&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
net/sched: NOLOCK qdisc performance enhancements and&amp;nbsp; fixes&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
There have been various performance related core changes to the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;NOLOCK&lt;/span&gt; qdisc code. The first begins with reducing the atomic operations of &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;__QDISC_STATE_RUNNING&lt;/span&gt;. The bit is flipped twice per packet in the uncontended scenario with packet rate below the line rate: on packet dequeue and on the next, failing dequeue attempt. The changes simplify the qdisc and move the bit manipulation into the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;qdisc_run_{begin,end}&lt;/span&gt; helpers, so that the bit is now flipped only once per packet, with measurable performance improvement in the uncontended scenario.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Later, the above is actually replaced by using a sequence spinlock instead of the atomic approach to address &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;pfifo_fast&lt;/span&gt; performance regressions. There is also a reduction in the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;Qdisc&lt;/span&gt; struct memory footprint (spanning a cacheline less).&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=96009c7d500efdd5534e83b2e3eb2c58d4b137ae&quot;&gt;96009c7d500e&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=021a17ed796b62383f7623f4fea73787abddad77&quot;&gt;021a17ed796b&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e9be0e993d95adbe5efe0e0f03b2a3e71f5bb2b6&quot;&gt;e9be0e993d95&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
lib/idr: improve scalability by reducing IDA lock granularity&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Improve the scalability of the IDA by using the per-IDA &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;xa_lock&lt;/span&gt; rather than the global &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;simple_ida_lock&lt;/span&gt;.&amp;nbsp; IDAs are not typically used in performance-sensitive locations, but since we have this lock anyway, we can use it. &lt;/div&gt;
&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b94078e69533ba237e2c229bca61bae47e6fafcc&quot;&gt;b94078e69533&lt;/a&gt;]&amp;nbsp; &lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
x86-64: micro-optimize __clear_user()&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Uses immediate constants and saves two registers.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1153933703d927b3d4874c0bc801de32b1b58be9&quot;&gt;1153933703d9&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
arm64: select ARCH_HAS_FAST_MULTIPLIER&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
It is probably safe to assume that all Armv8-A implementations have a multiplier whose efficiency is comparable or better than a sequence of three or so register-dependent arithmetic instructions. Select ARCH_HAS_FAST_MULTIPLIER to get ever-so-slightly nicer codegen in the few dusty old corners which care.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e75bef2a4fe259b779765a85589e92657d26fdc9&quot;&gt;e75bef2a4fe2&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/6461425334288193951/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2018/10/linux-v418-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/6461425334288193951'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/6461425334288193951'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2018/10/linux-v418-performance-goodies.html' title='Linux v4.18: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-811100010210661002</id><published>2018-06-05T07:51:00.003-07:00</published><updated>2018-06-05T07:51:41.602-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.17"/><title type='text'>Linux v4.17: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With Linux v4.17 now released, there are some interesting performance changes that went worth looking at. As always, the
 term &#39;&lt;i&gt;performance&lt;/i&gt;&#39; can be vague in 
that some gains in one area can negatively affect another so take everything with a grain of salt.&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sysvipc: introduce STAT_ANY commands&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
There was a permission discrepancy when consulting shm ipc object metadata
between &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;/proc/sysvipc/shm&lt;/span&gt; (0444) and getting stat info (such as via &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;SHM_STAT shmctl&lt;/span&gt; command).  The
latter does permission checks for the object vs S_IRUGO.  As such there can
be cases where EACCES is returned via syscall but the info is displayed
anyways in the procfs files.

While this might have security implications via info leaking (albeit no
writing to the shm metadata), this behavior goes way back and showing all
the objects regardless of the permissions was most likely an oversight - so
we are stuck with it.&lt;/div&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Some
applications require getting the procfs info (without root privileges) and
can be rather slow in comparison with a syscall -- up to 500x in some
reported cases. For this, the new &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;{SEM,SHM,MSG}_STAT_ANY &lt;/span&gt;commands have been introduced.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c21a6970ae727839a2f300cd8dd957de0d0238c3&quot;&gt;c21a6970ae72&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a280d6dc77eb6002f269d58cd47c7c7e69b617b6&quot;&gt;a280d6dc77eb&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23c8cec8cf679b10997a512abb1e86f0cedc42ba&quot;&gt;23c8cec8cf67&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
kvm: x86 paravirtualization hints and KVM_HINTS_DEDICATED&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When
 dealing with CPU virtualization, many in-kernel heuristics and 
optimizations revolve around the overcommited scenario.&amp;nbsp; By introducing &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;KVM_HINTS_DEDICATED&lt;/span&gt;,
 the hypervisor administrator can select this option when there are 
pinned 1:1 virtual to physical CPU scenarios; particularly reducing the 
paravirt overhead in locking and TLB flushing as the vCPU is most 
unlikely to get preempted. In these cases, native qspinlock may perform 
better than pvqspinlock as it disables paravirt spinlock slowpath 
optimizations. There is an older Xen equivalent available as a kernel 
parameter: &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;xen_nopvspin&lt;/span&gt;. &lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b2798ba0b8769b42f00899b44a538b5fcecb480d&quot;&gt;b2798ba0b876&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34226b6b70980a8f81fff3c09a2c889f77edeeff&quot;&gt;34226b6b7098&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6beacf74c25711d5ee83412a3abc839af8ce6697&quot;&gt;6beacf74c257&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched: rework idle loop&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Rework
 the idle loop in order to prevent CPUs from spending too much time in 
shallow idle states by making it stop the scheduler tick before putting 
the CPU into an idle state only if the idle duration predicted by the 
idle governor is long enough.  It reduces idle power on some systems by 
10% or more and may improve performance of workloads in which the idle 
loop overhead matters. This required the code to be reordered to invoke 
the idle governor before stopping the tick, among other things.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0e7767687fdabfc58d5046e7488632bf2ecd4d0c&quot;&gt;0e7767687fda&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2aaf709a518d26563b80fd7a42379d7aa7ffed4a&quot;&gt;2aaf709a518d&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ed98c34919985a9f87c3edacb9a8d8c283c9e243&quot;&gt;ed98c3491998&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&amp;nbsp;mm: pcpu pages optimizations around zone lock&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Two optimizations around &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;zone-&amp;gt;lock&lt;/span&gt; in &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;free_pcpupages_bulk()&lt;/span&gt;
 that yield around a 5% performance improvement in page-fault benchmarks
 (will-it-scale in this case). The first reduces the scope of the lock when 
freeing a batch of pages back to buddy. Considering the per-cpu 
semantics, the lock was unnecessarily held while pages are chosen from 
the pcpu page&#39;s migratetype
list.&lt;br /&gt;
&lt;br /&gt;
The second improvement adds a prefetch to the 
to-be-freed page&#39;s buddy outside of&amp;nbsp; the lock in hope that accessing the
 buddy&#39;s page structure later with the lock held will be faster. 
Normally prefetching is frowned upon, particularly for microbenchmarks, 
however in the particular case the prefetched pointer will always be 
used.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a5f4e5b45625e75db85b4968fc4c232d8091143&quot;&gt;0a5f4e5b4562&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=97334162e4d79f866edd7308aac0ab3ab7a103f7&quot;&gt;97334162e4d7&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: lockless list_lru_count_one()&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
During slab reclaim for a memcg, &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;shrink_slab()&lt;/span&gt; iterates over all
registered shrinkers in the system, trying to count and consume
objects related to the cgroup.  In case of memory pressure, the operation was bottlenecked while trying to acquire the &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;nlru-&amp;gt;lock&lt;/span&gt;.
 By applying RCU to the data structure, the lookup can be done without 
taking the lock, which translates into the overall contention pretty much 
disappearing.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c7c1bed7e13dbb545375c231e6ba1dca5e8d725&quot;&gt;0c7c1bed7e13&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
memory hotplug optimizations&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Such
 optimizations reduce the number of times struct pages are traversed 
during a memory hotplug operation, from three to one. Among other 
benefits, the memory hotplug is made similar to the boot memory 
initialization
   path because it initializes struct pages only in one
   function. Finally, this improves memory hotplug performance because 
the cache is not being evicted several times and also reduces loop 
branching overhead.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d0dc12e86b3197a14a908d4fe7cb35b73dda82b5&quot;&gt;d0dc12e86b31&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
procfs: miscellaneous optimizations&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Access to various files within procfs have been optimized by replacing calls to &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;seq_printf()&lt;/span&gt; with lower cost alternatives. Changes show some performance benefits for ad-hoc microbenchmarks.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0e3dc019143104a6e676287b1e453cccd7add404&quot;&gt;0e3dc0191431&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8cfa67b4d9a9d9a6061f3cfd0e0ed16e66e45984&quot;&gt;8cfa67b4d9a9&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d1be35cb6f96975d792a1535d3fe9b75239065ee&quot;&gt;d1be35cb6f96&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f66406638fffe874c56e7e41106167c5235f251e&quot;&gt;f66406638fff&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=48dffbf82d2f17bc6dd3c2b7fd733738ea567914&quot;&gt;48dffbf82d2f&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d0f02231222b313d1b49278cd2e3c7e7406fea6d&quot;&gt;d0f02231222b&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
btrfs: relax barrier when unlocking an extent buffer&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Serializing checks for active waitqueue requires a barrier as it can race with&amp;nbsp; the waiter side. Such is the case with &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;btrfs_tree_unlock()&lt;/span&gt;,
 which was abusing the barrier semantics on architectures where atomic 
operations are ordered, such as x86. A performance improvement is 
immediately noticeable by optimizing barrier usage while maintaining the
 necessary semantics.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e32ef87b074cb8098436634b649b4b2b523acbe&quot;&gt;2e32ef87b074&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
x86/pti: leave kernel text global for no PCID&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
From the patch: Global pages are bad for hardening because they potentially let an
exploit read the kernel image via a Meltdown-style attack.

But, global pages are good for performance because they reduce TLB
misses when making user/kernel transitions, especially when PCIDs
are not available, such as on older hardware, or where a hypervisor
has disabled them for some reason.&lt;br /&gt;
&lt;br /&gt;
This change implements a basic, sane policy: If PCIDs are available, only map a minimal amount of kernel text global.  If no
PCIDs, map all kernel text global. This translates into a considerable throughput increase on an &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;lseek&lt;/span&gt; microbenchmark.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8c06c7740d191b9055cb9be920579d5ecdd26303&quot;&gt;8c06c7740d19&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
lib/raid6/altivec: Add vpermxor implementation for raid6 Q syndrome&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This enhancement uses the &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;vpermxor&lt;/span&gt; instruction to optimize the raid6 Q
syndrome. This instruction was made available with POWER8, ISA version
2.07. It allows for both vperm and vxor instructions to be done in a
single instruction. The benchmark results show a 35%
speed increase over the best existing algorithm for powerpc (altivec).&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=751ba79cc552c146595cd439b21c4ff8998c3b69&quot;&gt;751ba79cc552&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/811100010210661002/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2018/06/linux-v417-performance-goodies_5.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/811100010210661002'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/811100010210661002'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2018/06/linux-v417-performance-goodies_5.html' title='Linux v4.17: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-5909398934385689842</id><published>2018-05-07T10:53:00.001-07:00</published><updated>2018-05-07T10:53:14.743-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.16"/><title type='text'>Linux v4.16: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;
Linux v4.16 was released a few weeks ago and continues the mitigation of meltdown and spectre bugs for x86-64, as well as for arm64 and IBM s390. While v4.16 is not the most exciting kernel version in terms of performance and scalability, the following is an unsorted and incomplete list of changes that went in which I have cherry-picked. As always, the term &#39;&lt;i&gt;performance&lt;/i&gt;&#39; can be vague in 
that some gains in one area can negatively affect another so take everything with a grain of salt.&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched: reduce migrations and spreading of load to multiple CPUs&lt;/h4&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
The scheduler decisions are biased towards reducing latency of searches but tends to spread load across an entire socket, unnecessarily. On low CPU usage, this means the load on each individual CPU is low which can be good but cpufreq decides that utilization on individual CPUs is too low to increase P-state and overall throughput suffers.&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div itemprop=&quot;articleBody&quot; style=&quot;text-align: justify;&quot;&gt;
When a cpufreq driver is completely under the control of the OS, it can be compensated for. For example, &lt;i&gt;intel_pstate&lt;/i&gt; can decide to boost apparent cpu utilization if a task recently slept on a CPU for idle. However, if hardware-based cpufreq is in play (e.g. hardware P-states HWP) then very poor decisions can be made and the OS cannot do much about it. This only gets worse as HWP becomes more prevalent, sockets get larger and the p-state for individual cores can be controlled. Just setting the performance governor is not an answer given that plenty of people really do worry about power utilization and still want a reasonable balance between performance and power. Experiments show performance benefits for network benchmarks running on localhost (at ~10% on netperf RR for UDP
and TCP, depending on the machine). Hackbench also has some small improvements with ~6-11%, depending on machine and thread count.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=89a55f56fd1cdbe7e69d4693fc5790af9a6e1501&quot;&gt;89a55f56fd1c&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b76c4a33959ca98a573cd9c94c8690d123912ca&quot;&gt;3b76c4a33959&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=806486c377e33ab662de6d47902e9e2a32b79368&quot;&gt;806486c377e3&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=32e839dda3ba576943365f0f5817ce5c843137dc&quot;&gt;32e839dda3ba&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
printk: new locking scheme&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Problems around the kernel&#39;s &lt;a href=&quot;https://lwn.net/Articles/737822/&quot;&gt;printk()&lt;/a&gt; call aren&#39;t new and traditionally must overcome issues with the console lock. Considering that the kernel printing out to the console is very generic operation which can be called from virtually anywhere at any time, relying on any sort of lock can cause deadlocks. Similarly, the call to printk() must proceed regardless of the availability of the
console lock. As such, what would happen is that upon contention, the task buffers the output for the console lock owner to flush as when it releases the lock.&lt;br /&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
On large multi-core systems this scheme can lead to the console owner piling up a lot of unbound work before it can release the lock, triggering watchdog lockups. This was replaced with a new mechanism whereby, upon contention, the task will not delegate the work to the console lock owner and return, but it&#39;ll stay around spinning until the lock is available. The heuristics imply a console owner and waiter such that if
multiple CPUs are generating output, the console lock will circulate between them,
and none will end up printing output for too long.
&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=dbdda842fe96f8932bae554f0adf463c27c42bc7&quot;&gt;dbdda842fe96&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
idr tree optimizations &lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With the &lt;a href=&quot;http://lkml.kernel.org/r/20180207211918.GA11985@bombadil.infradead.org&quot;&gt;extensions and improvements&lt;/a&gt;
 of the ID allocation API, there is a performance enhancement for ID numbering 
schemes that don&#39;t start at 0; which, according to the patch, accounts 
for ~20% of all the kernel users. So by using the new idr functions with the
 &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;, Courier, monospace;&quot;&gt;_base()&lt;/span&gt; suffix users can immediately benefit by avoiding unnecessary iterations in the underlying radix tree.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=6ce711f2750031d12cec91384ac5cfa0a485b60a&quot;&gt;6ce711f27500&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&amp;nbsp;arm64: 52-bit physical address support&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With ARMv8.2 the physical address space is extended from 48 to 52-bit, thus tasks are now able to address up to &lt;span&gt;4 pebibytes (&lt;/span&gt;&lt;span&gt;PiB).&lt;/span&gt;&lt;br /&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=fa2a8445b1d3810c52f2a6b3a006456bd1aacb7e&quot;&gt;fa2a8445b1d3&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=193383043f14a398393dc18bae8380f7fe665ec3&quot;&gt;193383043f14&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=529c4b05a3cb2f324aac347042ee6d641478e946&quot;&gt;529c4b05a3cb&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=787fd1d019b269af7912249231dfe34a5fe3e7c8&quot;&gt;787fd1d019b2&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/5909398934385689842/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2018/05/linux-v416-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5909398934385689842'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5909398934385689842'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2018/05/linux-v416-performance-goodies.html' title='Linux v4.16: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-5818561405782090538</id><published>2018-03-20T10:37:00.003-07:00</published><updated>2018-03-20T10:37:53.337-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.15"/><title type='text'>Linux v4.15: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With the &lt;a href=&quot;https://lwn.net/Articles/741878/&quot;&gt;Meltdown&lt;/a&gt; and &lt;a href=&quot;https://lwn.net/Articles/743265/&quot;&gt;Spectre&lt;/a&gt; &lt;a href=&quot;https://lwn.net/Articles/744287/&quot;&gt;fiascos&lt;/a&gt;, performance isn&#39;t a very hot topic at the moment. In fact, with Linux v4.15 released, it is one of the rare times I&#39;ve seen security win over performance in such a one sided way. Normally security features are tucked away under a kernel config option nobody really uses. Of course the software fixes are also backported in one way or another, so this isn&#39;t really specific to the latest kernel release.&lt;br /&gt;
&lt;br /&gt;
All this said, v4.15 came out with a few performance enhancements across subsystems. The following is an unsorted and incomplete list of changes 
that went in. Note that the term &#39;&lt;i&gt;performance&lt;/i&gt;&#39; can be vague in 
that some gains in one area can negatively affect another, so take 
everything with a grain of salt and reach your own conclusions.&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
epoll: scale nested calls&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Nested epolls are necessary to allow semantics where a file descriptor in the epoll interested-list is also an epoll instance. Such calls are not all that common, but some real world applications suffered severe performance issues in that it relied on global spinlocks, acquired throughout the callbacks in the epoll state machine. By removing them, we can speed up adding fds to the instance as well as polling, such that &lt;i&gt;epoll_wait()&lt;/i&gt; can improve by 100x, scaling linearly when increasing amounts of cores block on an event.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=57a173bdf5baab48e8e78825c7366c634acd087c&quot;&gt;57a173bdf5ba,&lt;/a&gt;&amp;nbsp; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=37b5e5212a448bac0fe29d2a51f088014fbaaa41&quot;&gt;37b5e5212a44&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
pvspinlock: hybrid fairness paravirt semantics&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Locking under virtual environments can be tricky, balancing performance and fairness while avoiding artifacts such as starvation and lock holder/waiter preemption. The current paravirtual queued spinlocks, while free from starvation, can perform less optimally than an unfair lock in guests with CPU over-commitment. With Linux v4.15, guest spinlocks now combine the best of both worlds, with an unfair and a queued mode. The idea is that, upon contention, extend the lock stealing attempt in the slowpath (unfair mode) as long as there are queued MCS waiters present, hence improving performance while avoiding starvation. Kernel build experiments show that as a VM becomes more and more over-committed, the ratio of locks acquired in unfair mode increases.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=11752adb68a388724b1935d57bf543897c34d80b&quot;&gt;11752adb68a3&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm,x86: avoid saving/restoring interrupts state in gup&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
When x86 was converted to use the generic &lt;i&gt;get_user_pages_fast() &lt;/i&gt;call a performance regression was introduced at a microbenchmark level. The generic &lt;i&gt;gup&lt;/i&gt; function attempts to walk the page tables without acquiring any locks, such as the mmap semaphore. In order to do this, interrupts must be disabled, which is where things went different between the arch-specific and generic flavors. The later must save and restore the current state of interrupt, introducing extra overhead when compared to a simple &lt;i&gt;local_irq_enable/disable()&lt;/i&gt;.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5b65c4677a57a1d4414212f9995aa0e46a21ff80&quot;&gt;5b65c4677a57&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
ipc: scale INFO commands&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Any syscall used to get info from sysvipc (such as &lt;i&gt;semctl(IPC_INFO)&lt;/i&gt; or &lt;i&gt;shmctl(SHM_INFO)&lt;/i&gt;) requires internally computing the last ipc identifier. For cases with large amounts of keys, this operation alone can consume a large amount of cycles as it looked up on-demand, in O(N). In order to make this information available in constant time, we keep track of it whenever a new identifier is added.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15df03c87983660a4d1eedb4541778592bd97684&quot;&gt;15df03c87983&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
ext4:&amp;nbsp; improve smp scalability for inode generation&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The superblock&#39;s inode generation number was currently sequentially increased (from a randomly initialized value) and protected by a spinlock, making the usage pattern quite primitive and not very friendly to workloads that are generating files/inodes concurrently. The inode generation path was optimized to remove the lock altogether and simply rely on &lt;i&gt;prandom_u32()&lt;/i&gt; such that a fast/seeded pseudo random-number algorithm is used for computing the &lt;i&gt;i_generation&lt;/i&gt;.&lt;/div&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=232530680290ba94ca37852ab10d9556ea28badf&quot;&gt;232530680290&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/5818561405782090538/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2018/03/linux-v415-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5818561405782090538'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5818561405782090538'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2018/03/linux-v415-performance-goodies.html' title='Linux v4.15: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-6088022618630093249</id><published>2017-11-20T07:50:00.001-08:00</published><updated>2017-11-20T07:50:20.482-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="v4.14"/><title type='text'>Linux v4.14: Performance Goodies</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Last week Linus released the v4.14 kernel with some noticeable performance changes. The following is an unsorted and incomplete list of changes that went in. Note that the term &#39;&lt;i&gt;performance&lt;/i&gt;&#39; can be vague in that some gains in one area can negatively affect another, so take everything with a grain of salt and reach your own conclusions.&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sysvipc: scale key management &lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
We began using relativistic hash tables for managing ipc keys, which greatly improves the current &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;O(N)&lt;/span&gt; lookups. As such, ipc_findkey() calls are significantly faster (+800% in some reaim file benchmarks) and we need not iterate all elements each time. Improvements are even seen in scenarios where the amount of keys is but a handful, so this is pretty much a win from any standpoint. &lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0cfb6aee70bddbef6ec796b255f588ce0e126766&quot;&gt;0cfb6aee70bd]&lt;/a&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&amp;nbsp; &lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
interval-tree: fast overlap detection&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With the new extended rbtree api to cache the smallest (leftmost) node, instead of doing &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;O(logN)&lt;/span&gt; walks to the end of the tree, we have the pointer always available. This allows us to extend and complete the fast overlap detection for interval trees to speedup (sub)tree searches if the interval is completely to the left or right of the current tree&#39;s max interval. In addition, a number of other users that traverse rbtrees are updated to use the new rbtree_cached, such as epoll, procfs and cfq.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit&amp;nbsp; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cd9e61ed1eebbcd5dfad59475d41ec58d9b64b6a&quot;&gt;cd9e61ed1eeb&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=410bd5ecb276593e7ec1552014083215d4a43c3a&quot;&gt;410bd5ecb276&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2554db916586b228ce93e6f74a12fd7fe430a004&quot;&gt;2554db916586&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b2ac2ea6296e7dd779168eb085b09d0fab9d1294&quot;&gt;b2ac2ea6296e&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f808c13fd3738948e10196496959871130612b61&quot;&gt;f808c13fd373&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
sched: waitqueue bookmarks&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
A situation where constant NUMA migrations of a hot-page triggered a large number of page waiters being awoken exhibited some issues in the waitqueue implementation. In such cases, a large number of wakeups will occur while holding a spinlock, which causes significant unbounded latencies. Unlike wake_qs (used in futexes and locks), where batched wakeups are done without the lock, waitqueue bookmarks allow to pause and stop iterating the wake list such that another process has a chance to acquire the lock. Then it can resume where it left off.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3510ca20ece0150af6b10c77a74ff1b5c198e3e2&quot;&gt;3510ca20ece&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2554db916586b228ce93e6f74a12fd7fe430a004&quot;&gt;2554db916586&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=11a19c7b099f96d00a8dec52bfbb8475e89b6745&quot;&gt;11a19c7b099f&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&amp;nbsp;x86 PCID (Process Context Identifier)&lt;/h4&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This is a 64-bit hardware feature that allows tagging TLBs such that upon context switching, only the required entries are flushed. Virtualization (VT-x) has &lt;a href=&quot;http://blog.stgolabs.net/2012/05/kvm-intel-associative-tlbs.html&quot;&gt;supported&lt;/a&gt; similar features for a while, via &lt;i&gt;vpid&lt;/i&gt;. On other archs it is called address space ID. Linux&#39;s support is somewhat special. In order to avoid the x86 limitations of 4096 IDs (or processes), the implementation actually uses a PCID to identify a
recently-used mm (process address space) on a per-cpu basis.  An mm has no fixed PCID
binding at all; instead, it is given a fresh PCID each time it&#39;s
loaded, except in cases where we want to preserve the TLB, in which
case we reuse a recent value. To illustrate, in a workload under kvm that ping-pongs two processes, dTLB misses were reduced by ~17x.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f39681ed0f48498b80455095376f11535feea332&quot;&gt;f39681ed0f48&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b0579ade7cd82391360e959cc844e50a160e8a96&quot;&gt;b0579ade7cd8&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=94b1b03b519b81c494900cb112aa00ed205cc2d9&quot;&gt;94b1b03b519b&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5&quot;&gt;43858b4f25cf&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cba4671af7550e008f7a7835f06df0763825bf3e&quot;&gt;cba4671af755&lt;/a&gt;,&amp;nbsp; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0790c9aad84901ca1bdc14746175549c8b5da215&quot;&gt;0790c9aad849&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=660da7c9228f685b2ebe664f9fd69aaddcc420b5&quot;&gt;660da7c9228f&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=10af6235e0d327d42e1bad974385197817923dc1&quot;&gt;10af6235e0d3&lt;/a&gt;]&amp;nbsp;&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&amp;nbsp;&lt;/h4&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
ORC (Oops Rewind Capability) Unwinder&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The much acclaimed replacement to frame pointers and the (out of tree) DWARF unwinder. Through simplicity, the end result is faster profiling, such as for perf. Experiments show a 20x performance increase using ORC vs DWARF while calling save_stack_trace 20,000 times via a single &lt;i&gt;vfs_write&lt;/i&gt;. With respect to frame pointers, the ORC unwinder is more accurate across interrupt entry frames and enables a 5-10% performance improvement across the entire kernel compared to frame pointers.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ee9f8fce99640811b2b8e79d0d1dbe8bab69ba67&quot;&gt;ee9f8fce9964&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=39358a033b2e4432052265c1fa0f36f572d8cfb5&quot;&gt;39358a033b2e&lt;/a&gt;]&lt;br /&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: choose swap device according to numa node&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;div class=&quot;content&quot;&gt;
If the system has more than one swap device and swap device has the node
information, we can make use of this information to decide which swap
device to use in &lt;i&gt;get_swap_pages() &lt;/i&gt;to get better performance. This change replaces a single
global swap_avail list with a per-numa-node list: each numa node sees its own priority based list of available swap devices. Swap
device&#39;s priority can be promoted on its matching node&#39;s swap_avail_list. This shows ~25% improvement for a 2-node box, benchmarking random writes on an mmaped region with SSDs attached to each node, ensuring swapping in and out.&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a2468cc9bfdff6139f59ca896671e5819ff5f94a&quot;&gt;a2468cc9bfdf&lt;/a&gt;]&lt;/div&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
mm: reduce cost of page allocator&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Upon page allocation, the per-zone statistics are updated, introducing overhead in the form of cacheline bouncing; responsible for ~30% of all CPU cycles&amp;nbsp; for allocating a single page. The networking folks have been known to complain about the performance degradation when dealing with the memory management subsystem, particularly the page allocator. The fact that these NUMA associated counters are rarely used allows the counter threshold that determines the frequency of updating the global counter with the percpu counters (hence cacheline bouncing) to be increased. This means hurting readers, but that&#39;s the point.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3a321d2a3dde812142e06ab5c2f062ed860182a5&quot;&gt;3a321d2a3dde&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1d90ca897cb05cf38bd62f36756d219e02913b7d&quot;&gt;1d90ca897cb0&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=638032224ed762a29baca1fc37f1168efc2554ae&quot;&gt;638032224ed7&lt;/a&gt;]&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
archs: multibyte memset&lt;/h4&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
New calls memset16(), memset32() and memset64() are introduced, which are like memset(), but allow the caller to fill the destination with a value larger than a single byte. There are a number of places in the kernel that can benefit from using an optimized function rather than a loop; sometimes text size, sometimes speed, and sometimes both. When supported by the architecture, use a single instruction, such as &lt;i&gt;stosq&lt;/i&gt; (&lt;span class=&quot;st&quot;&gt;stores a quadword) in x86-64. Zram shows a 7% performance improvement on x86 with a 100Mb non-zero deduplicate data. If not available, default back to the slower loop implementation&lt;/span&gt;.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commits&amp;nbsp; &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b3c4babd898715926d24ae10aa64778ace33aae&quot;&gt;3b3c4babd898&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=03270c13c5ffaa6ac76fe70d0b6929313ca73d86&quot;&gt;03270c13c5ff&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c51248533adcfb01ba704ce5993ecbad5cc4c99&quot;&gt;4c51248533ad&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=48ad1abef40226ce809e5b7d3a5898754c4b9a9a&quot;&gt;48ad1abef402&lt;/a&gt;]&lt;/div&gt;
&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
powerpc: improve TLB flushing&lt;/h4&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
A
 few optimisations were also added to the radix MMU TLB flushing, mostly
 to avoid unnecessary Page Walk Cache (PWC) flushes when the structure 
of the tree is not changing.&lt;/div&gt;
&lt;div style=&quot;text-align: right;&quot;&gt;
[Commit &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a46cc7a90fd8d95bfbb2b27080efe872a1a51db4&quot;&gt;a46cc7a90fd8&lt;/a&gt;, &lt;a href=&quot;https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=424de9c6e3f89399fc11afc1f53f89c5329132da&quot;&gt;424de9c6e3f8&lt;/a&gt;]&lt;br /&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
&lt;/h4&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
There are plenty of other performance optimizations out there, including ext4 parallel file creation and quotas, additional memset improvements in sparc, transparent hugepage migrations and swap improvements, ipv6 (ip6_route_output()) optimizations, etc. Again, the list here is partial and biased by me. For more list of features play with &#39;git log&#39; or visit lwn (&lt;a href=&quot;https://lwn.net/Articles/733175/&quot;&gt;part1&lt;/a&gt;, &lt;a href=&quot;https://lwn.net/Articles/733846/&quot;&gt;part2&lt;/a&gt;) and &lt;a href=&quot;https://kernelnewbies.org/Linux_4.14&quot;&gt;kernelnewbies&lt;/a&gt;.&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/6088022618630093249/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2017/11/linux-v414-performance-goodies.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/6088022618630093249'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/6088022618630093249'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2017/11/linux-v414-performance-goodies.html' title='Linux v4.14: Performance Goodies'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-3926339682891681060</id><published>2015-12-29T05:07:00.002-08:00</published><updated>2015-12-29T05:36:29.239-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="auditing"/><category scheme="http://www.blogger.com/atom/ns#" term="futex"/><category scheme="http://www.blogger.com/atom/ns#" term="fuzzy testing"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="locks"/><category scheme="http://www.blogger.com/atom/ns#" term="security"/><category scheme="http://www.blogger.com/atom/ns#" term="stressing software"/><category scheme="http://www.blogger.com/atom/ns#" term="system call"/><category scheme="http://www.blogger.com/atom/ns#" term="target fuzzing"/><category scheme="http://www.blogger.com/atom/ns#" term="trinity"/><category 
scheme="http://www.blogger.com/atom/ns#" term="userspace mutexes"/><title type='text'>fu(zz)tex: targeted fuzzing of futexes</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The complexity of futexes, their non-trivial interactions and semantics, very much serve as a good candidate for applying fuzzy testing techniques to them. In general futex code is poorly understood and audited, both at a kernel implementation level and by the respective userland callers, normally trying to implement some sort of locking primitive. Unsurprisingly, bugs related to this call will often be subtle and nasty, sometimes with &lt;a href=&quot;http://www.cvedetails.com/google-search-results.php?q=futex&amp;amp;sa=Search&quot;&gt;security&lt;/a&gt; implications. Specifically for futexes, all system call fuzzers use generic and completely randomized inputs, which has only limited usefulness. This is even the case for Dave Jones&#39; &lt;a href=&quot;http://codemonkey.org.uk/projects/trinity&quot;&gt;trinity&lt;/a&gt; program, which has been extremely good at finding kernel bugs (and ruining my weekends more than once ;). Much of the success and popularity of this program is because not all the inputs are random and meaningful parameters are passed for many of the exercised syscalls. This is called targeted fuzzing, and has been proven to find more bugs than blindly random inputs, which in turn is more likely to produce logic that makes the kernel actually do something related to the call, as opposed to quickly erroring out due to some trivial bogus scenario. A nice example is the &lt;i&gt;perf_event_open(2)&lt;/i&gt; call, which was &lt;a href=&quot;http://web.eece.maine.edu/~vweaver/projects/perf_events/fuzzer/2015_perf_fuzzer_tr.pdf&quot;&gt;studied&lt;/a&gt; for targeted fuzzy testing with very good results.&lt;/div&gt;
&lt;h3 style=&quot;text-align: justify;&quot;&gt;
Extending Trinity &lt;/h3&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Reusing the already proven-to-work machinery of trinity, and extending it for ad-hoc futex work, is the obvious step for improving coverage, in the hope of tackling some of the issues previously described. While reading the code is always the definite answer, having a man-page that is &lt;i&gt;up-to-par&lt;/i&gt; with the call is quite essential; if we want programmers to make correct use of the tools we provide, that is. Fortunately, Michael Kerrisk has been doing a nice job of &lt;a href=&quot;https://git.kernel.org/cgit/docs/man-pages/man-pages.git/tree/man2/futex.2&quot;&gt;rewriting&lt;/a&gt; the current &lt;i&gt;futex.2&lt;/i&gt; page, which is so surprisingly crappy and incomplete, it&#39;s sad. This makes the task of correctly setting the input parameters for a certain purpose a little less tedious and error-prone:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; &lt;/code&gt;&lt;/span&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt;&lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
&amp;nbsp;&amp;nbsp; &amp;nbsp; struct timespec __user *, utime, u32 __user *, uaddr2, u32, val3)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;
&lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;&lt;/span&gt;&lt;br /&gt;
&amp;nbsp;-- just imagine if &lt;i&gt;mmap.2&lt;/i&gt; were barely documented and stale.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
There are two immediately obvious op flags that are not being exercised at all (with the exception of randomly bumping into them, which is quite unlikely and hard to control):&lt;/div&gt;
&lt;ul style=&quot;text-align: justify;&quot;&gt;
&lt;li&gt;&lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;FUTEX_CLOCK_RT:&lt;/span&gt; When set, the kernel treats the timeout as an absolute time based on &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;CLOCK_REALTIME&lt;/span&gt; as opposed to &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;CLOCK_MONOTONIC.&lt;/span&gt; This is only affected by &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;FUTEX_WAIT_BITSET&lt;/span&gt; and &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;FUTEX_REQUEUE_PI&lt;/span&gt; commands.&lt;/li&gt;
&lt;/ul&gt;
&lt;ul style=&quot;text-align: justify;&quot;&gt;
&lt;li&gt;&lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;FUTEX_PRIVATE_FLAG:&lt;/span&gt; Refers to the user address space mapping, and applies to all operations. The main benefit is that kernel can directly use the virtual address without having to do any lookups or other overhead (vmas, gup, thp, etc.) imposed by shared mappings.&lt;/li&gt;
&lt;/ul&gt;
&lt;h4 style=&quot;text-align: justify;&quot;&gt;
Ever-changing task priorities&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The whole purpose of PI futexes are to address priority inheritance issues for systems with real time requirements. Randomly changing a processes priority will therefore better stress the system call instead of always using the default nice value, exercising priority boosting code in the kernel.&lt;/div&gt;
&lt;div style=&quot;text-align: left;&quot;&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
Fault/error injections&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This year we &lt;a href=&quot;https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ab51fbab39d864f3223e44a2600fd951df261f0b&quot;&gt;added&lt;/a&gt; support for artificially triggering errors within the various futex paths faults and deadlock scenarios, via the &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;CONFIG_FAULT_INJECTION&lt;/span&gt; kernel framework along with the &lt;span style=&quot;font-family: &amp;quot;courier new&amp;quot; , &amp;quot;courier&amp;quot; , monospace;&quot;&gt;CONFIG_FAIL_FUTEX&lt;/span&gt; option. Trinity can make use of this feature by randomly toggling the process&#39; &lt;a href=&quot;https://www.kernel.org/doc/Documentation/fault-injection/fault-injection.txt&quot;&gt;make-it-fail&lt;/a&gt; file as well as selecting appropriate fault injection debugfs options.&lt;/div&gt;
&lt;/div&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
Feeding user-addresses&lt;/h4&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Perhaps the single most important argument that we can pass to the syscall is the user address (uaddr, or &#39;the futex&#39;), which will govern everything the kernel attempts to do with it, being private or shared address space. As such, it is not very useful to blindly feed it random addresses, even if trinity is setup by default, these inputs will sometimes be picked by previously &lt;i&gt;mmap-created&lt;/i&gt; shared memory playgrounds. However, at a futex level, this does not matter unless we are doing blocking calls (WAIT).&lt;br /&gt;
&lt;br /&gt;
So this has been reworked such that trinity now creates a number of locks in shared memory at startup, which has the owner PID and the actual futex. Upon a call, both fields of uaddr get either a random lock or a random address from the mmap playground, each with a 50% chance. The locks follow very simple semantics, where a successful &lt;i&gt;cmpxchg&lt;/i&gt; will allow the caller to acquire the lock without the kernel being involved (fastpath), otherwise we need to wait/block through the futex call.&lt;br /&gt;
&lt;br /&gt;
Because of how trinity is structured with callbacks for pre/post syscall invocation, there are a number of racy windows between when the lock is dealt (ie considered contended) with and when the fuzzer actually calls futex(2). As such, this must be taken with a grain of salt, but does exercise lots of real world situations, nonetheless.&lt;br /&gt;
&lt;h4 style=&quot;text-align: left;&quot;&gt;
Choosing operations&lt;/h4&gt;
The idea is to randomly perform different operations on the selected futex, such that combinations of wake, wait, requeue are done (both for regular and PI futexes). While passing informed, &lt;i&gt;not-so-random,&lt;/i&gt; parameters to the system call reduces the chance of shallow fuzzing, choosing the futex operation will determine the kind of work to be done on the uaddress. As such this part can further determine the usefulness of trinity regarding futexes. However, one cannot get too strict here as reducing the randomness will also limit the usefulness. For now the layout is a 25% chance when performing lock operations. On the other hand, for the case of mmap selected uaddress, the operation is left up to trinity to decide.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;h3 style=&quot;text-align: justify;&quot;&gt;
Evaluation and future work&lt;/h3&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Evaluating software that purposely tries to mess up other software is always twofold. For one, any new futex bug that is found indicates that modifying trinity was a good step towards better testing coverage. But unfortunately this creates a new headache for futex hackers, and a bug needs to be fixed (including any corresponding Linux distribution backporting, security and &lt;i&gt;-stable&lt;/i&gt; work). So any useful results which exhibit the presence of bugs can be bitter/sweet -- just think &lt;a href=&quot;http://www.brainyquote.com/quotes/quotes/e/edsgerdijk201165.html&quot;&gt;Dijkstra&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
One immediate way of evaluating the changes to trinity is to see the number of successful calls. While this can be a misleading metric, it does at least indicate whether or not many of the bogus parameter passing have been mitigated and replaced with smarter, more informed calls. Tests show that these changes have in fact boosted the amount of successful futex(2) returns; within a trinity run of 10,000 calls with 4 threads, we were able to go from ~470 to nearly ~4300, which is around a 10x improvement. This also means that it takes more time to run trinity as the kernel is doing actual work now with our futexes, not simply returning immediately due to bogus parameters and trivial error checks.&lt;br /&gt;
&lt;br /&gt;
In the future, it would be good to fuzz futexes with a memory-backed file (uaddress), instead of always relying on anonymous memory. While this is perhaps not so interesting from a futex standpoint (with the exception of hashing), it would be good when combining with other memory related calls which actually do things with the file. Another useful direction would be to further investigate operation selection policies. Different models will fuzz different parts of the futex subsystem, and perhaps (very probably, actually) I have not found the best one yet.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This work was done as part of SUSE &lt;a href=&quot;https://hackweek.suse.com/13/projects/1064&quot;&gt;Hackweek 13&lt;/a&gt;, which allowed me to finally allocate some time to focus on this (although this writing is much overdue). So as always, lots of thanks to my employer.&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/3926339682891681060/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2015/12/fuzztex-targeted-fuzzing-of-futexes.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3926339682891681060'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3926339682891681060'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2015/12/fuzztex-targeted-fuzzing-of-futexes.html' title='fu(zz)tex: targeted fuzzing of futexes'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-3042473723192220095</id><published>2015-10-04T23:52:00.000-07:00</published><updated>2015-10-04T23:54:46.111-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="architecture"/><category scheme="http://www.blogger.com/atom/ns#" term="barriers"/><category scheme="http://www.blogger.com/atom/ns#" term="concurrency"/><category scheme="http://www.blogger.com/atom/ns#" term="cpu"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="load acquire"/><category scheme="http://www.blogger.com/atom/ns#" term="memory model"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="SMP"/><category scheme="http://www.blogger.com/atom/ns#" term="store release"/><category scheme="http://www.blogger.com/atom/ns#" term="synchronization"/><title type='text'>acquire/release semantics in the 
kernel</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
With the need for better scaling on increasingly larger multi-core 
systems, we&#39;ve continued to extend our CPU barriers in the kernel. Two important variants to prevent CPU reordering for lock-free shared memory synchronization are pairs of &lt;i&gt;load/acquire&lt;/i&gt; and &lt;i&gt;store/release&lt;/i&gt; &lt;a href=&quot;https://lwn.net/Articles/576486/&quot;&gt;barriers&lt;/a&gt;; also known as &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;LOCK/UNLOCK&lt;/span&gt; barriers. These enable threads to cooperate between each other.&lt;br /&gt;
&lt;br /&gt;
Multiple, yet pretty much equivalent, definitions of acquire/release semantics can be found all over the internet, but I like the version from the infamous &lt;i&gt;&#39;Documentation/memory-barriers.txt&#39;&lt;/i&gt; file for three reasons: (i) it is clear and concise, (ii) it explicitly warns that they are the minimum operations and not to assume anything about reordering of loads and stores before or after the acquire or release, respectively. Finally, (iii) it strongly mentions the need for pairing and thus portability:&lt;/div&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: &amp;quot;Trebuchet MS&amp;quot;,sans-serif;&quot;&gt;&amp;nbsp;&lt;i&gt;(5) ACQUIRE operations.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; This acts as a one-way permeable barrier.&amp;nbsp; It guarantees that all memory operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. ACQUIRE operations include LOCK operations and smp_load_acquire() operations.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory operations that occur before an ACQUIRE operation may appear to happen after it completes.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; An ACQUIRE operation should almost always be paired with a RELEASE operation.&lt;/i&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;i&gt;&lt;span style=&quot;font-family: &amp;quot;Trebuchet MS&amp;quot;,sans-serif;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/i&gt;&lt;/div&gt;
&lt;i&gt;&lt;span style=&quot;font-family: &amp;quot;Trebuchet MS&amp;quot;,sans-serif;&quot;&gt;&amp;nbsp;(6) RELEASE operations.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; This also acts as a one-way permeable barrier.&amp;nbsp; It guarantees that all&amp;nbsp;&amp;nbsp; memory operations before the RELEASE operation will appear to happen before the RELEASE operation with respect to the other components of the system. RELEASE operations include UNLOCK operations and smp_store_release() operations.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory operations that occur after a RELEASE operation may appear to happen before it completes.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; The use of ACQUIRE and RELEASE operations generally precludes the need for other sorts of memory barrier (but note the exceptions mentioned in the subsection &quot;MMIO write barrier&quot;).&amp;nbsp; In addition, a RELEASE+ACQUIRE pair is -not- guaranteed to act as a full memory barrier.&amp;nbsp; However, after an ACQUIRE on a given variable, all memory accesses preceding any prior RELEASE on that same variable are guaranteed to be visible.&amp;nbsp; In other words, within a given variable&#39;s critical section, all accesses of all previous critical sections for that variable are guaranteed to have completed.&lt;br /&gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; This means that ACQUIRE acts as a minimal &quot;acquire&quot; operation and&amp;nbsp;&amp;nbsp;&amp;nbsp; RELEASE acts as a minimal &quot;release&quot; operation.&lt;/span&gt;&lt;/i&gt;&lt;/blockquote&gt;
&lt;table align=&quot;center&quot; cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;float: left; margin-right: 1em; text-align: left;&quot;&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjXD54PB-3sO1A0ORDsL9rU-pRDP4fTOLvzYs7MX_8X8gj8YVnr9Xbl8cvlvjMMzRC_jAQeIqgrVDg81jUXa4-xwWOn5TL2lCzSotQFmPE5f_OB1xWWZFqixkPhmQJkcaYdfjpDLod5bD9O/s1600/acquire-release.png&quot; style=&quot;margin-left: auto; margin-right: auto;&quot; /&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;Thread B&#39;s ACQUIRE pairs with Thread A&#39;s RELEASE. &lt;a href=&quot;http://www.ibm.com/developerworks/library/j-jtp03304/&quot;&gt;Copyright&lt;/a&gt; (C) IBM.&lt;/td&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;&lt;br /&gt;&lt;/td&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;&lt;br /&gt;&lt;/td&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;&lt;br /&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
In lock-speak, all this means is that nothing leaks from the critical region that is protected by the primitive in question. A thread attempting to take a lock will synchronize/pair the load (ACQUIRE), for instance via &lt;i&gt;Rmw&lt;/i&gt; (&lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;cmpxchg&lt;/span&gt;), when attempting to take the lock with the last store (RELEASE) when another thread is concurrently releasing the lock (for example, setting the counter to 0).&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
For v4.2, Will Deacon &lt;a href=&quot;https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=654672d4ba1a6001c365833be895f9477c4d5eab&quot;&gt;introduced&lt;/a&gt; more relaxed extensions of traditional atomic operations (including &lt;i&gt;Rmw&lt;/i&gt;) which allow finer grained control over, what used to be, full barriers semantics on both sides of the instruction. This is also true for just about all atomic functions that return a value to the caller, ie: &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;atomic_*_return()&lt;/span&gt;. As such &lt;a href=&quot;http://preshing.com/20120930/weak-vs-strong-memory-models/&quot;&gt;weakly ordered architectures&lt;/a&gt; can make use of these -- currently only arm64 makes use of them, but &lt;a href=&quot;https://lkml.org/lkml/2015/9/16/527&quot;&gt;efforts&lt;/a&gt; for PPC are being made. &lt;/div&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&lt;blockquote class=&quot;tr_bq&quot;&gt;
&lt;i&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; - *_relaxed: No ordering guarantees. This is similar to what we have already for the non-return atomics (e.g. atomic_add).&lt;br /&gt;&amp;nbsp;&amp;nbsp; &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; - *_acquire: ACQUIRE semantics, similar to smp_load_acquire.&lt;br /&gt;&amp;nbsp;&amp;nbsp; &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; - *_release: RELEASE semantics, similar to smp_store_release.&lt;/i&gt;&lt;/blockquote&gt;
&lt;/blockquote&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
So we now have goodies such as &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;atomic_cmpxchg_acquire()&lt;/span&gt; or &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;atomic_add_return_relaxed()&lt;/span&gt;. Most recently, aiming for v4.4, &lt;a href=&quot;http://comments.gmane.org/gmane.linux.kernel/2050980&quot;&gt;I&#39;ve ported all our locks&lt;/a&gt; to make use of these optimizations, which can save almost half the 
amount of barriers in the kernel&#39;s locking code -- which is specially nice under low or regular contention scenarios, 
where the fastpaths are exercised. There are plenty of other examples of real world code making use of acquire/release semantics. Mostly by using &lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;smp_load_acquire()/smp_store_release()&lt;/span&gt;; other primitives&amp;nbsp; &lt;span style=&quot;font-family: inherit;&quot;&gt;also use these semantics for common building blocks &lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;(as esoteric as they can get, ie RCU).&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/3042473723192220095/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2015/10/acquirerelease-semantics-in-kernel.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3042473723192220095'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/3042473723192220095'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2015/10/acquirerelease-semantics-in-kernel.html' title='acquire/release semantics in the kernel'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjXD54PB-3sO1A0ORDsL9rU-pRDP4fTOLvzYs7MX_8X8gj8YVnr9Xbl8cvlvjMMzRC_jAQeIqgrVDg81jUXa4-xwWOn5TL2lCzSotQFmPE5f_OB1xWWZFqixkPhmQJkcaYdfjpDLod5bD9O/s72-c/acquire-release.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-7433466098575674607</id><published>2015-08-24T07:34:00.000-07:00</published><updated>2015-12-30T06:31:49.922-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="conference"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="LPC 2015"/><category scheme="http://www.blogger.com/atom/ns#" term="operating systems"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><category scheme="http://www.blogger.com/atom/ns#" term="plumbers"/><category 
scheme="http://www.blogger.com/atom/ns#" term="research"/><category scheme="http://www.blogger.com/atom/ns#" term="scalability"/><title type='text'>LPC 2015: Performance and Scalability MC</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-size: small;&quot;&gt;This year I had the privilege of &lt;span style=&quot;font-family: inherit;&quot;&gt;leading the&lt;span style=&quot;font-family: inherit;&quot;&gt; &lt;/span&gt;&lt;/span&gt;&lt;a href=&quot;https://www.linuxplumbersconf.org/2015/ocw/events/LPC2015/tracks/453&quot;&gt;Performance and Scalability&lt;/a&gt; micro-conference for Linux Plumbers. The goals and motivation behind &lt;span style=&quot;font-family: inherit;&quot;&gt;organi&lt;span style=&quot;font-family: inherit;&quot;&gt;zing&lt;/span&gt;&lt;/span&gt; this track were threefold. First present relevant work-in-progress ideas that can improve performance in core kernel subsystems, and need some face to face discussion -- as such, this requires previous debate on lkml. Similarly, learn about real bottlenecks and issues people are running into. And finally, get to know more relevant academic (experimental) work going on in both the kernel and system-level userland. As such, the sessions were grouped as follows:&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;i&gt;(i)&lt;/i&gt; &lt;a href=&quot;http://backtrace.io/blog/blog/2015/03/13/workload-specialization/&quot;&gt;Fast Bounded-Concurrency Hash Tables&lt;/a&gt;. &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;Samy &lt;span style=&quot;font-family: inherit;&quot;&gt;B&lt;/span&gt;ahra introduced a novel non-blocking multi-reader/single writer hash table with strong forward&lt;/span&gt; &lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;progress guarantees for TSO. Because the common-case fastpath does not incur &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;in barriers or atomic operations&lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;, this technique &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;allows nearly &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;perfect scaling&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;. While his work is done in userspace, he sees potential &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;for it in the kernel, such as the networking subsystem. 
In such situations, the use of RCU (readers being the common case) might also be used.&lt;/span&gt;&lt;/span&gt;&lt;br style=&quot;color: #2e3436; font-size: 15px; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot; /&gt;&lt;span style=&quot;font-size: small;&quot;&gt;&lt;br style=&quot;color: #2e3436; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot; /&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;i&gt;(ii)&lt;/i&gt; &lt;a href=&quot;http://linuxplumbersconf.org/2015/ocw//system/presentations/2913/original/mcs_tsx.pdf&quot;&gt;Improving Transactional Memory Performance with Queued Locking&lt;/a&gt;. &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;While transactional memory&amp;nbsp; works nicely in conflict-free setups, it ends up requiring common &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;serialization otherwise. An option is to ret&lt;span style=&quot;font-family: inherit;&quot;&gt;r&lt;span style=&quot;font-family: inherit;&quot;&gt;y, however, when &lt;span style=&quot;font-family: inherit;&quot;&gt;the amount o&lt;span style=&quot;font-family: inherit;&quot;&gt;f threads executing in the CR is larger than the&lt;span style=&quot;font-family: inherit;&quot;&gt; amount of completed threads, you can get pile&lt;span style=&quot;font-family: inherit;&quot;&gt;ups. &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;Tim Chen &lt;span style=&quot;font-family: inherit;&quot;&gt;p&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;resented a solution based on applying a sort of &#39;aperture&#39; and using &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;principles based on MCS for fair queuing, &lt;span style=&quot;font-family: inherit;&quot;&gt;where it &lt;/span&gt;can be regulated based on &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline !
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;metrics such as the number of threads in the critical region a&lt;span style=&quot;font-family: inherit;&quot;&gt;nd abort ra&lt;span style=&quot;font-family: inherit;&quot;&gt;te&lt;/span&gt;&lt;/span&gt;.&lt;/span&gt;&lt;br style=&quot;color: #2e3436; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot; /&gt;&lt;br style=&quot;color: #2e3436; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot; /&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;i&gt;(iii)&lt;/i&gt; &lt;a href=&quot;https://linuxplumbersconf.org/2015/ocw/proposals/2751&quot;&gt;How to &lt;span style=&quot;font-family: inherit;&quot;&gt;Apply Mutation Testing to RCU&lt;/span&gt;&lt;/a&gt;&lt;/span&gt;. &lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;Iftekhar Ahmed from OSU&lt;span style=&quot;font-family: inherit;&quot;&gt;, &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;summarized his research in overcoming limitations of mutation&lt;/span&gt; &lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;testing to identify problems in RCU. As usual, working with Paul Mc&lt;span style=&quot;font-family: inherit;&quot;&gt;Kenney&lt;/span&gt;, they &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;have been able to identify a number of mutants along with making use of &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;rcutorture for specific periods of time. They generated ~3300 mutants &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;from rcu and rcutorture is doing a good job identifying them. It w&lt;span style=&quot;font-family: inherit;&quot;&gt;ould be interesting to see this applied&lt;span style=&quot;font-family: inherit;&quot;&gt; along with f&lt;span style=&quot;font-family: inherit;&quot;&gt;uzzy test&lt;span style=&quot;font-family: inherit;&quot;&gt;ing which has &lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;already&lt;span style=&quot;font-family: inherit;&quot;&gt; &lt;/span&gt;uncovered several &lt;span style=&quot;font-family: inherit;&quot;&gt;bugs in RCU&lt;span style=&quot;font-family: inherit;&quot;&gt; in the past.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;table cellpadding=&quot;0&quot; cellspacing=&quot;0&quot; class=&quot;tr-caption-container&quot; style=&quot;margin-left: auto; margin-right: auto; text-align: center;&quot;&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style=&quot;text-align: center;&quot;&gt;&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjIrBy9WJ_TY9_2-G42U8oUl38EMktviaEfu_Uuy1Scdn7s53Jyc1gbaPeAkCFqALT8PBLxSe8NWwIkgNrgMfkiCgNBiiobLbP6onJAwOIEk_UzB8m5McAdgYsGy6LAOrHBbWqFaLGtAG55/s1600/20805464645_410a3218b0_k.jpg&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: auto; margin-right: auto;&quot;&gt;&lt;img border=&quot;0&quot; height=&quot;300&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjIrBy9WJ_TY9_2-G42U8oUl38EMktviaEfu_Uuy1Scdn7s53Jyc1gbaPeAkCFqALT8PBLxSe8NWwIkgNrgMfkiCgNBiiobLbP6onJAwOIEk_UzB8m5McAdgYsGy6LAOrHBbWqFaLGtAG55/s400/20805464645_410a3218b0_k.jpg&quot; width=&quot;400&quot; /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;Scaling track -- LPC&#39;15, Seattle.&lt;/td&gt;&lt;td class=&quot;tr-caption&quot; style=&quot;text-align: center;&quot;&gt;&lt;br /&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: justify;&quot;&gt;
&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;i&gt;(iv)&lt;/i&gt; &lt;a href=&quot;https://linuxplumbersconf.org/2015/ocw/proposals/3291&quot;&gt;Unfair Qu&lt;/a&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;a href=&quot;https://linuxplumbersconf.org/2015/ocw/proposals/3291&quot;&gt;eued Spinlocks and Transactional Locks&lt;/a&gt;.&lt;/span&gt; &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;Waiman Long has been working on extending &lt;span style=&quot;font-family: inherit;&quot;&gt;spinlocks&lt;/span&gt; and apply &lt;span style=&quot;font-family: inherit;&quot;&gt;them&lt;/span&gt;&lt;/span&gt; &lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;to solve issues with &lt;span style=&quot;font-family: inherit;&quot;&gt;transactional memory.&lt;/span&gt; He presented experiments based on rwlocks and &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;transactional spinlock (new primitive) for transactional (reader) and &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;non-transactional (writer) executions. This talk &lt;span style=&quot;font-family: inherit;&quot;&gt;nicely complemented&lt;/span&gt; Tim Chen&lt;span style=&quot;font-family: inherit;&quot;&gt;&#39;s previous presenta&lt;span style=&quot;font-family: inherit;&quot;&gt;tion&lt;/span&gt;.&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt; He also touched on the qspinlock performance in virtualized&lt;/span&gt; &lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;environments and the challenges currently out there. 
&lt;span style=&quot;font-family: inherit;&quot;&gt;As we&lt;span style=&quot;font-family: inherit;&quot;&gt; alrea&lt;span style=&quot;font-family: inherit;&quot;&gt;dy have code for this,&lt;span style=&quot;font-family: inherit;&quot;&gt; it was much easier to discuss face to face. &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;Consensus in the &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;room was that kernel developers are not against improving pv spinlocks, &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;but what is determined is that we will not accept a 3rd primitive.&lt;/span&gt;&lt;br style=&quot;color: #2e3436; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot; /&gt;&lt;br style=&quot;color: #2e3436; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot; /&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;i&gt;(v)&lt;/i&gt; &lt;a href=&quot;https://sslab.gtisc.gatech.edu/2015/cloud-scalability.html&quot;&gt;Do Virtual Machines Really Scale&lt;/a&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;. &lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;Sanidhya Kashyap&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;a href=&quot;https://sslab.gtisc.gatech.edu/author/sanidhya-kashyap.html&quot;&gt;&lt;/a&gt;
from GA&lt;span style=&quot;font-family: inherit;&quot;&gt; Tech&lt;/span&gt; showed us the state of scalability in the cloud where there is &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;a clear trend that services hit poor scalability after certain degrees &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;of contention/core-count. These are LHP issues and vmexits/enters cause &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;performance issues at high vcpu counts. He introduces oticket backed by &lt;/span&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;performing multiple wakeups at once when granting the lock. Good&lt;/span&gt; &lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! 
important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;feedback and suggestions to overcome some of the presented issues&lt;span style=&quot;font-family: inherit;&quot;&gt; with the approach&lt;span style=&quot;font-family: inherit;&quot;&gt;. This was a&lt;span style=&quot;font-family: inherit;&quot;&gt;n extra short BoF like of presentation, but &lt;span style=&quot;font-family: inherit;&quot;&gt;there was quite a bi&lt;span style=&quot;font-family: inherit;&quot;&gt;t of interest, and the &lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;appropriate&lt;/span&gt; people&lt;/span&gt;&lt;/span&gt; were in the&lt;span style=&quot;font-family: inherit;&quot;&gt; room.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-size: small;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;background-color: white; color: #2e3436; display: inline ! important; float: none; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;Overall I would say that all thr&lt;span style=&quot;font-family: inherit;&quot;&gt;e&lt;span style=&quot;font-family: inherit;&quot;&gt;e &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;objectives&lt;/span&gt; were met and the quality of the sessions were high, thus &lt;span style=&quot;font-family: inherit;&quot;&gt;meeting all expectations&lt;/span&gt; &lt;span style=&quot;font-family: inherit;&quot;&gt;(if not, please email me for feedback&lt;span style=&quot;font-family: inherit;&quot;&gt; ;-)&lt;/span&gt;&lt;/span&gt;. 
&lt;span style=&quot;font-family: inherit;&quot;&gt;In fact, there were some highly interesting and &lt;span style=&quot;font-family: inherit;&quot;&gt;relevant &lt;/span&gt;presentations t&lt;span style=&quot;font-family: inherit;&quot;&gt;hat, due to t&lt;span style=&quot;font-family: inherit;&quot;&gt;i&lt;span style=&quot;font-family: inherit;&quot;&gt;me constraints&lt;span style=&quot;font-family: inherit;&quot;&gt;, had to &lt;span style=&quot;font-family: inherit;&quot;&gt;be left out&lt;span style=&quot;font-family: inherit;&quot;&gt;.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/7433466098575674607/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2015/08/lpc-2015-performance-and-scalability-mc.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/7433466098575674607'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/7433466098575674607'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2015/08/lpc-2015-performance-and-scalability-mc.html' title='LPC 2015: Performance and Scalability MC'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjIrBy9WJ_TY9_2-G42U8oUl38EMktviaEfu_Uuy1Scdn7s53Jyc1gbaPeAkCFqALT8PBLxSe8NWwIkgNrgMfkiCgNBiiobLbP6onJAwOIEk_UzB8m5McAdgYsGy6LAOrHBbWqFaLGtAG55/s72-c/20805464645_410a3218b0_k.jpg" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-976495088603038952</id><published>2014-01-20T13:32:00.002-08:00</published><updated>2014-01-22T19:49:26.260-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="algorithms"/><category scheme="http://www.blogger.com/atom/ns#" term="C"/><category scheme="http://www.blogger.com/atom/ns#" term="contention"/><category scheme="http://www.blogger.com/atom/ns#" term="data structures"/><category scheme="http://www.blogger.com/atom/ns#" term="hash tables"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category 
scheme="http://www.blogger.com/atom/ns#" term="numa"/><category scheme="http://www.blogger.com/atom/ns#" term="operating systems"/><category scheme="http://www.blogger.com/atom/ns#" term="performance"/><title type='text'>futexes and hash table collisions</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Hash tables are popular data structures that efficiently handle dictionary operations (search and insert/delete). The &lt;b&gt;Linux kernel&lt;/b&gt; relies on them for a number of subsystems, including major core kernel areas, such as dcache/inode lookups, workqueues, timers, the PID table, TCP/UDP and futexes. This last being used as common building blocks for implementing userspace locking primitives, &lt;i&gt;pthreads&lt;/i&gt; being, perhaps, the most popular user.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;Futexes make use of single, &lt;a href=&quot;http://en.wikipedia.org/wiki/Hash_table#Separate_chaining&quot;&gt;chained&lt;/a&gt;, hash table. The user space address (&lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;uaddr&lt;/span&gt;) is used by the kernel to generate a unique &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;futex_key&lt;/span&gt; to reference the futex. Each key is hashed to a bucket  (&lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;hb&lt;/span&gt;), which contains a single priority based linked list -- real-time tasks are queued in front of regular tasks, otherwise ordered as FIFO. To synchronize updates to the list, a &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;hb-&amp;gt;lock&lt;/span&gt; spinlock is used. N&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;ote that collisions can occur where similar user addresses can hash to the same futex key, so 
a single list can contain tasks blocked on different futexes. &lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;There are a total of &lt;b&gt;256 hash buckets&lt;/b&gt; in the entire table. For a much more thorough futex architectural overview, refer to: &lt;/span&gt;&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&quot;http://lwn.net/Articles/360699/&quot;&gt;A futex overview and update&lt;/a&gt;. Darren Hart, LWN.net. Nov, 2009.&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;http://www.akkadia.org/drepper/futex.pdf&quot;&gt;Futexes Are Tricky&lt;/a&gt;. Ulrich Drepper. Nov, 2011.&lt;/li&gt;
&lt;li&gt;&lt;a href=&quot;https://lwn.net/images/conf/rtlws11/papers/proc/p10.pdf&quot;&gt;Requeue-PI: Making Glibc Condvars PI-Aware&lt;/a&gt;. Darren Hart, Dinakar Guniguntala.&lt;/li&gt;
&lt;/ul&gt;
&lt;br /&gt;
Operations on futexes can be classified as putting a task to sleep/block to &lt;b&gt;wait&lt;/b&gt; on a futex, or, the opposite, &lt;b&gt;wake&lt;/b&gt; up one or more blocked tasks. Both commands make use of the architecture (very briefly) described above. Of course, each of these operations require hashing the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;uaddr&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;,&lt;/span&gt; and thus taking the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;hb-&amp;gt;lock&lt;/span&gt; to access the list.&lt;br /&gt;
&lt;br /&gt;
&lt;h3&gt;
Bottlenecks&lt;/h3&gt;
The size of the hash table is evidently a major bottleneck in today&#39;s systems. Large systems, using many futexes, can be prone to high amounts of 
collisions, where these futexes hash to the same bucket and therefore lead to 
extra contention on the same &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;hb-&amp;gt;lock&lt;/span&gt;. Furthermore,
 cacheline bouncing occurs when we have multiple hash bucket 
spinlocks residing on the same cacheline and different futexes hash to 
adjacent buckets. If tasks operate on different futexes that are on 
the same list, the lock will become contended really fast.&lt;br /&gt;
&lt;br /&gt;
In addition, the entire hash table is allocated on a single NUMA node, which creates remote node memory accesses. As systems become more powerful, having NUMA aware algorithms and data structures is paramount to take advantage of today&#39;s hardware trends. Accessing the hash table from remote NUMA nodes can lead to higher memory 
latencies. &lt;br /&gt;
&lt;br /&gt;
&lt;h3&gt;
Optimizations &amp;amp; Results&lt;/h3&gt;
Upstream commit &lt;a href=&quot;http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a52b89ebb6d4499be38780db8d176c5d3a6fbc17&quot;&gt;a52b89eb &lt;/a&gt;deals with both bottlenecks. The hash table now contains 256 hash buckets per CPU as well as being NUMA aware. There was also some &lt;a href=&quot;https://lkml.org/lkml/2013/12/1/43&quot;&gt;discussion&lt;/a&gt; on scaling the table up by RAM as well, and furthermore hashing on the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;uaddr&lt;/span&gt;&#39;s page node, thus reducing the cost of collisions. However this cannot be done as the pages can move between nodes at any point. In addition to enlarging the table, cacheline aligning the hash bucket structure also provided a nice optimization, as it avoids accesses across cacheline boundaries. The figure below (higher is better) shows the throughput of &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;uaddr&lt;/span&gt; hashing for the different optimizations, where each thread operates on 1024 futexes.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;div class=&quot;separator&quot; style=&quot;clear: both; text-align: center;&quot;&gt;
&lt;a href=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjOUycv7Qjkj0yuDoW6DT9pGTups_6PWBFSsKYoUpEeSfp-h1DXs4PGMcbb9wAhseyw0GpYTapn24Mk58lNGcDavL9AzjAAdZKC9vQV5EPWyDVtXcSoKKp6uO4lGj6ZZjWWmN6hhZGokbsj/s1600/futex-uaddr-hash-scaling.png&quot; imageanchor=&quot;1&quot; style=&quot;margin-left: 1em; margin-right: 1em;&quot;&gt;&lt;img border=&quot;0&quot; src=&quot;https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjOUycv7Qjkj0yuDoW6DT9pGTups_6PWBFSsKYoUpEeSfp-h1DXs4PGMcbb9wAhseyw0GpYTapn24Mk58lNGcDavL9AzjAAdZKC9vQV5EPWyDVtXcSoKKp6uO4lGj6ZZjWWmN6hhZGokbsj/s1600/futex-uaddr-hash-scaling.png&quot; height=&quot;412&quot; width=&quot;640&quot; /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Combining both cacheline aligning and larger, NUMA table provides the best results -- each percentage increase is added to the final value. As more futexes are dealt with, the more clear the benefits, with speedups from 78% to 800%.&lt;br /&gt;
&lt;br /&gt;
Of course, performance goes down as more futexes are added to the equation. This is unavoidable given the overall architectural designs that govern futexes. Hashing on 512 threads isn&#39;t as fast as on 32 threads, but the proportion of &lt;i&gt;baseline&lt;/i&gt; and &lt;i&gt;both&lt;/i&gt; clearly becomes larger.&lt;br /&gt;
&lt;br /&gt;
Another recent improvement is dealing with smarter wake-ups. Commit &lt;a href=&quot;http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b0c29f79ecea0b6fbcefc999e70f2843ae8306db&quot;&gt;b0c29f79&lt;/a&gt; avoids taking the &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;hb-&amp;gt;lock&lt;/span&gt; when there are no tasks waiting on the futex -- thus a free ride for &lt;span style=&quot;font-family: &amp;quot;Courier New&amp;quot;,Courier,monospace;&quot;&gt;futex(2)&lt;/span&gt; calls returning 0. This extends the parallelism of futexes, allowing other calls to be processed concurrently instead of wasting time spinning on a potentially contended spinlock.&lt;br /&gt;
&lt;br /&gt;
These optimizations will be included in Linux 3.14. &lt;br /&gt;
&lt;br /&gt;
Special thanks to, among others, Thomas Gleixner, Darren Hart and Peter Zijlstra for entertaining discussion and taking the time to review this work.&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/976495088603038952/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2014/01/futexes-and-hash-table-collisions.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/976495088603038952'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/976495088603038952'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2014/01/futexes-and-hash-table-collisions.html' title='futexes and hash table collisions'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjOUycv7Qjkj0yuDoW6DT9pGTups_6PWBFSsKYoUpEeSfp-h1DXs4PGMcbb9wAhseyw0GpYTapn24Mk58lNGcDavL9AzjAAdZKC9vQV5EPWyDVtXcSoKKp6uO4lGj6ZZjWWmN6hhZGokbsj/s72-c/futex-uaddr-hash-scaling.png" height="72" width="72"/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-5144655601428418388</id><published>2013-09-28T13:43:00.000-07:00</published><updated>2014-01-18T21:23:50.884-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="efi"/><category scheme="http://www.blogger.com/atom/ns#" term="gpt"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="master boot record"/><category scheme="http://www.blogger.com/atom/ns#" term="mbr"/><category scheme="http://www.blogger.com/atom/ns#" term="partition tables"/><title type='text'>Detecting 
Hybrid MBRs in the Linux Kernel</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
EFI&#39;s GPT disklabels present a number of benefits to the traditional MBR scheme. For instance, not having to deal with CHS addressing, better data integrity (including a backup header as data redundancy) and 64bit LBA addressing, allowing partitions to go beyond the 2Tb limit all the way up to 9.4 Zb.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
These nice features don&#39;t come free, however, having to deal with older legacy systems (normally BIOS-based) that only use MBR, and do not know about GPT. For example, users&amp;nbsp; who have an EFI system (say a Mac), dual booting with an older, non-EFI version of Windows. While OSX knows GPT and uses the GPT partition(s), Windows doesn&#39;t, so you cannot dual boot without creating a &lt;a href=&quot;http://www.rodsbooks.com/gdisk/hybrid.html&quot;&gt;hybrid MBR&lt;/a&gt; - the standard&lt;a href=&quot;http://en.wikipedia.org/wiki/GUID_Partition_Table&quot;&gt; protective MBR&lt;/a&gt; (pMBR) won&#39;t allow Windows to boot. This hybrid MBR will extend the regular pMBR (containing a 0xEE GPT partition) so that it contains up to three primary partitions that point to the same disk locations that the GPT partitions point to. Hybrid MBRs are &lt;u&gt;unofficial&lt;/u&gt; workarounds to the GPT specs, but necessary for backward compatibility. Furthermore, most bootloaders are now acknowledging this kind of scheme.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
In order for Linux to properly discover protective MBRs, it must be made
aware of devices that have hybrid MBRs. To this end, Linux v3.12 will now be able to &lt;a href=&quot;http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b05ebbbbeb67a420d06567c6b9618a9e644d6104&quot;&gt;detect&lt;/a&gt; these partitioning schemes.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Furthermore, the kernel &lt;a href=&quot;http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3e69ac344007bec5e3987ac86619e140fbc79b72&quot;&gt;will no longer require&lt;/a&gt; the GPT partition to begin at sector 1, enabling Linux to be more flexible when
probing for GPT disklabels. Linux was the only OS that enforced this, and apart from it not being enforced by UEFI, it caused Linux to potentially
fail to detect &lt;b&gt;valid&lt;/b&gt; partitions on the disk. 
For compatibility reasons, if the first partition is hybridized, the 0xEE
partition must be small enough to ensure that it only protects the GPT
data structures - as opposed to the whole disk in a protective MBR. Note that these changes do not affect already existing partitions. &lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/5144655601428418388/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2013/09/detecting-hybrid-mbrs-in-linux-kernel.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5144655601428418388'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5144655601428418388'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2013/09/detecting-hybrid-mbrs-in-linux-kernel.html' title='Detecting Hybrid MBRs in the Linux Kernel'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-1743149866934670924</id><published>2012-10-08T07:09:00.002-07:00</published><updated>2012-10-08T11:50:15.583-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="conference"/><category scheme="http://www.blogger.com/atom/ns#" term="critique"/><category scheme="http://www.blogger.com/atom/ns#" term="foss.in"/><category scheme="http://www.blogger.com/atom/ns#" term="india"/><title type='text'>FOSS.IN organization team critique</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Back in June I submitted a talk to &lt;a href=&quot;http://foss.in/&quot;&gt;FOSS.IN&lt;/a&gt;&amp;nbsp;2012 conference in Bangalore, India. Unfortunately my talk was not included in the list of accepted proposals, in&amp;nbsp;other words, it was rejected. But that&#39;s not the reason why I&#39;m writing, or why I am most&amp;nbsp;&lt;b&gt;disappointed&lt;/b&gt;&amp;nbsp;in how things were handled by FOSS.IN&#39;s organizing team.&amp;nbsp;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The program&#39;s &lt;a href=&quot;http://foss.in/participate/call-for-participation&quot;&gt;call for participation&lt;/a&gt;&amp;nbsp;stated that the list of accepted proposals would be published by August 6th. This, however, did not occur until &lt;a href=&quot;http://foss.in/2012/take-one-speakers-at-foss-in2012&quot;&gt;two months later&lt;/a&gt;, in early October. Working in academia, I am well aware that conference dates and deadlines can be changed, and one gets used to this, and takes it with a grain of salt. What I cannot understand, or accept, is the fact that FOSS.IN did not bother to inform anyone (specially those of us who took the time to submit a talk) that the deadlines were not going to be kept. A delay of two months is already incredible, not to mention the total lack of information as to when the accepted talks would be published. I had never seen such a thing from a conference, and hope to never see it again.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Furthermore, I was not even informed that my talk had been rejected. FOSS.IN already has an automated system to send people emails, I got one confirming my submission. So why wasn&#39;t I notified? Automatic emails are easy, fast and free.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Submitting a good talk to a conference takes time and careful preparation. I would expect a minimal amount of courtesy and professionalism by FOSS.IN.&amp;nbsp;People make plans around deadlines and it&#39;s&amp;nbsp;extremely&amp;nbsp;rude to keep them in the dark.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This kind of&amp;nbsp;behavior&amp;nbsp;is simply&amp;nbsp;&lt;b&gt;unacceptable&lt;/b&gt;&amp;nbsp;and makes the entire conference look bad. Yes, it&#39;s quite a big and well known event within the free software community, but that&#39;s not a justification.&amp;nbsp;I am aware of Atul Chitnis&#39;&amp;nbsp;&lt;a href=&quot;http://atulchitnis.net/2012/public-statement-about-my-health/&quot;&gt;condition&lt;/a&gt;&amp;nbsp;and wish him all the best and hope he can overcome the illness. I was very sorry to learn about it. But&amp;nbsp;FOSS.IN is not a one man job.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
I am writing this as a &lt;b&gt;constructive&amp;nbsp;criticism&lt;/b&gt;, hoping that these unpleasant things do not reoccur in future events. Since this year was the first time I&amp;nbsp;proposed&amp;nbsp;a presentation I don&#39;t know if these issues were a one time thing or is what people have come to expect from the FOSS.IN teams. Still I wish the best of luck to you folks and hope that you have another great conference this year.&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/1743149866934670924/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/10/fossin-organization-team-critique.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/1743149866934670924'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/1743149866934670924'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/10/fossin-organization-team-critique.html' title='FOSS.IN organization team critique'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-34931413066027668</id><published>2012-09-27T14:15:00.000-07:00</published><updated>2012-09-28T07:35:28.257-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="block devices"/><category scheme="http://www.blogger.com/atom/ns#" term="C programming"/><category scheme="http://www.blogger.com/atom/ns#" term="development"/><category scheme="http://www.blogger.com/atom/ns#" term="dos"/><category scheme="http://www.blogger.com/atom/ns#" term="efi"/><category scheme="http://www.blogger.com/atom/ns#" term="fdisk"/><category scheme="http://www.blogger.com/atom/ns#" term="google summer of code"/><category scheme="http://www.blogger.com/atom/ns#" term="gpt"/><category scheme="http://www.blogger.com/atom/ns#" term="labels"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="partition tables"/><category scheme="http://www.blogger.com/atom/ns#" term="sun"/><category 
scheme="http://www.blogger.com/atom/ns#" term="util-linux"/><title type='text'>fdisk updates and GPT support</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;background-color: white; color: #333333; line-height: 20px; text-align: -webkit-auto;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;The fdisk tool is perhaps the most recognized disk partitioner in the world, as it has historically been present in Windows and all Unix flavors, among other OSs. While this tool has proven useful for its Linux variant, it has been subject to intense patching along its 20 years of existence, and it is a product of multiple authors, coding styles and concepts. Because of this, extending fdisk&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;background-color: white; color: #333333; line-height: 20px; text-align: -webkit-auto;&quot;&gt;,&lt;/span&gt;&lt;span style=&quot;background-color: white; color: #333333; line-height: 20px; text-align: -webkit-auto;&quot;&gt;&amp;nbsp;to keep up with modern day computing and disk needs is hard, time consuming and error prone. To address this, a serious effort,&amp;nbsp;&lt;/span&gt;initially&amp;nbsp;&lt;a href=&quot;http://code.google.com/soc/&quot;&gt;sponsored&lt;/a&gt;&amp;nbsp;by Google,&lt;/span&gt;&lt;span style=&quot;background-color: white; color: #333333; line-height: 20px; text-align: -webkit-auto;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;was started to redesign and update fdisk to fit the requirements of a modern disk partitioning program. Some include removing DOS compatibility mode, replacing the deprecated &lt;a href=&quot;http://en.wikipedia.org/wiki/Cylinder-head-sector&quot;&gt;CHS&lt;/a&gt; addressing with&amp;nbsp;&lt;a href=&quot;http://en.wikipedia.org/wiki/Logical_Block_Addressing&quot;&gt;LBA&lt;/a&gt;, GPT support, creating a generic driver-based API that can transparently handle different partition types and major code cleanups and refactoring, among others. 
While several things have been done, there is still a long ways to go.&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;background-color: white; color: #333333; line-height: 20px; text-align: -webkit-auto;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;I&#39;m pleased to&amp;nbsp;announce&amp;nbsp;that fdisk&amp;nbsp;&lt;/span&gt;&lt;a href=&quot;http://git.kernel.org/?p=utils/util-linux/util-linux.git;a=commit;h=766d5156c43b784700d28d1c1141008b2bf35ed7&quot; style=&quot;font-family: inherit;&quot;&gt;can now work with GPT based disks&lt;/a&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;!!&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;

&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;GUID Partition Table (GPT) , developed by Intel in the late &#39;90s, is a standard for laying out partitioning on hard disks, now forming part of the&amp;nbsp;&lt;/span&gt;&lt;a href=&quot;http://www.uefi.org/&quot; style=&quot;font-family: inherit;&quot;&gt;UEFI&lt;/a&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;standard. Its increasing popularity is easily understandable, as it provides several benefits over the traditional&amp;nbsp;PC&amp;nbsp;master boot record &amp;nbsp;(&lt;/span&gt;&lt;a href=&quot;http://en.wikipedia.org/wiki/Master_boot_record&quot; style=&quot;font-family: inherit;&quot;&gt;MBR&lt;/a&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;) scheme. Furthermore, people using Intel based Apple products (like &lt;/span&gt;&lt;i style=&quot;font-family: inherit;&quot;&gt;&lt;a href=&quot;http://www.apple.com/mac/&quot;&gt;macbooks&lt;/a&gt;&lt;/i&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;) will most likely be using GPT (with a hybrid MBR scheme). While the Internet is full of documents that go into the details of this format, there are a few benefits worth mentioning here:&lt;/span&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div&gt;
&lt;ul style=&quot;text-align: left;&quot;&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;GPT does not know anything about CHS addressing, and only uses LBA (64bit).&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;Because it uses 64bit LBAs, it can hold&lt;span style=&quot;background-color: white; line-height: 19px; text-align: -webkit-auto;&quot;&gt;&amp;nbsp;2&lt;/span&gt;&lt;sup style=&quot;background-color: white; line-height: 1em; text-align: -webkit-auto;&quot;&gt;64&lt;/sup&gt;&lt;span style=&quot;background-color: white; line-height: 19px; text-align: -webkit-auto;&quot;&gt;−1 sectors, typically&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;background-color: white; line-height: 19px; text-align: -webkit-auto;&quot;&gt;9.4&amp;nbsp;Zb with standard 512 byte sectors, way&lt;/span&gt;&amp;nbsp;above the 2Tb limit offered by MBR.&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;GPT uses 32bit CRC checksums to validate data integrity for its headers and partition entries. It also adds redundancy to its structures, having them present twice, once at the start and again at the end of the disk. This, of course, helps protect the system against disk errors and allows better recovery.&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Some&amp;nbsp;considerations&amp;nbsp;about the implementation:&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;ul style=&quot;text-align: left;&quot;&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;We currently support probing, listing/adding/deleting/writing partitions, data integrity verification. Furthermore, fdisk can determine if there is a traditional protected, or hybrid MBR present.&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;For now, primary header corruption is not&amp;nbsp;recoverable&amp;nbsp;from the backup at the end of the disk.&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;background-color: white; text-align: -webkit-auto;&quot;&gt;Header&amp;nbsp;checksums&amp;nbsp;are&amp;nbsp;updated&amp;nbsp;upon&amp;nbsp;every&amp;nbsp;change&amp;nbsp;(ie:&amp;nbsp;add/delete&amp;nbsp;partitions),&amp;nbsp;this&amp;nbsp;allows&amp;nbsp;us&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;background-color: white; text-align: -webkit-auto;&quot;&gt;to&lt;/span&gt;&lt;/span&gt;&lt;span style=&quot;background-color: white; font-family: inherit; text-align: -webkit-auto;&quot;&gt;&amp;nbsp;mathematically&amp;nbsp;verify &lt;/span&gt;&lt;span style=&quot;background-color: white; font-family: inherit; text-align: -webkit-auto;&quot;&gt;the&amp;nbsp;changes&amp;nbsp;on-the-fly,&amp;nbsp;and&amp;nbsp;not&amp;nbsp;only&amp;nbsp;when&amp;nbsp;writing&amp;nbsp;to&amp;nbsp;disk,&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;background-color: white; font-family: inherit; text-align: -webkit-auto;&quot;&gt;like&lt;/span&gt;&lt;span style=&quot;background-color: white; font-family: inherit; text-align: -webkit-auto;&quot;&gt;&amp;nbsp;most&amp;nbsp;other &amp;nbsp;related&amp;nbsp;tools&amp;nbsp;do.&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;span style=&quot;background-color: white; text-align: -webkit-auto;&quot;&gt;&lt;span style=&quot;font-family: monospace; font-size: x-small;&quot;&gt;&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;When&amp;nbsp;creating&amp;nbsp;a&amp;nbsp;new&amp;nbsp;partition, all partition type &lt;a href=&quot;http://en.wikipedia.org/wiki/GUID_Partition_Table#Partition_type_GUIDs&quot;&gt;GUIDs&lt;/a&gt;&amp;nbsp;are&amp;nbsp;available.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;I&#39;d like to thank both Petr Uzel from SuSE and &lt;a href=&quot;http://karelzak.blogspot.com/&quot;&gt;Karel Zak&lt;/a&gt; from Red Hat for their time reviewing, testing and answering any doubts I had.&lt;/span&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;Enjoy!&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/34931413066027668/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/09/fdisk-updates-and-gpt-support.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/34931413066027668'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/34931413066027668'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/09/fdisk-updates-and-gpt-support.html' title='fdisk updates and GPT support'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-6159280493924136818</id><published>2012-05-26T11:26:00.001-07:00</published><updated>2012-10-03T09:54:13.307-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="architecture"/><category scheme="http://www.blogger.com/atom/ns#" term="associateve"/><category scheme="http://www.blogger.com/atom/ns#" term="caching"/><category scheme="http://www.blogger.com/atom/ns#" term="Intel"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="kvm"/><category scheme="http://www.blogger.com/atom/ns#" term="labels"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="tags"/><category scheme="http://www.blogger.com/atom/ns#" term="TLB"/><category scheme="http://www.blogger.com/atom/ns#" term="translations"/><category scheme="http://www.blogger.com/atom/ns#" term="virtualization"/><category 
scheme="http://www.blogger.com/atom/ns#" term="VMX"/><category scheme="http://www.blogger.com/atom/ns#" term="x86"/><title type='text'>kvm: Intel associative TLBs</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Traditional x86 architecture implicitly requires TLB flushing upon context switching (&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;CR3&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;writes) so the new process-to-run&#39;s address space does not conflict with linear to physical translations cached by previous processes. When using shadow pages for MMU virtualization, it can be quite expensive to throw away.&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Intel introduced Virtual Processor ID (&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;vpid)&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;into its VT-x technology in order to tag different processes and therefore avoid&amp;nbsp;&lt;/span&gt;unnecessary&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;TLB flushes.&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;KVM uses a global bitmap to facilitate vpid management for all guests and all vCPUs, managing up to ~64000 unique identifiers. Upon virtual machine startup it will allocate a vpid for each vCPU with a first-come, first-serve policy. The data is protected by a &lt;/span&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;vmx_vpid_lock&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;spinlock.&lt;/span&gt;&lt;/div&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;
&lt;br /&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);  
 static DEFINE_SPINLOCK(vmx_vpid_lock);  
 ...  
 static void allocate_vpid(struct vcpu_vmx *vmx)  
 {  
      int vpid;  
      vmx-&amp;gt;vpid = 0;  
      if (!enable_vpid)  
           return;  
      spin_lock(&amp;amp;vmx_vpid_lock);  
      vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);  
      if (vpid &amp;lt; VMX_NR_VPIDS) {  
           vmx-&amp;gt;vpid = vpid;  
           __set_bit(vpid, vmx_vpid_bitmap);  
      }  
      spin_unlock(&amp;amp;vmx_vpid_lock);  
 }  
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;Similarly, when the guest is shut down, it will free its corresponding vpid(s):&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; static void free_vpid(struct vcpu_vmx *vmx)  
 {  
      if (!enable_vpid)  
           return;  
      spin_lock(&amp;amp;vmx_vpid_lock);  
      if (vmx-&amp;gt;vpid != 0)  
           __clear_bit(vmx-&amp;gt;vpid, vmx_vpid_bitmap);  
      spin_unlock(&amp;amp;vmx_vpid_lock);  
 }  
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
To invalidate different cached translations based on vpid, Intel added the &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;invvpid &lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;instruction&lt;/span&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;. &lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;The specific invalidations are grouped as (for more information check the Intel reference manual vol. 3C 2.8 - Caching Translation Information):&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;ul style=&quot;text-align: left;&quot;&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;Individual address: the vCPU invalidates translations for a specific given address and VPID&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;Single context: the vCPU invalidates all tagged translations for a specific given VPID&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;All context: the vCPU invalidates all translations for all VPIDs (except the original, id 0)&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;Single context, retaining global translations: the vCPU invalidates all tagged translations for a specific given VPID, except global translations.&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Whenever there&#39;s a TLB flush call or a vCPU reset (like when setting up the architecture at boot time), both part of standard x86 operations, the &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;vpid_sync_context() &lt;/span&gt;function is called:&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;pre style=&quot;background-attachment: initial; background-clip: initial; background-color: #f0f0f0; background-image: initial; background-origin: initial; border-bottom-color: rgb(204, 204, 204); border-bottom-style: dashed; border-bottom-width: 1px; border-image: initial; border-left-color: rgb(204, 204, 204); border-left-style: dashed; border-left-width: 1px; border-right-color: rgb(204, 204, 204); border-right-style: dashed; border-right-width: 1px; border-top-color: rgb(204, 204, 204); border-top-style: dashed; border-top-width: 1px; height: auto; overflow-x: auto; overflow-y: auto; padding-bottom: 0px; padding-left: 0px; padding-right: 0px; padding-top: 0px; width: 646px;&quot;&gt;&lt;code style=&quot;font-family: arial; font-size: 12px; line-height: 20px; word-wrap: normal;&quot;&gt; &lt;/code&gt;&lt;span style=&quot;background-color: transparent; font-size: 12px; line-height: 20px;&quot;&gt;static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
 if (cpu_has_vmx_invvpid_single())
  vpid_sync_vcpu_single(vmx);
 else
  vpid_sync_vcpu_global();
}&lt;/span&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;line-height: 20px; white-space: pre;&quot;&gt;This function calls the corresponding invalidation type, previously described. The&amp;nbsp;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;line-height: 20px; white-space: pre;&quot;&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;vpid_sync_vcpu_single()&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt; routine &lt;/span&gt;obviously must pass the &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;vmx-&amp;gt;vpid &lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;in order to specify which id it&#39;s referring to.&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;line-height: 20px; white-space: pre;&quot;&gt;Both global and single contexts end up calling&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt; __invvpid(),&lt;/span&gt; which does all the assembler work. &lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: monospace;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The VPID feature can be enabled/disabled by traditional kernel module parameters &amp;nbsp;at&amp;nbsp;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;/sys/module/kvm_intel/parameters/vpid&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
A while ago I proposed a &lt;a href=&quot;https://lkml.org/lkml/2012/3/11/94&quot;&gt;patch&lt;/a&gt; to enable tracing vpid management for simulating tagged TLB&amp;nbsp;behavior&amp;nbsp;and performance. Unfortunately tracing these events for experimentation/research did not suit mainstream enough to be&amp;nbsp;officially&amp;nbsp;merged. Understandable.&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/6159280493924136818/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/05/kvm-intel-associative-tlbs.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/6159280493924136818'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/6159280493924136818'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/05/kvm-intel-associative-tlbs.html' title='kvm: Intel associative TLBs'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-7091395055513332572</id><published>2012-05-03T15:02:00.002-07:00</published><updated>2012-09-28T07:40:18.981-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="concurrency"/><category scheme="http://www.blogger.com/atom/ns#" term="development"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="locks"/><category scheme="http://www.blogger.com/atom/ns#" term="lslk"/><category scheme="http://www.blogger.com/atom/ns#" term="lslocks"/><title type='text'>linux local system locks</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;lslk(8)&lt;/span&gt; program has been unmaintained and deprecated for over a decade now, since 2001. &amp;nbsp;I&#39;ve recently rewritten the tool, now called &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;lslocks(8)&lt;/span&gt;&lt;span style=&quot;font-family: Arial, Helvetica, sans-serif;&quot;&gt;&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;that allows an easier and up-to-date way of seeing all the current file held locks in a Linux system. This program will be shipping soon with standard system tools and available in your&amp;nbsp;&lt;/span&gt;favorite&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;distribution.&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;Some important modifications include removing legacy Unix outputs and options, for example:&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;ul style=&quot;text-align: left;&quot;&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;Don&#39;t output inode number, whence and maj:min device numbers.&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;Don&#39;t provide nonblocking syscall options stat(2) and readlink(2).&amp;nbsp;&lt;/li&gt;
&lt;ul&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;The option to use nonblocking calls was previously intended for NFS partitions;&amp;nbsp;however this should be transparent to utility programs considering that&amp;nbsp;timeouts can occur generically in any context (fuse - sshfs, NFS, netdevs, etc).&lt;/li&gt;
&lt;/ul&gt;
&lt;/ul&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The command itself is quite straightforward - KISS:&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; $&amp;gt; lslocks   
 COMMAND      PID TYPE SIZE MODE M   START    END PATH  
 smbd       1379 POSIX  5B WRITE 0     0     0 /var/run/samba/smbd.pid  
 smbd       1379 POSIX 696B READ 0     4     4 /var/run/samba/messages.tdb  
 ...
 smbd       1379 POSIX 696B READ 0     4     4 /var/run/samba/gencache_notrans.tdb  
 smbd       1513 POSIX 696B READ 0     4     4 /var/run/samba/messages.tdb  
 (unknown)    1717 FLOCK  0B WRITE 0     0     0 /var/run  
 atd       1793 POSIX  5B WRITE 0     0     0 /var/run/atd.pid  
 sendmail-mta   2004 POSIX  52B WRITE 0     0     0 /var/run/sendmail/mta/sendmail.pid  
 nmbd       2292 POSIX  5B WRITE 0     0     0 /var/run/samba/nmbd.pid  
 nmbd       2292 POSIX 696B READ 0     4     4 /var/run/samba/messages.tdb  
 nmbd       2292 POSIX 108K READ 0     4     4 /var/run/samba/connections.tdb  
 cat       3221 POSIX  0B WRITE 0     0     0 /home/dave/.local/share/zeitgeist/fts.index/flintlock  
 zeitgeist-daemo 3211 POSIX 989K WRITE 0 1073741824 1073742335 /home/dave/.local/share/zeitgeist/activity.sqlite  
 chromium-browse 4306 POSIX 202K WRITE 0 1073741824 1073742335 /home/dave/.config/chromium/Default/Web Data  
 ...
&lt;/code&gt;&lt;/pre&gt;
&lt;pre style=&quot;background-attachment: initial; background-clip: initial; background-color: #f0f0f0; background-image: initial; background-origin: initial; background-position: initial initial; background-repeat: initial initial; border-bottom-color: rgb(204, 204, 204); border-bottom-style: dashed; border-bottom-width: 1px; border-image: initial; border-left-color: rgb(204, 204, 204); border-left-style: dashed; border-left-width: 1px; border-right-color: rgb(204, 204, 204); border-right-style: dashed; border-right-width: 1px; border-top-color: rgb(204, 204, 204); border-top-style: dashed; border-top-width: 1px; color: black; font-size: 12px; height: auto; line-height: 20px; overflow-x: auto; overflow-y: auto; padding-bottom: 0px; padding-left: 0px; padding-right: 0px; padding-top: 0px; text-align: left; width: 99%;&quot;&gt;&lt;/pre&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
We can quickly see the command name and PID that currently cause a lock to be held, as well as its size and canonical path. The lock itself, can be&amp;nbsp;&amp;nbsp;FLOCK (created with &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;flock(2)&lt;/span&gt;) or POSIX (created with &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;fcntl(2)&lt;/span&gt; and &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;lockf(2)&lt;/span&gt;) - I won&#39;t go over explaining the differences as it&#39;s not in the context of this post. The start and end are the relative byte offset of the lock.&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Enjoy!&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/7091395055513332572/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/05/linux-local-system-locks.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/7091395055513332572'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/7091395055513332572'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/05/linux-local-system-locks.html' title='linux local system locks'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-1595811657033117733</id><published>2012-03-05T10:52:00.001-08:00</published><updated>2012-09-28T07:39:14.814-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="hardware"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="kvm"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="memory management"/><category scheme="http://www.blogger.com/atom/ns#" term="mmu"/><category scheme="http://www.blogger.com/atom/ns#" term="x86"/><title type='text'>kvm: virtual x86 mmu setup</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
One of the initialization steps that KVM does when a virtual machine (VM) is started, is setting up the vCPU&#39;s memory management unit (MMU) to translate virtual (linear) addresses into physical ones within the guest&#39;s domain. For x86, which is what will be covered here, most of the corresponding code is in&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt; &amp;lt;kernel&amp;gt;/arch/x86/kvm/mmu.c&lt;/span&gt;.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;b&gt;&lt;i&gt;Disclaimer:&lt;/i&gt;&lt;/b&gt;&amp;nbsp;Although this document requires at least some basic knowledge of x86 paging and traditional virtual memory, I hope it can be useful for people that are interested in low-level virtualization, linux kernel and/or KVM internals in general.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The first step calls&amp;nbsp;&lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;kvm_mmu_setup()&lt;/span&gt;&lt;/b&gt;&amp;nbsp;which simply does some trivial asserting and calls&amp;nbsp;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;b&gt;init_kvm_mmu()&lt;/b&gt;&lt;/span&gt;:&lt;/div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; static int init_kvm_mmu(struct kvm_vcpu *vcpu)  
 {  
      if (mmu_is_nested(vcpu))  
           return init_kvm_nested_mmu(vcpu);  
      else if (tdp_enabled)  
           return init_kvm_tdp_mmu(vcpu);  
      else  
           return init_kvm_softmmu(vcpu);  
 }  
&lt;/code&gt;&lt;/pre&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The first check is regarding nested MMUs, which is to run VMMs within guests, having yet another layer of indirection. This is part of the &lt;b&gt;Turtles project&lt;/b&gt; and won&#39;t be covered in this document, but it is well documented &lt;a href=&quot;http://www.mulix.org/pubs/turtles/h-0282.pdf&quot;&gt;elsewhere&lt;/a&gt;.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;tdp_enabled&lt;/span&gt;&amp;nbsp;(two dimensional paging) boolean variable determines whether or not hardware assisted paging (EPT or RVI/NPT) is enabled. &amp;nbsp;If true, it will use 2D paging, otherwise, the default option, shadow paging through software only support. Since KVM can be built as a kernel module, it uses the user&#39;s options to set the variable&#39;s value, with &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;kvm_enable_tdp()&lt;/span&gt;&lt;/b&gt; and &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;kvm_disable_tdp()&lt;/span&gt;&lt;/b&gt;. For example, users can check&amp;nbsp;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;/sys/module/kvm_intel/parameters/ept&lt;/span&gt;&amp;nbsp;to verify if EPT is enabled or not. Most distributions will load the module with it enabled, anyway:&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;pre style=&quot;background-attachment: initial; background-clip: initial; background-color: #f0f0f0; background-image: initial; background-origin: initial; border-bottom-color: rgb(204, 204, 204); border-bottom-style: dashed; border-bottom-width: 1px; border-left-color: rgb(204, 204, 204); border-left-style: dashed; border-left-width: 1px; border-right-color: rgb(204, 204, 204); border-right-style: dashed; border-right-width: 1px; border-top-color: rgb(204, 204, 204); border-top-style: dashed; border-top-width: 1px; color: black; font-size: 12px; height: auto; line-height: 20px; margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px; overflow-x: auto; overflow-y: auto; padding-bottom: 0px; padding-left: 0px; padding-right: 0px; padding-top: 0px; text-align: left; width: 646px;&quot;&gt;#&amp;gt; modprobe kvm_intel ept=1&lt;/pre&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Both &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;init_kvm_tdp_mmu()&lt;/span&gt;&lt;/b&gt; and &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;init_kvm_softmmu()&lt;/span&gt;&lt;/b&gt;&amp;nbsp;are responsible for setting up how the guest&#39;s page walking will be handled, by populating the &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;walk_mmu&lt;/span&gt; structure. This structure abstracts the details of architecture-specific paging modes, allowing common operations like loading and setting CR3 for upper page level base pointer, flushing TLB entries (&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;invlpg&lt;/span&gt;) and page fault handling, among others.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Just like in traditional, non-virtualized environments, the guest&#39;s MMU must be capable of handling paging in 32bit, PAE, 64bit, optionally it can have paging disabled, so guest virtual addresses (gva) are the actual guest physical addresses (gpa), mapped 1:1. This is quite obvious since the guest does not know that its MMU is the one KVM presents to it, and not the real, physical one - making everything transparent - which is not the case for paravirtualization, like Xen.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;span style=&quot;font-size: large;&quot;&gt;&lt;u&gt;Hardware support initialization&lt;/u&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style=&quot;font-family: inherit;&quot;&gt;Most logic is done in this single function:&lt;/span&gt;&lt;br /&gt;
&lt;pre style=&quot;background-color: #eeeeee; border: 1px dashed #999999; color: black; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding: 5px; width: 100%;&quot;&gt;&lt;code&gt;static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
    struct kvm_mmu *context = vcpu-&amp;gt;arch.walk_mmu;

    context-&amp;gt;base_role.word = 0;
    context-&amp;gt;new_cr3 = nonpaging_new_cr3;
    context-&amp;gt;page_fault = tdp_page_fault;
    context-&amp;gt;free = nonpaging_free;
    context-&amp;gt;sync_page = nonpaging_sync_page;
    context-&amp;gt;invlpg = nonpaging_invlpg;
    context-&amp;gt;update_pte = nonpaging_update_pte;
    context-&amp;gt;shadow_root_level = kvm_x86_ops-&amp;gt;get_tdp_level();
    context-&amp;gt;root_hpa = INVALID_PAGE;
    context-&amp;gt;direct_map = true;
    context-&amp;gt;set_cr3 = kvm_x86_ops-&amp;gt;set_tdp_cr3;
    context-&amp;gt;get_cr3 = get_cr3;
    context-&amp;gt;get_pdptr = kvm_pdptr_read;
    context-&amp;gt;inject_page_fault = kvm_inject_page_fault;

    if (!is_paging(vcpu)) {
        context-&amp;gt;nx = false;
        context-&amp;gt;gva_to_gpa = nonpaging_gva_to_gpa;
        context-&amp;gt;root_level = 0;
    } else if (is_long_mode(vcpu)) {       
        context-&amp;gt;nx = is_nx(vcpu);
        reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
        context-&amp;gt;gva_to_gpa = paging64_gva_to_gpa;
        context-&amp;gt;root_level = PT64_ROOT_LEVEL;
    } else if (is_pae(vcpu)) {
        context-&amp;gt;nx = is_nx(vcpu);
        reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
        context-&amp;gt;gva_to_gpa = paging64_gva_to_gpa;
        context-&amp;gt;root_level = PT32E_ROOT_LEVEL;
    } else {
        context-&amp;gt;nx = false;
        reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
        context-&amp;gt;gva_to_gpa = paging32_gva_to_gpa;
        context-&amp;gt;root_level = PT32_ROOT_LEVEL;
    }


    return 0;
}
&lt;/code&gt;&lt;/pre&gt;
&lt;div&gt;
&lt;ol style=&quot;text-align: left;&quot;&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;The &lt;b&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;is_paging()&lt;/span&gt;&lt;/b&gt; function simply checks the vCPU&#39;s &lt;a href=&quot;http://www.sandpile.org/x86/mode.htm&quot;&gt;CR0.PG&lt;/a&gt; flag to see if paging is enabled or not - this will most likely be enabled!&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;The &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;b&gt;is_long_mode()&lt;/b&gt;&lt;/span&gt; checks if the guest has a 64bit vCPU, by reading the&amp;nbsp;&lt;a href=&quot;http://www.sandpile.org/x86/mode.htm&quot;&gt;EFER.LMA&lt;/a&gt; (long mode active) bit, assuming, of course, CONFIG_X86_64 is set, since 64bit guests &lt;a href=&quot;http://www.linux-kvm.org/page/FAQ#Can_KVM_run_a_32-bit_guest_on_a_64-bit_host.3F_What_about_PAE.3F&quot;&gt;cannot&lt;/a&gt; run on 32bit hosts.&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;If PAE is enabled, then &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace; font-weight: bold;&quot;&gt;is_pae()&lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;&#39;s &lt;a href=&quot;http://www.sandpile.org/x86/mode.htm&quot;&gt;CR4.PAE&lt;/a&gt;&amp;nbsp;check will return&amp;nbsp;&lt;/span&gt;successfully&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;and indicate that the physical address&amp;nbsp;&lt;/span&gt;extension&lt;span style=&quot;font-family: inherit;&quot;&gt;&amp;nbsp;is present, and the 32bit guest can reference more than 4Gb of address space.&lt;/span&gt;&lt;/li&gt;
&lt;li style=&quot;text-align: justify;&quot;&gt;Finally, if the above three checks fail, it&#39;s assumed that the guest works in standard 32bit mode.&lt;/li&gt;
&lt;/ol&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
No matter what mode is set, no-execution bits, rsvds bits, what function will handle gva to gpa translation and the paging&#39;s root level is set:&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;-&amp;gt;nx&lt;/span&gt; flag refers to No-eXecution bits to separate areas of memory from being executed, avoiding buffer overflow attacks. This is obtained by checking vCPU&#39;s&amp;nbsp;&lt;a href=&quot;http://www.sandpile.org/x86/mode.htm&quot;&gt;EFER.NX&lt;/a&gt;&amp;nbsp;flag.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;-&amp;gt;gva_to_gpa&lt;/span&gt; is the function that will handle guest&#39;s virtual to physical translations, discussed &lt;a href=&quot;http://blog.stgolabs.net/2012/03/kvm-hardware-assisted-paging.html&quot;&gt;here&lt;/a&gt;. When paging is disabled, the gpa is returned, and for the other modes, &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;gva_to_gpa()&lt;/span&gt; is the same function (defined in&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt; paging_tmpl.h&lt;/span&gt;), but varies according to the root level and paging mode.&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The &lt;b style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;reset_rsvds_bits_mask()&amp;nbsp;&lt;/b&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;function just sets the reserved bits mask for the guest&#39;s page table entries.&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;b&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Finally, the page walker&#39;s &lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;-&amp;gt;root_level &lt;/span&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;refers to the amount of hierarchical levels of guest&#39;s paging. With the standard 4k page size, 64bits will have four (PML4, PDP, PD, PTE), 32bits will have two (PD, PTE) and PAE will have three (PDP, PD, PTE). If paging is disabled, there obviously won&#39;t be any levels to walk.&lt;/span&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span style=&quot;font-size: large;&quot;&gt;&lt;u&gt;Software support initialization&lt;/u&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Unlike hardware support, most of the work for setting up software MMU and shadow page is done by &lt;b style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;kvm_init_shadow_mmu()&lt;/b&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;,&lt;/span&gt;&lt;b style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&amp;nbsp;&lt;/b&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt;while&lt;/span&gt;&lt;b style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt; init_kvm_softmmu()&lt;/b&gt;&lt;span style=&quot;font-family: inherit;&quot;&gt; simply calls it and later sets control register 3, page directory pointer and how the VMM will emulate (inject) and propagate the page faults.&lt;/span&gt;&lt;/div&gt;
&lt;b&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;br /&gt;&lt;/span&gt;&lt;/b&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; static int init_kvm_softmmu(struct kvm_vcpu *vcpu)  
 {  
      int r = kvm_init_shadow_mmu(vcpu, vcpu-&amp;gt;arch.walk_mmu);  

      vcpu-&amp;gt;arch.walk_mmu-&amp;gt;set_cr3           = kvm_x86_ops-&amp;gt;set_cr3;  
      vcpu-&amp;gt;arch.walk_mmu-&amp;gt;get_cr3           = get_cr3;  
      vcpu-&amp;gt;arch.walk_mmu-&amp;gt;get_pdptr         = kvm_pdptr_read;  
      vcpu-&amp;gt;arch.walk_mmu-&amp;gt;inject_page_fault = kvm_inject_page_fault;  
      return r;  
 }  
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
The &lt;b&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;kvm_init_shadow_mmu()&lt;/span&gt;&lt;/b&gt; function is quite similar to what was discussed above, based on the paging modes, it sets how the walker will work&amp;nbsp;&lt;b&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;paging32_init_context_common() &lt;/span&gt;&lt;/b&gt;and&amp;nbsp;&lt;b&gt;&lt;span style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;paging64_init_context_common()&lt;/span&gt;&lt;/b&gt;, for 64bit and PAE systems.&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/1595811657033117733/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/03/kvm-virtual-x86-mmu-setup.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/1595811657033117733'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/1595811657033117733'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/03/kvm-virtual-x86-mmu-setup.html' title='kvm: virtual x86 mmu setup'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-5696484881751133998</id><published>2012-03-03T16:11:00.000-08:00</published><updated>2012-03-09T05:20:51.107-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="ept"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="kvm"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="memory management"/><category scheme="http://www.blogger.com/atom/ns#" term="paging"/><category scheme="http://www.blogger.com/atom/ns#" term="shadow pages"/><category scheme="http://www.blogger.com/atom/ns#" term="virtualization"/><category scheme="http://www.blogger.com/atom/ns#" term="x86"/><title type='text'>kvm: hardware assisted paging</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
CPU vendors began adding hardware virtual memory management unit (vMMU) support circa 2009, with Intel&#39;s VT-x (vmx flag) addition. Historically, the guest&#39;s physical (gpa) to host physical &amp;nbsp;(hpa) addresses were translated through software, using shadow page tables. These tables are kept synchronized with the guest&#39;s page tables, and are one of the main sources of overhead in virtual machines, as they incur expensive VM exits. A common way of keeping the shadow pages up to date is to write-protect the guest&#39;s pages, so that when they are changed, page faults are triggered and intercepted by the VMM, which emulates it (injecting the page) and updates the shadow ones accordingly. This, of course, is transparent to the guest. Another major problem, is that TLB semantics require flushes upon context switching, as newly assigned processes need to have it empty to cache entries&amp;nbsp;only belonging&amp;nbsp;to the process&#39;s address space. To overcome this, CPUs now incorporate tags into the TLB - also known as &lt;i&gt;vpid&lt;/i&gt;, which allow mappings that associate addresses to processes and thus reduce the amount of flushes.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
With hardware vMMUs, i&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;n order to avoid the VMM overhead with shadow paging, the guest is left alone to update its
page tables, while the hardware maintains its own&amp;nbsp;page tables which maps gpa to hpa. Intel calls these Extended Page Tables (EPT).&amp;nbsp;&lt;/span&gt;Having
two page tables now requires that when a guest translates an address, two levels must be walked (sometimes
referred to as 2D page walks). So hardware support can come at a greater cost for &lt;b&gt;programs with bad locality&lt;/b&gt; and cache unfriendly, than its software equivalent. When a TLB miss occurs, and the guest does a page walk, for each hierarchical level, the entire EPT must be walked as well, to obtain the hpa. For 64bit guests, this is worse than 32bit ones, &amp;nbsp;as the 64bit address space requires more levels (PML4, PDP, PD, PTE) of translation.&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
KVM&#39;s implementation of EPT is quite unique and uses both the guest&#39;s tables and the hardware&#39;s to translate addresses.&amp;nbsp;When a guest needs to translate virtual addresses to physical ones, the &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;gva_to_gpa()&lt;/span&gt;&lt;/b&gt; function is called:&lt;br /&gt;
&lt;br /&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,  
                                struct x86_exception *exception)  
 {  
      struct guest_walker walker;  
      gpa_t gpa = UNMAPPED_GVA;  
      int r;  
      r = FNAME(walk_addr)(&amp;amp;walker, vcpu, vaddr, access);  
      if (r) {  
           gpa = gfn_to_gpa(walker.gfn);  
           gpa |= vaddr &amp;amp; ~PAGE_MASK;  
      } else if (exception)  
           *exception = walker.fault;  
      return gpa;  
 }  
&lt;/code&gt;&lt;/pre&gt;
&lt;br /&gt;
If the guest&#39;s walk fails and the gva-gpa mapping is not present, a page fault is raised, and &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;tdp_page_fault()&lt;/span&gt;&lt;/b&gt; - two dimensional paging - is invoked through an EPT violation - &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;handle_ept_violation()&lt;/span&gt;&lt;/b&gt; to translate gpa to hpa. A new page table entry is created and the shadow page code is reused through &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;mmu_set_spte()&lt;/span&gt;&lt;/b&gt; and added to the beginning of the page list through &lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;pte_list_add()&lt;/span&gt;&lt;/b&gt;. This way, the next time the guest virtual address is accessed, it will already be in the guest&#39;s pages and&lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt; walk_addr()&lt;/span&gt;&lt;/b&gt; will be done successfully, and the gpa can be returned without further ado.&amp;nbsp;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/5696484881751133998/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/03/kvm-hardware-assisted-paging.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5696484881751133998'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/5696484881751133998'/><link rel='alternate' type='text/html' 
href='http://blog.stgolabs.net/2012/03/kvm-hardware-assisted-paging.html' title='kvm: hardware assisted paging'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-2702944708370417142</id><published>2012-01-30T16:09:00.000-08:00</published><updated>2012-01-30T16:27:25.954-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="development"/><category scheme="http://www.blogger.com/atom/ns#" term="linux inode filename filesystem symlinks"/><title type='text'>inode to filename</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
We normally have a file&#39;s canonical/absolute path, and with that we can get just about any details from it, usually through &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;stat(2)-family&lt;/span&gt;. What about when we have the inode number? I had to come up with this little ugly function to parse (luckily&amp;nbsp;I also had the PID) procfs and go comparing all the files... we can do better!&lt;br /&gt;
&lt;br /&gt;
I&#39;m hoping someone can tell me a more straightforward way of doing this - specially considering that something very similar will be in upcoming linux distros.&lt;br /&gt;
&lt;br /&gt;
&lt;a href=&quot;http://pastebin.com/5TxB4TMa&quot;&gt;http://pastebin.com/5TxB4TMa&lt;/a&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/2702944708370417142/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/01/inode-to-filename.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/2702944708370417142'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/2702944708370417142'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/01/inode-to-filename.html' title='inode to filename'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-4818443475957085176</id><published>2012-01-28T14:54:00.000-08:00</published><updated>2012-02-04T17:02:54.444-08:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="books"/><category scheme="http://www.blogger.com/atom/ns#" term="computer science"/><category scheme="http://www.blogger.com/atom/ns#" term="development"/><category scheme="http://www.blogger.com/atom/ns#" term="systems"/><category scheme="http://www.blogger.com/atom/ns#" term="unix"/><title type='text'>an (incomplete) list of indispensable systems books</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
If you&#39;re interested in a career in computer systems, here is an unsorted list of books you should get your hands on. Some are UNIX related, but hey, that&#39;s my area of knowledge and, in one way or another, they have all helped me grow as a computer scientist.&lt;br /&gt;
&lt;div&gt;
&lt;ul style=&quot;text-align: left;&quot;&gt;
&lt;li&gt;Kernighan, Brian and Ritchie, Dennis. &lt;a href=&quot;http://www.amazon.com/Programming-Language-2nd-Brian-Kernighan/dp/0131103628/ref=sr_1_1?ie=UTF8&amp;amp;qid=1327789528&amp;amp;sr=8-1&quot;&gt;The C Programming Language (2nd Ed.)&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Hennessy, John L. and Patterson, David A. &lt;a href=&quot;http://www.amazon.com/Computer-Architecture-Fifth-Quantitative-Approach/dp/012383872X/ref=sr_1_1?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327789624&amp;amp;sr=1-1&quot;&gt;Computer Architecture: A Quantitative Approach&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Bach, Maurice J.&amp;nbsp;&lt;a href=&quot;http://www.amazon.com/Design-Operating-System-Prentice-Hall-Software/dp/0132017997/ref=sr_1_1?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327789785&amp;amp;sr=1-1&quot;&gt;The Design of the UNIX Operating System&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Silberschatz, Abraham. and Galvin, Peter. and Gagne, Greg.&amp;nbsp;&lt;a href=&quot;http://www.amazon.com/Operating-System-Concepts-Abraham-Silberschatz/dp/0470128720/ref=sr_1_1?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327789860&amp;amp;sr=1-1&quot;&gt;Operating Systems Concepts&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Stevens, Richard W. and Rago, Stephen A.&amp;nbsp;&lt;a href=&quot;http://www.amazon.com/Programming-Environment-Addison-Wesley-Professional-Computing/dp/0321525949/ref=sr_1_2?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327790121&amp;amp;sr=1-2&quot;&gt;Advanced Programming in the UNIX Environment&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Raymond, Eric S.&amp;nbsp;&lt;a href=&quot;http://www.amazon.com/Art-UNIX-Programming-Eric-Raymond/dp/0131429019/ref=sr_1_8?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327789785&amp;amp;sr=1-8&quot;&gt;The Art of UNIX Programming&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Duntemann, Jeff.&amp;nbsp;&lt;a href=&quot;http://www.amazon.com/Assembly-Language-Step---step-Programming/dp/0471375233/ref=sr_1_11?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327790222&amp;amp;sr=1-11&quot;&gt;Assembly Language Step by Step (2nd Ed.)&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Kernighan, Brian and Pike, Rob.&amp;nbsp;&lt;a href=&quot;http://www.amazon.com/Practice-Programming-Brian-W-Kernighan/dp/020161586X/ref=sr_1_3?ie=UTF8&amp;amp;qid=1327790352&amp;amp;sr=8-3&quot;&gt;The Practice of Programming&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Love, Robert. &lt;a href=&quot;http://www.amazon.com/Linux-Kernel-Development-Robert-Love/dp/0672329468/ref=sr_1_sc_1?s=books&amp;amp;ie=UTF8&amp;amp;qid=1327790436&amp;amp;sr=1-1-spell&quot;&gt;Linux Kernel Development (3rd Ed.)&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/4818443475957085176/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2012/01/incomplete-list-of-indispensable.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/4818443475957085176'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/4818443475957085176'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2012/01/incomplete-list-of-indispensable.html' title='an (incomplete) list of indispensable systems books'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-5789291509148224079.post-2374035049709283101</id><published>2011-12-18T10:36:00.000-08:00</published><updated>2012-09-28T07:36:48.518-07:00</updated><category scheme="http://www.blogger.com/atom/ns#" term="assembler"/><category scheme="http://www.blogger.com/atom/ns#" term="attributes"/><category scheme="http://www.blogger.com/atom/ns#" term="caches"/><category scheme="http://www.blogger.com/atom/ns#" term="cpu"/><category scheme="http://www.blogger.com/atom/ns#" term="cpuid"/><category scheme="http://www.blogger.com/atom/ns#" term="kernel"/><category scheme="http://www.blogger.com/atom/ns#" term="linux"/><category scheme="http://www.blogger.com/atom/ns#" term="processor"/><category scheme="http://www.blogger.com/atom/ns#" term="x86"/><title type='text'>linux and processor attributes</title><content type='html'>&lt;div dir=&quot;ltr&quot; style=&quot;text-align: left;&quot; trbidi=&quot;on&quot;&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
I was having some trouble finding my CPU&#39;s TLB page size and data entries a few days ago, and it&#39;s no mystery that Intel provides very poor specs in this specific area. I couldn&#39;t see it exported from Linux either (although it *does* list it in /proc/cpuinfo, depending on the L1/L2 cache sizes and attributes, but that&#39;s another story).&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
To overcome this I was forced to write my own little program that uses the &lt;b&gt;cpuid&lt;/b&gt; instruction (x86-family specific, introduced in the early 90s) to obtain processor attributes like vendor/model, cache sizes, flags, etc. Since this is the way the kernel actually gets the information, it might be useful to share some basic information of this feature... again, this is x86 only.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
This instruction reads the EAX register to know what information the caller is asking for, and with 0 it will return all the available attributes; thus a smart implementation will use this first, then decide if the information we want is available. The outputs of the instruction are loaded into &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;EAX&lt;/span&gt; (yes, this is input and output), &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;EBX&lt;/span&gt;, &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;ECX&lt;/span&gt; and &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;EDX&lt;/span&gt;, and to use it all we need to know are the register bit offsets, well documented &lt;a href=&quot;http://www.sandpile.org/x86/cpuid.htm&quot;&gt;here&lt;/a&gt;.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Calling cpuid in C is quite trivial, just specify the operation level, load it into &lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;EAX&lt;/span&gt; and return the out registers to return the value(s) through reference:&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt; void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,  
       unsigned int *ecx, unsigned int *edx)  
 {  
  __asm__(  
  &quot;cpuid;&quot;  
  : &quot;=b&quot; (*ebx), &quot;=a&quot; (*eax),&quot;=c&quot; (*ecx),&quot;=d&quot; (*edx)  
  : &quot;1&quot; (op), &quot;c&quot;(0));  
 }  
&lt;/code&gt;&lt;/pre&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
Now, say we want to get the size of the L2 cache and some power management information, so by looking at the &lt;a href=&quot;http://www.sandpile.org/x86/cpuid.htm&quot;&gt;reference&lt;/a&gt;&amp;nbsp;I know to load&amp;nbsp;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;0x80000006&amp;nbsp;&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;and &lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;ECX&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt; will hold my data in bits 31-16 for the L2 size in Kb, and the CPU thermal monitoring in bit 4 of EDX when loading level&amp;nbsp;&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;0x80000007&lt;/span&gt;. So we have:&lt;/div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;pre style=&quot;background-image: URL(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi64Lr_VvHGv8ertaDsasLVFpAl2QN8C251CJthu4PBll3Z8HPLLm3o2sPau-FaPEnvN3SrkmpSi-rmzXBtXboy8PvTcz3SqZ3NAFfNc_hM2KWi5dPz29TnaBLYzNNa3GPDl8DgnFhzYilW/s320/codebg.gif); background: #f0f0f0; border: 1px dashed #CCCCCC; color: black; font-family: arial; font-size: 12px; height: auto; line-height: 20px; overflow: auto; padding: 0px; text-align: left; width: 99%;&quot;&gt;&lt;code style=&quot;color: black; word-wrap: normal;&quot;&gt;      unsigned int eax, ebx, ecx, edx;  
      cpuid(0x80000006, &amp;amp;eax, &amp;amp;ebx, &amp;amp;ecx, &amp;amp;edx);  
      printf(&quot;my L2 cache is %dKb\n&quot;, ecx&amp;gt;&amp;gt;16);  
      cpuid(0x80000007, &amp;amp;eax, &amp;amp;ebx, &amp;amp;ecx, &amp;amp;edx);  
      printf(&quot;my EPM thermal monitor is %u\n&quot;, (edx &amp;gt;&amp;gt; 4) &amp;amp; 1);  
&lt;/code&gt;&lt;/pre&gt;
&lt;div&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;&lt;/span&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;The kernel does exactly this to determine the processor(s) information, of course with a little more precaution and&amp;nbsp;&lt;/span&gt;optimization&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;, but in the end what you see in &lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;/proc/cpuinfo&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt; (and therefore &lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;lscpu&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;) is a result of &lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;cpuid&lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;.&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;div style=&quot;text-align: justify;&quot;&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt;For further details read &lt;/span&gt;&lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;cpu_detect_cache_size()&lt;/span&gt;&lt;/b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt; and &lt;/span&gt;&lt;b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;cpu_get_model_name()&lt;/span&gt;&lt;/b&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: inherit;&quot;&gt; in &lt;/span&gt;&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;arch/x86/kernel/cpu/common.c&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;
&lt;div&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;happy hacking!&lt;/span&gt;&lt;/div&gt;
&lt;div&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;br /&gt;
&lt;/span&gt;&lt;/div&gt;
&lt;div&gt;
&lt;span class=&quot;Apple-style-span&quot; style=&quot;font-family: &#39;Courier New&#39;, Courier, monospace;&quot;&gt;&lt;/span&gt;&lt;/div&gt;
&lt;/div&gt;
</content><link rel='replies' type='application/atom+xml' href='http://blog.stgolabs.net/feeds/2374035049709283101/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://blog.stgolabs.net/2011/12/linux-and-processor-attributes.html#comment-form' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/2374035049709283101'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/5789291509148224079/posts/default/2374035049709283101'/><link rel='alternate' type='text/html' href='http://blog.stgolabs.net/2011/12/linux-and-processor-attributes.html' title='linux and processor attributes'/><author><name>Unknown</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='https://img1.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry></feed>