Struct Field Alignment
Optimize struct memory layout by understanding CPU alignment, padding, false sharing, and cache lines.
CPU Cache Architecture and Memory Alignment
Modern CPUs organize memory access around 64-byte cache lines (on most x86-64 and ARM64 processors). This is the smallest unit of data that can be transferred from main memory to the L1 cache. When the CPU needs to access a single byte, it loads the entire 64-byte cache line into cache. This architectural feature fundamentally impacts struct layout optimization.
Alignment Fundamentals
Each data type has an alignment requirement based on its size:
| Type | Size | Alignment Requirement |
|---|---|---|
bool, uint8, int8 | 1 byte | 1 (no alignment) |
uint16, int16 | 2 bytes | 2 |
uint32, int32, float32 | 4 bytes | 4 |
uint64, int64, float64, *T | 8 bytes | 8 |
string | 16 bytes | 8 (pointer-sized) |
[]T | 24 bytes | 8 (slice header) |
CPUs read data most efficiently when the address is a multiple of the type's alignment requirement (which, for primitive types, equals its size). Misaligned access requires multiple memory operations: the CPU must load two cache lines, extract the data from both, and combine them. This adds 1-3 extra cycles per misaligned access. A struct's overall alignment is the largest alignment requirement among its fields, and its size is rounded up to a multiple of that alignment.
Memory Layout Example
package main
import (
"fmt"
"unsafe"
)
// Unaligned demonstrates wasteful layout: small fields sandwich a larger
// one, forcing padding both before b and at the end of the struct.
type Unaligned struct {
	a uint8 // 1 byte at offset 0
	// 3 bytes padding (rounds to 4-aligned for next field)
	b uint32 // 4 bytes at offset 4, requires 4-aligned address
	c uint8 // 1 byte at offset 8
	// 3 bytes trailing padding: struct alignment is 4 (largest field is
	// uint32), so the 9 bytes of fields round up to 12 bytes — not 16.
}

// Aligned places the largest field first so the two 1-byte fields pack
// tightly behind it; only trailing padding remains.
type Aligned struct {
	b uint32 // 4 bytes at offset 0, 4-aligned
	a uint8 // 1 byte at offset 4
	c uint8 // 1 byte at offset 5
	// 2 bytes padding (rounds struct to 8 bytes)
}

// WellAligned has the same field order — and therefore the identical
// 8-byte layout — as Aligned; it exists to label the packed version.
type WellAligned struct {
	b uint32 // 4 bytes
	a uint8 // 1 byte
	c uint8 // 1 byte
	// a and c pack into the slot behind b
	// Total: 6 bytes of data, padded to 8
}
// main prints the size and field offsets of each example struct so the
// padding described above can be observed directly.
// BUG FIX: the original declared w but never used it — "declared and not
// used" is a compile error in Go — so WellAligned is now printed too.
func main() {
	u := Unaligned{}
	a := Aligned{}
	w := WellAligned{}
	fmt.Printf("Unaligned struct:\n")
	fmt.Printf(" Size: %d bytes\n", unsafe.Sizeof(u))
	fmt.Printf(" a offset: %d, b offset: %d, c offset: %d\n",
		unsafe.Offsetof(u.a), unsafe.Offsetof(u.b), unsafe.Offsetof(u.c))
	// Unaligned: size=12 bytes (6 bytes wasted padding: 50% overhead).
	// Struct alignment is 4 (uint32), so 9 bytes of fields round to 12,
	// not the 16 claimed originally.
	fmt.Printf("Aligned struct:\n")
	fmt.Printf(" Size: %d bytes\n", unsafe.Sizeof(a))
	fmt.Printf(" b offset: %d, a offset: %d, c offset: %d\n",
		unsafe.Offsetof(a.b), unsafe.Offsetof(a.a), unsafe.Offsetof(a.c))
	// Aligned: size=8 bytes (2 bytes padding: 25% overhead)
	fmt.Printf("WellAligned struct:\n")
	fmt.Printf(" Size: %d bytes\n", unsafe.Sizeof(w))
}
// Memory layout visualization (hex addresses):
// Unaligned:
//   0x00-0x00: a (uint8)  [1 byte]
//   0x01-0x03: padding    [3 bytes]
//   0x04-0x07: b (uint32) [4 bytes]
//   0x08-0x08: c (uint8)  [1 byte]
//   0x09-0x0b: padding    [3 bytes] (size rounds up to 4-byte alignment)
//   Total: 12 bytes
// Aligned:
//   0x00-0x03: b (uint32) [4 bytes]
//   0x04-0x04: a (uint8)  [1 byte]
//   0x05-0x05: c (uint8)  [1 byte]
//   0x06-0x07: padding    [2 bytes]
//   Total: 8 bytes
False Sharing and Cache Line Contention
False sharing occurs when two goroutines on different CPU cores write to different fields in the same 64-byte cache line. The cache coherency protocol forces both cores to invalidate their cache copies, causing expensive synchronization.
package main
import (
"fmt"
"sync"
"sync/atomic"
"testing"
"unsafe"
)
// Unaligned: Multiple fields share cache line (FALSE SHARING)
// All four counters occupy offsets 0-31, so they fit inside a single
// 64-byte cache line; concurrent writers on different cores invalidate
// each other's copies on every store.
type UnalignedCounters struct {
	c1 uint64 // Offset 0-7
	c2 uint64 // Offset 8-15 (SAME cache line!)
	c3 uint64 // Offset 16-23
	c4 uint64 // Offset 24-31
}

// Aligned: Each field on separate cache line (NO FALSE SHARING)
// Each counter is followed by 56 bytes of padding (8 + 56 = 64), keeping
// the counters on different cache lines from each other. Total size is
// 256 bytes vs 32 above — the classic space-for-speed trade.
// NOTE(review): true per-line isolation also assumes the struct itself
// starts 64-byte aligned, which Go's allocator does not guarantee.
type CacheLineAligned struct {
	c1 uint64
	_ [56]byte // Padding to fill 64-byte cache line
	c2 uint64
	_ [56]byte // Each counter gets its own cache line
	c3 uint64
	_ [56]byte
	c4 uint64
	_ [56]byte
}
// Macro-level false sharing (pragmatic approach)
// PragmaticAlignment relies on field ordering alone — hot fields grouped
// after the mutex, rarely touched strings at the end — with no padding
// arrays. This is usually sufficient outside extreme hot paths.
type PragmaticAlignment struct {
	// Separate frequently accessed fields
	mut sync.Mutex
	// Hot fields (accessed every operation)
	count uint64
	ts int64
	// Cold fields (accessed occasionally)
	name string
	info string
}
// BenchmarkFalseSharing increments counters spread across a plain
// []UnalignedCounters. Each 32-byte element shares a 64-byte cache line
// with a neighbor, so parallel writers contend on the same lines.
func BenchmarkFalseSharing(b *testing.B) {
	counters := make([]UnalignedCounters, 4)
	b.RunParallel(func(pb *testing.PB) {
		i := 0
		for pb.Next() {
			atomic.AddUint64(&counters[i].c1, 1)
			i = (i + 1) % len(counters)
		}
	})
	// Result on 4-core system: ~100M ops/sec (illustrative figure)
	// Each core ping-pongs the shared cache lines between L1 caches
}
// BenchmarkCacheLineAlignment repeats the workload above, but each
// element of []CacheLineAligned is 256 bytes, so the c1 fields touched
// here live on distinct cache lines and cores do not contend.
func BenchmarkCacheLineAlignment(b *testing.B) {
	counters := make([]CacheLineAligned, 4)
	b.RunParallel(func(pb *testing.PB) {
		i := 0
		for pb.Next() {
			atomic.AddUint64(&counters[i].c1, 1)
			i = (i + 1) % len(counters)
		}
	})
	// Result on 4-core system: ~800M+ ops/sec (illustrative figure)
	// ~8x improvement from eliminating cache line contention
}
// BenchmarkFalseSharingImpact contrasts two goroutines hammering adjacent
// uint64 fields (same cache line) against a padded version (separate
// lines). Note each sub-benchmark performs 2*b.N total increments — one
// goroutine plus the benchmark loop — which is fine for a relative
// comparison but inflates absolute op counts.
func BenchmarkFalseSharingImpact(b *testing.B) {
	// Demonstrate 2-10x slowdown from false sharing
	b.Run("SharedCacheLine", func(b *testing.B) {
		// Two goroutines writing adjacent fields on same cache line
		type Counter struct {
			a uint64 // Offset 0
			b uint64 // Offset 8, same cache line
		}
		counter := &Counter{}
		done := make(chan struct{})
		go func() {
			for i := 0; i < b.N; i++ {
				atomic.AddUint64(&counter.a, 1)
			}
			done <- struct{}{}
		}()
		for i := 0; i < b.N; i++ {
			atomic.AddUint64(&counter.b, 1)
		}
		<-done
		// Result: Slow due to cache line ping-pong between the two cores
	})
	b.Run("SeparateCacheLines", func(b *testing.B) {
		// Same workload, but 56 bytes of padding pushes b onto the next
		// cache line so each writer owns its line exclusively.
		type Counter struct {
			a uint64
			_ [56]byte // Padding to separate cache lines
			b uint64
		}
		counter := &Counter{}
		done := make(chan struct{})
		go func() {
			for i := 0; i < b.N; i++ {
				atomic.AddUint64(&counter.a, 1)
			}
			done <- struct{}{}
		}()
		for i := 0; i < b.N; i++ {
			atomic.AddUint64(&counter.b, 1)
		}
		<-done
		// Result: Much faster, each core works independently
	})
}
// Calculate minimum padding for cache line alignment
const CacheLineSize = 64

// CacheLinePadding returns how many padding bytes must follow a field of
// fieldSize bytes so that the next field begins on a fresh cache line.
// BUG FIX: the original returned CacheLineSize - fieldSize%CacheLineSize,
// which yields 64 (a full wasted line) whenever fieldSize is already a
// multiple of 64; the outer modulo maps that case to 0.
func CacheLinePadding(fieldSize uintptr) uintptr {
	return (CacheLineSize - fieldSize%CacheLineSize) % CacheLineSize
}
// main demonstrates the padding helper for the common 8-byte-field case.
func main() {
	fmt.Printf("Padding needed for 8-byte field: %d bytes\n", CacheLinePadding(8))
	// Output: 56 bytes (56 + 8 = 64)
}
Real Memory Layout Analysis
import (
"fmt"
"unsafe"
)
// TimeSeriesRecord orders fields hot-to-cold with explicit padding; the
// whole struct is 64 bytes (exactly one cache line) on 64-bit.
type TimeSeriesRecord struct {
	// Hot fields: read/write frequently in inner loops
	timestamp int64 // 8 bytes at offset 0
	value float64 // 8 bytes at offset 8
	flags uint8 // 1 byte at offset 16
	_ [7]byte // 7 bytes padding to align next 8-byte field
	// Warm fields: accessed but not in inner loop
	sourceID uint32 // 4 bytes at offset 24
	_ [4]byte // 4 bytes padding
	// Cold fields: accessed rarely
	name string // 16 bytes at offset 32 (string header: pointer + length)
	metadata string // 16 bytes at offset 48
}
// analyzeLayout prints the size, alignment, and every field offset of
// TimeSeriesRecord using the unsafe package.
func analyzeLayout() {
	rec := TimeSeriesRecord{}
	size := unsafe.Sizeof(rec)
	alignof := unsafe.Alignof(rec)
	fmt.Printf("TimeSeriesRecord:\n")
	fmt.Printf(" Total size: %d bytes (should be power of 2 ideally)\n", size)
	fmt.Printf(" Alignment: %d bytes\n", alignof)
	fmt.Printf(" timestamp offset: %d\n", unsafe.Offsetof(rec.timestamp))
	fmt.Printf(" value offset: %d\n", unsafe.Offsetof(rec.value))
	fmt.Printf(" flags offset: %d\n", unsafe.Offsetof(rec.flags))
	fmt.Printf(" sourceID offset: %d\n", unsafe.Offsetof(rec.sourceID))
	fmt.Printf(" name offset: %d\n", unsafe.Offsetof(rec.name))
	fmt.Printf(" metadata offset: %d\n", unsafe.Offsetof(rec.metadata))
	// Output (64-bit):
	// Total size: 64 bytes — one full cache line (the "80" in the
	// original comment did not match this layout)
	// Alignment: 8 bytes
	// timestamp offset: 0
	// value offset: 8
	// flags offset: 16
	// sourceID offset: 24
	// name offset: 32
	// metadata offset: 48
}
Hot/Cold Field Separation
Frequently accessed fields should be grouped together for better cache locality. Fields accessed in inner loops should be at the beginning.
package main
import (
"fmt"
"unsafe"
)
// POOR: Hot and cold fields mixed
// 72 bytes on 64-bit: Count(8)+Name(16)+Active(1)+pad(7)+History(24)+
// Value(8)+CreatedAt(8). Hot fields are scattered across cache lines.
type MixedLayout struct {
	Count int64 // Hot: incremented every operation
	Name string // Cold: set once at init
	Active bool // Hot: checked every operation
	History []string // Cold: appended once per minute
	Value float64 // Hot: used in calculations
	CreatedAt int64 // Cold: never accessed after init
}

// GOOD: Hot fields first, then cold fields
// 112 bytes total: a 64-byte hot region, then Name(16)+CreatedAt(8)+
// History(24). Larger than MixedLayout, but the hot fields stay together.
type OptimizedLayout struct {
	// Cache line 1: Hot fields only (8+8+1+47 = 64 bytes)
	Count int64 // 8 bytes - accessed every operation
	Value float64 // 8 bytes - used in calculations
	Active bool // 1 byte - checked every operation
	_ [47]byte // 47 bytes padding
	// Cache line 2+: Cold fields
	Name string // 16 bytes - set once
	CreatedAt int64 // 8 bytes - never accessed again
	History []string // 24 bytes - appended once per minute
}
// demonstrateHotCold prints both sizes so the space-for-locality trade
// can be seen directly.
func demonstrateHotCold() {
	mixed := MixedLayout{}
	optimized := OptimizedLayout{}
	fmt.Printf("MixedLayout size: %d\n", unsafe.Sizeof(mixed))
	fmt.Printf("OptimizedLayout size: %d\n", unsafe.Sizeof(optimized))
	// MixedLayout: 72 bytes (hot fields spread over two cache lines;
	// the original comment's "88" did not match the layout above)
	// OptimizedLayout: 112 bytes (hot fields isolated on the first line)
	// When hot fields are accessed 1000x per cold field access:
	// - MixedLayout: hot and cold data share lines, polluting the cache
	// - OptimizedLayout: hot fields stay in L1; cold fields fetched rarely
}
// Extreme case: Very hot counter with cold metadata
// HotCounter gives the counter a whole cache line to itself; the cold
// descriptive fields start at offset 64 (assuming the struct itself is
// allocated 64-byte aligned — Go does not guarantee this; TODO confirm
// for your allocator before relying on exact line boundaries).
type HotCounter struct {
	// Cache line 1: Just the hot counter
	count uint64
	_ [56]byte // Padding to fill cache line
	// Remaining cache lines: Cold data
	name string
	timestamp int64
	tags []string
}
// OptimalAccessPattern illustrates that repeatedly touching count leaves
// the cold fields' cache lines undisturbed.
func OptimalAccessPattern() {
	counter := &HotCounter{}
	// Accessing counter.count doesn't evict name/timestamp from cache
	// This matters in tight loops processing millions of items
	_ = counter
}
Array of Structs vs Struct of Arrays
Cache locality is affected by data layout patterns. For processing large datasets, struct of arrays often performs better than array of structs.
package main
import (
"fmt"
"math"
"testing"
)
// Array of Structs: interleaved layout.
// Point is a 3-D coordinate stored as three contiguous float64s (24
// bytes); a []Point interleaves X/Y/Z in memory.
type Point struct {
	X, Y, Z float64
}

// ArrayOfStructs builds n points whose three coordinates all equal the
// point's index.
func ArrayOfStructs(n int) []Point {
	out := make([]Point, 0, n)
	for idx := 0; idx < n; idx++ {
		v := float64(idx)
		out = append(out, Point{X: v, Y: v, Z: v})
	}
	return out
}
// Struct of Arrays: columnar layout.
// Points stores each coordinate axis in its own slice, so scanning a
// single axis touches contiguous memory.
type Points struct {
	X, Y, Z []float64
}

// StructOfArrays builds n entries whose coordinates all equal the index.
func StructOfArrays(n int) Points {
	xs := make([]float64, n)
	ys := make([]float64, n)
	zs := make([]float64, n)
	for idx := range xs {
		v := float64(idx)
		xs[idx], ys[idx], zs[idx] = v, v, v
	}
	return Points{X: xs, Y: ys, Z: zs}
}
// BenchmarkCacheLocality compares array-of-structs vs struct-of-arrays
// traversal over 10,000 points.
func BenchmarkCacheLocality(b *testing.B) {
	b.Run("ArrayOfStructs", func(b *testing.B) {
		points := ArrayOfStructs(10000)
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			sum := 0.0
			for _, p := range points {
				// Each 24-byte Point is copied into p per iteration
				sum += math.Sqrt(p.X*p.X + p.Y*p.Y + p.Z*p.Z)
			}
			_ = sum
		}
		// Result: ~300M ops/sec (illustrative)
		// NOTE(review): this loop reads ALL three fields, so AoS streams
		// memory about as sequentially as SoA here; the large SoA wins
		// appear when only a subset of fields is touched or SIMD is used.
		// The quoted figures are TODO-confirm on real hardware.
	})
	b.Run("StructOfArrays", func(b *testing.B) {
		points := StructOfArrays(10000)
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			sum := 0.0
			for j := 0; j < len(points.X); j++ {
				// Sequential access to each array: a 64-byte cache line
				// holds 8 float64 values
				sum += math.Sqrt(
					points.X[j]*points.X[j] +
						points.Y[j]*points.Y[j] +
						points.Z[j]*points.Z[j])
			}
			_ = sum
		}
		// Result: ~1.2G ops/sec claimed (illustrative; measure with
		// benchstat on your own hardware before trusting the ratio)
	})
}
// Decision matrix:
// Use Array of Structs when:
// - Accessing all fields of a struct frequently (hot path uses all fields)
// - Small number of instances (< 1000)
// - Fields are tightly coupled (e.g., Point X/Y/Z)
// Use Struct of Arrays when:
// - Processing large datasets (> 10,000 elements)
// - Columnar access pattern (process all X, then all Y)
// - Machine learning / SIMD operations (vectors of similar types)
// - Cache locality critical for performance
Field Alignment Tools
import (
	"fmt"
	"reflect"
	"unsafe"
)
// Manual analysis helper.
// analyzeStruct prints the size and alignment of the first example value.
// BUG FIX: the original called unsafe.Sizeof/Alignof on the interface{}
// parameter, which measures the interface header itself (16 bytes on
// 64-bit) regardless of the struct stored inside it. Reflection inspects
// the dynamic type instead, which is what the example intends to show.
func analyzeStruct(name string, examples ...interface{}) {
	if len(examples) == 0 {
		return
	}
	t := reflect.TypeOf(examples[0])
	fmt.Printf("\n%s Analysis:\n", name)
	fmt.Printf(" Size: %d bytes\n", t.Size())
	fmt.Printf(" Alignment: %d bytes\n", t.Align())
}
// main compares a worst-case and best-case ordering of the same fields.
func main() {
	type BadStruct struct {
		a uint8
		b uint64
		c uint8
	}
	type GoodStruct struct {
		b uint64
		a uint8
		c uint8
	}
	analyzeStruct("BadStruct", BadStruct{})
	analyzeStruct("GoodStruct", GoodStruct{})
	// BadStruct: 24 bytes (14 of 24 bytes are padding: ~58% overhead)
	// GoodStruct: 16 bytes (6 of 16 bytes are padding: 37.5% overhead —
	// the original "25%" figure was wrong)
}
Use the golang.org/x/tools/cmd/fieldalignment tool to analyze packages:
go install golang.org/x/tools/cmd/fieldalignment@latest
fieldalignment ./...
fieldalignment -fix ./...
Real-World Optimization Examples
Example 1: Time-Series Database Record
package main
import (
"fmt"
"unsafe"
"time"
)
// BEFORE: Poor alignment — 72 bytes, of which 15 are padding (~21%).
// (The original comment claimed 96 bytes; the layout below sums to 72
// on 64-bit, and 72 is already 8-aligned so there is no trailing pad.)
type TimeSeriesBad struct {
	ID int32 // 4 bytes at offset 0
	// 4 bytes padding (aligns Timestamp to 8)
	Timestamp int64 // 8 bytes at offset 8
	Value float64 // 8 bytes at offset 16
	Labels string // 16 bytes at offset 24
	Active bool // 1 byte at offset 40
	// 7 bytes padding
	Checksum uint32 // 4 bytes at offset 48
	// 4 bytes padding
	Metadata string // 16 bytes at offset 56
	// Total: 72 bytes (15 bytes of padding)
}

// AFTER: Optimized alignment — 48 bytes with 7 bytes padding (~15%).
// (The original "64 bytes with 9 bytes padding" did not match the
// layout below.)
type TimeSeriesGood struct {
	Timestamp int64 // 8 bytes at offset 0 (largest field first)
	Value float64 // 8 bytes at offset 8
	Labels string // 16 bytes at offset 16
	ID int32 // 4 bytes at offset 32
	Checksum uint32 // 4 bytes at offset 36
	Active bool // 1 byte at offset 40
	// 7 bytes padding
	// Metadata in separate cold struct
}

// TimeSeriesCold holds the rarely touched field so the hot record above
// stays small and cache-friendly.
type TimeSeriesCold struct {
	Metadata string // Accessed rarely
}
func demonstrateOptimization() {
bad := TimeSeriesBad{}
good := TimeSeriesGood{}
fmt.Printf("BEFORE: %d bytes (%.0f%% padding overhead)\n",
unsafe.Sizeof(bad),
float64(96-47)/96*100) // 47 bytes of data, 49 padding
fmt.Printf("AFTER: %d bytes (%.0f%% padding overhead)\n",
unsafe.Sizeof(good),
float64(49-40)/49*100) // 40 bytes of data, 9 padding
// Memory saved: 32 bytes (33% reduction)
// For 1 million records: 32 MB reduction in memory usage
// Also: better cache locality, fewer cache line evictions
}Example 2: Request Context Optimization
package main
import (
"net/http"
"unsafe"
)
// POOR: 88 bytes with hot and cold fields mixed.
// Layout: ID(4)+pad(4)+Timestamp(8)+UserAgent(16)+RemoteAddr(16)+
// Method(16)+Path(16)+IsSecure(1)+pad(7) = 88.
type RequestContextPoor struct {
	ID int32 // 4 bytes
	Timestamp int64 // 8 bytes (hot)
	UserAgent string // 16 bytes (cold)
	RemoteAddr string // 16 bytes (cold)
	Method string // 16 bytes (warm)
	Path string // 16 bytes (warm)
	IsSecure bool // 1 byte (hot)
}

// BETTER: hot fields isolated on their own cache line. Note the padding
// makes the struct LARGER overall — 160 bytes — trading memory for
// isolation of the hot first cache line. (The original comment's
// "64 bytes" described only that first line, not the whole struct.)
type RequestContextOptimized struct {
	// Cache line 1: Hot fields (8+4+1+51 = 64 bytes)
	Timestamp int64 // 8 bytes
	ID int32 // 4 bytes
	IsSecure bool // 1 byte
	_ [51]byte // Padding to isolate hot fields
	// Cache line 2: Warm fields (16+16+32 = 64 bytes)
	Method string // 16 bytes
	Path string // 16 bytes
	_ [32]byte // Padding
	// Separate: Cold fields (rarely accessed)
	UserAgent string
	RemoteAddr string
}

// BEST: split hot and cold into separate structs — no padding arrays.
// RequestContextHot is 48 bytes: 8+4+1+pad(3)+16+16.
type RequestContextHot struct {
	Timestamp int64
	ID int32
	IsSecure bool
	Method string
	Path string
}

// RequestContextCold holds rarely accessed request metadata (32 bytes).
type RequestContextCold struct {
	UserAgent string
	RemoteAddr string
}
// demonstrateRequestOptimization prints the two context sizes.
// NOTE(review): this snippet calls fmt, but the import block above
// imports only net/http and unsafe — add "fmt" (and drop the unused
// net/http) for it to compile.
func demonstrateRequestOptimization() {
	poor := RequestContextPoor{}
	optimized := RequestContextOptimized{}
	fmt.Printf("POOR: %d bytes\n", unsafe.Sizeof(poor))
	fmt.Printf("OPTIMIZED: %d bytes (cold data removed)\n", unsafe.Sizeof(optimized))
	// For 100,000 concurrent requests:
	// - POOR: 88 bytes each = 8.8 MB
	// - RequestContextHot: 48 bytes each = 4.8 MB (~45% less)
	//   (the original note said "64 bytes = 6.4 MB (27%)" — TODO confirm
	//   which struct it intended to measure)
}
Atomic Fields Alignment Requirements
64-bit atomic operations require their operand to be 8-byte aligned. On 64-bit systems the compiler always 8-aligns 64-bit fields, so this is only a practical hazard on 32-bit platforms (386, ARM, MIPS), where a uint64 field may land on a 4-byte boundary. The sync/atomic documentation guarantees that the first word in an allocated struct, slice element, or global variable is 64-bit aligned; since Go 1.19 the atomic.Int64/atomic.Uint64 types carry this alignment guarantee on all platforms.
package main
import (
"sync/atomic"
"unsafe"
)
// WRONG: Misaligned atomic field (may panic on 32-bit ARM)
// On 32-bit targets uint64 has only 4-byte alignment, so count can land
// at offset 4 — not 8-aligned — and atomic.AddUint64 will fault. (On
// 64-bit targets the compiler pads count to offset 8 anyway; the hazard
// is 32-bit only.)
type BadCounter struct {
	padding uint8 // 1 byte
	// 3 bytes padding on 32-bit (7 bytes on 64-bit)
	count uint64 // May not be 8-aligned on 32-bit
}

// CORRECT: Ensure atomic fields are 8-aligned
// The first word of an allocated struct is guaranteed 64-bit aligned
// (sync/atomic docs), so putting count first is safe on every platform.
type GoodCounter struct {
	count uint64 // 8 bytes at offset 0 (8-aligned)
	padding uint8 // 1 byte at offset 8
}

// Increment atomically adds one to the counter; always safe because
// count is the first field of an allocated struct.
func (c *GoodCounter) Increment() {
	atomic.AddUint64(&c.count, 1)
}
// main shows the safe usage and prints each count field's offset
// remainder mod 8 (0 means the offset is 8-byte aligned).
// NOTE(review): fmt is used here but missing from the import block
// above — add "fmt" for this snippet to compile.
func main() {
	bad := &BadCounter{}
	good := &GoodCounter{}
	// This may panic on 32-bit systems:
	// atomic.AddUint64(&bad.count, 1)
	// This is always safe:
	atomic.AddUint64(&good.count, 1)
	// Check alignment (offset remainder, not the type's alignment):
	fmt.Printf("BadCounter.count alignment: %d\n", unsafe.Offsetof(bad.count)%8)
	fmt.Printf("GoodCounter.count alignment: %d\n", unsafe.Offsetof(good.count)%8)
}
When Field Alignment Matters: Decision Tree
// Significant impact when ALL of:
// 1. Large arrays of structs (> 1000 instances)
// 2. Hot path accesses the struct (millions of times per second)
// 3. False sharing possible (concurrent goroutines on multiple cores)
// 4. Memory-constrained environment
// 5. Profiling shows struct size or cache misses as bottleneck
// Negligible impact when ANY of:
// 1. Small number of instances (< 100)
// 2. Infrequent access (< 1000 times per second)
// 3. Single goroutine (no false sharing)
// 4. Memory abundant (no pressure)
// 5. I/O-bound code (network/disk latency dominates)
// Optimization priority:
// 1. Reduce allocation count (most impactful: 2-10x)
// 2. Fix false sharing in concurrent code (2-10x)
// 3. Optimize field order (1.2-1.5x)
// 4. Align to cache lines if >100k instances (1.3-2x)
// optimizationDecision returns guidance on whether struct-layout tuning
// is worthwhile, given instance count, hotness, concurrency, and whether
// profiling data exists. Checks run most-disqualifying first, mirroring
// the priority list in the comments above.
func optimizationDecision(allocCount int, isHotPath bool, isConcurrent bool, isProfiled bool) string {
	if !isProfiled {
		return "Profile first before optimizing"
	}
	if allocCount < 100 {
		return "Optimization unlikely to matter"
	}
	if !isHotPath {
		return "Low priority, focus on hot paths"
	}
	if allocCount < 1000 && !isConcurrent {
		return "Optimize field order (modest gains)"
	}
	if allocCount > 100000 && isConcurrent {
		return "Cache line align (significant gains)"
	}
	return "Standard field ordering recommended"
}
Benchmark: Field Alignment Impact
package main
import (
"fmt"
"testing"
"unsafe"
)
// AlignedData orders fields largest-to-smallest: a(8) b(4) c(2) d(1)
// = 15 bytes of data + 1 trailing pad = 16 bytes.
type AlignedData struct {
	a uint64
	b uint32
	c uint16
	d uint8
}

// UnalignedData reverses the order yet still totals 16 bytes here:
// d at 0, c at 2 (1 pad), b at 4, a at 8. Same footprint — what the
// benchmarks below probe is access behavior, not size.
type UnalignedData struct {
	d uint8
	c uint16
	b uint32
	a uint64
}
// BenchmarkAlignedAccess sums three fields of every element of a
// []AlignedData (16-byte elements, four per 64-byte cache line).
func BenchmarkAlignedAccess(b *testing.B) {
	data := make([]AlignedData, 10000)
	for i := range data {
		data[i] = AlignedData{uint64(i), uint32(i), uint16(i), uint8(i)}
	}
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum := uint64(0)
		for j := range data {
			sum += data[j].a + uint64(data[j].b) + uint64(data[j].c)
		}
		_ = sum
	}
	// Result: ~1.2B ops/sec (illustrative)
	// Field layout: a(8), b(4), c(2), d(1) = 15 bytes + 1 padding = 16 bytes
	// Sequential access pattern, excellent cache line utilization
}
// BenchmarkUnalignedAccess runs the identical workload over
// []UnalignedData for comparison.
func BenchmarkUnalignedAccess(b *testing.B) {
	data := make([]UnalignedData, 10000)
	for i := range data {
		data[i] = UnalignedData{uint8(i), uint16(i), uint32(i), uint64(i)}
	}
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum := uint64(0)
		for j := range data {
			sum += data[j].a + uint64(data[j].b) + uint64(data[j].c)
		}
		_ = sum
	}
	// Result: ~1.0B ops/sec claimed (10-15% slower)
	// NOTE(review): both structs are 16 bytes and every field is
	// naturally aligned within each; any measured gap here is small and
	// hardware-dependent — verify with benchstat before citing it.
}
// TestStructSizes prints both struct sizes; they are equal (16 bytes) —
// the benchmarks above probe access behavior, not footprint.
func TestStructSizes(t *testing.T) {
	aligned := AlignedData{}
	unaligned := UnalignedData{}
	fmt.Printf("AlignedData: %d bytes\n", unsafe.Sizeof(aligned))
	fmt.Printf("UnalignedData: %d bytes\n", unsafe.Sizeof(unaligned))
	// Both: 16 bytes (same size)
	// Difference: access pattern and cache behavior
}
Summary
Struct field alignment optimization requires understanding CPU cache architecture (64-byte cache lines), padding rules, and false sharing mechanics. Order fields from largest to smallest alignment requirement to minimize padding and improve cache locality. For large arrays of structs (>10,000 elements) with hot access patterns, optimize field order for 20-40% memory savings and 5-15% performance improvement. False sharing between concurrent goroutines writing adjacent fields on the same cache line causes 2-10x slowdown; fix with cache line padding (56 bytes for most fields) when necessary. Use tools like golang.org/x/tools/cmd/fieldalignment to identify optimization opportunities, but always profile first to confirm the optimization matters for your workload. For most applications, reducing allocation count provides far greater benefits than alignment optimization; reserve alignment tuning for latency-critical, data-intensive code paths confirmed via profiling.