1--[[
2Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15]]
16-- LuaJIT to BPF bytecode compiler.
17--
18-- The code generation phase is currently one-pass and produces:
19-- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt)
20-- * Variables with liveness analysis and other meta (spill information, compile-time value)
21--
22-- The code generator optimises as much as possible in single pass:
23-- * Fold compile-time expressions and constant propagation
24-- * Basic control flow analysis with dead code elimination (based on compile-time expressions)
25-- * Single-pass optimistic register allocation
26--
27-- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further
28-- optimisations such as:
29-- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used)
30-- * Common sub-expression elimination (relies on DCE and liveness analysis)
31-- * Orphan JMP elimination (removing this in first pass would break previous JMP targets)
32-- * Better register allocation (needs to be recomputed after optimisations)
33
34local ffi = require('ffi')
35local bit = require('bit')
36local S = require('syscall')
37local bytecode = require('bpf.ljbytecode')
38local cdef = require('bpf.cdef')
39local proto = require('bpf.proto')
40local builtins = require('bpf.builtins')
41
42-- Constants
43local ALWAYS, NEVER = -1, -2
44local BPF = ffi.typeof('struct bpf')
45local HELPER = ffi.typeof('struct bpf_func_id')
46
47-- Symbolic table of constant expressions over numbers
48local const_expr = {
49	ADD = function (a, b) return a + b end,
50	SUB = function (a, b) return a - b end,
51	DIV = function (a, b) return a / b end,
52	MOD = function (a, b) return a % b end,
53	JEQ = function (a, b) return a == b end,
54	JNE = function (a, b) return a ~= b end,
55	JGE = function (a, b) return a >= b end,
56	JGT = function (a, b) return a > b end,
57}
58
59local const_width = {
60	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
61}
62
63-- Built-ins that are strict only (never compile-time expandable)
64local builtins_strict = {
65	[ffi.new] = true,
66	[print]   = true,
67}
68
69-- Deep copy a table
70local function table_copy(t)
71	local copy = {}
72	for n,v in pairs(t) do
73		if type(v) == 'table' then
74			v = table_copy(v)
75		end
76		copy[n] = v
77	end
78	return copy
79end
80
81-- Return true if the constant part is a proxy
82local function is_proxy(x)
83	return type(x) == 'table' and (x.__dissector or x.__map or x.__base)
84end
85
86-- Create compiler closure
87local function create_emitter(env, stackslots, params, param_types)
88
89local V = {}   -- Variable tracking / register allocator
90local code = { -- Generated code
91	pc = 0, bc_pc = 0,
92	insn = ffi.new('struct bpf_insn[4096]'),
93	fixup = {},
94	reachable = true,
95	seen_cmp = nil,
96}
97local Vstate = {} -- Track variable layout at basic block exits
98
99-- Anything below this stack offset is free to use by caller
100-- @note: There is no tracking memory allocator, so the caller may
101-- lower it for persistent objects, but such memory will never
102-- be reclaimed and the caller is responsible for resetting stack
103-- top whenever the memory below is free to be reused
104local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t')
105
106local function emit(op, dst, src, off, imm)
107	local ins = code.insn[code.pc]
108	ins.code = op
109	ins.dst_reg = dst
110	ins.src_reg = src
111	ins.off = off
112	ins.imm = imm
113	code.pc = code.pc + 1
114end
115
116local function reg_spill(var)
117	local vinfo = V[var]
118	assert(vinfo.reg, 'attempt to spill VAR that doesn\'t have an allocated register')
119	vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width)
120	emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0)
121	vinfo.reg = nil
122end
123
124local function reg_fill(var, reg)
125	local vinfo = V[var]
126	assert(reg, 'attempt to fill variable to register but not register is allocated')
127	assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled')
128	emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0)
129	vinfo.reg = reg
130	vinfo.spill = nil
131end
132
133-- Allocate a register (lazy simple allocator)
134local function reg_alloc(var, reg)
135	-- Specific register requested, must spill/move existing variable
136	if reg then
137		for k,v in pairs(V) do -- Spill any variable that has this register
138			if v.reg == reg and not v.shadow then
139				reg_spill(k)
140				break
141			end
142		end
143		return reg
144	end
145	-- Find free or least recently used slot
146	local last, last_seen, used = nil, 0xffff, 0
147	for k,v in pairs(V) do
148		if v.reg then
149			if not v.live_to or v.live_to < last_seen then
150				last, last_seen = k, v.live_to or last_seen
151			end
152			used = bit.bor(used, bit.lshift(1, v.reg))
153		end
154	end
155	-- Attempt to select a free register from R7-R9 (callee saved)
156	local free = bit.bnot(used)
157	if     bit.band(free, 0x80) ~= 0 then reg = 7
158	elseif bit.band(free,0x100) ~= 0 then reg = 8
159	elseif bit.band(free,0x200) ~= 0 then reg = 9
160	end
161	-- Select another variable to be spilled
162	if not reg then
163		assert(last)
164		reg = V[last].reg
165		reg_spill(last)
166	end
167	assert(reg, 'VAR '..var..'fill/spill failed')
168	return reg
169end
170
171-- Set new variable
172local function vset(var, reg, const, vtype)
173	-- Must materialise all variables shadowing this variable slot, as it will be overwritten
174	if V[var] and V[var].reg then
175		for _, vinfo in pairs(V) do
176			-- Shadowing variable MUST share the same type and attributes,
177			-- but the register assignment may have changed
178			if vinfo.shadow == var then
179				vinfo.reg = V[var].reg
180				vinfo.shadow = nil
181			end
182		end
183	end
184	-- Get precise type for CDATA or attempt to narrow numeric constant
185	if not vtype and type(const) == 'cdata' then
186		vtype = ffi.typeof(const)
187	end
188	V[var] = {reg=reg, const=const, type=vtype}
189	-- Track variable source
190	if V[var].const and type(const) == 'table' then
191		V[var].source = V[var].const.source
192	end
193end
194
195-- Materialize (or register) a variable in a register
196-- If the register is nil, then the a new register is assigned (if not already assigned)
197local function vreg(var, reg, reserve, vtype)
198	local vinfo = V[var]
199	assert(vinfo, 'VAR '..var..' not registered')
200	vinfo.live_to = code.pc-1
201	if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end
202	reg = reg_alloc(var, reg)
203	-- Materialize variable shadow copy
204	local src = vinfo
205	while src.shadow do src = V[src.shadow] end
206	if reserve then -- luacheck: ignore
207		-- No load to register occurs
208	elseif src.reg then
209		emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0)
210	elseif src.spill then
211		vinfo.spill = src.spill
212		reg_fill(var, reg)
213	elseif src.const then
214		vtype = vtype or src.type
215		if type(src.const) == 'table' and src.const.__base then
216			-- Load pointer type
217			emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0)
218			emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base)
219		elseif type(src.const) == 'table' and src.const.__dissector then
220			-- Load dissector offset (imm32), but keep the constant part (dissector proxy)
221			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0)
222		elseif vtype and ffi.sizeof(vtype) == 8 then
223			-- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
224			emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const))
225			emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16)))
226			vinfo.const = nil -- The variable is live
227		else
228			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const)
229			vinfo.const = nil -- The variable is live
230		end
231	else assert(false, 'VAR '..var..' has neither register nor constant value') end
232	vinfo.reg = reg
233	vinfo.shadow = nil
234	vinfo.live_from = code.pc-1
235	vinfo.type = vtype or vinfo.type
236	return reg
237end
238
239-- Copy variable
240local function vcopy(dst, src)
241	if dst == src then return end
242	V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type}
243end
244
245-- Dereference variable of pointer type
246local function vderef(dst_reg, src_reg, vinfo)
247	-- Dereference map pointers for primitive types
248	-- BPF doesn't allow pointer arithmetics, so use the entry value
249	assert(type(vinfo.const) == 'table' and vinfo.const.__dissector, 'cannot dereference a non-pointer variable')
250	local vtype = vinfo.const.__dissector
251	local w = ffi.sizeof(vtype)
252	assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes')
253	if dst_reg ~= src_reg then
254		emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0)    -- dst = src
255	end
256	-- Optimize the NULL check away if provably not NULL
257	if not vinfo.source or vinfo.source:find('_or_null', 1, true) then
258		emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0)            -- if (src != NULL)
259	end
260	emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) --     dst = *src;
261end
262
263-- Allocate a space for variable
264local function valloc(size, blank)
265	local base = stack_top
266	assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB')
267	stack_top = stack_top + size
268	-- Align to 8 byte boundary
269	stack_top = math.ceil(stack_top/8)*8
270	-- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK
271	-- so we always need to have memory initialized, remove this when supported
272	if blank then
273		if type(blank) == 'string' then
274			local sp = 0
275			while sp < size do
276				-- TODO: no BPF_ST + BPF_DW instruction yet
277				local as_u32 = ffi.new('uint32_t [1]')
278				local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32))
279				ffi.copy(as_u32, sub, #sub)
280				emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0])
281				sp = sp + ffi.sizeof(as_u32)
282			end
283		elseif type(blank) == 'boolean' then
284			reg_alloc(stackslots, 0)
285			emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
286			for sp = base+8,stack_top,8 do
287				emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0)
288			end
289		else error('NYI: will with unknown type '..type(blank)) end
290	end
291	return stack_top
292end
293
294-- Turn variable into scalar in register (or constant)
295local function vscalar(a, w)
296	assert(const_width[w], 'sizeof(scalar variable) must be 1/2/4/8')
297	local src_reg
298	-- If source is a pointer, we must dereference it first
299	if cdef.isptr(V[a].type) then
300		src_reg = vreg(a)
301		local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register
302		emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0)
303		vderef(tmp_reg, tmp_reg, V[a])
304		src_reg = tmp_reg -- Materialize and dereference it
305	-- Source is a value on stack, we must load it first
306	elseif type(V[a].const) == 'table' and V[a].const.__base > 0 then
307		src_reg = vreg(a)
308		emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0)
309		V[a].type = V[a].const.__dissector
310		V[a].const = nil -- Value is dereferenced
311	-- If source is an imm32 number, avoid register load
312	elseif type(V[a].const) == 'number' and w < 8 then
313		return nil, V[a].const
314	-- Load variable from any other source
315	else
316		src_reg = vreg(a)
317	end
318
319	return src_reg, nil
320end
321
322-- Emit compensation code at the end of basic block to unify variable set layout on all block exits
323-- 1. we need to free registers by spilling
324-- 2. fill registers to match other exits from this BB
325local function bb_end(Vcomp)
326	for i,v in pairs(V) do
327		if Vcomp[i] and Vcomp[i].spill and not v.spill then
328			-- Materialize constant or shadowing variable to be able to spill
329			if not v.reg and (v.shadow or cdef.isimmconst(v)) then
330				vreg(i)
331			end
332			reg_spill(i)
333		end
334	end
335	for i,v in pairs(V) do
336		if Vcomp[i] and Vcomp[i].reg and not v.reg then
337			vreg(i, Vcomp[i].reg)
338		end
339		-- Compensate variable metadata change
340		if Vcomp[i] and Vcomp[i].source then
341			V[i].source = Vcomp[i].source
342		end
343	end
344end
345
346local function CMP_STR(a, b, op)
347	assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=')
348	-- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP
349	-- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ...
350	--     EQ(a,b) <=> X == 0
351	-- This could be optimised by placing early exits by rewriter in second phase for long strings
352	local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type))
353	local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1)
354	local sp = 0
355	emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0)
356	while sp < size do
357		-- Load string chunk as imm32
358		local as_u32 = ffi.new('uint32_t [1]')
359		local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32))
360		ffi.copy(as_u32, sub, #sub)
361		-- TODO: make this faster by interleaved load/compare steps with DW length
362		emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0)
363		emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0])
364		emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0)
365		sp = sp + ffi.sizeof(as_u32)
366	end
367	emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0)
368	code.seen_cmp = code.pc-1
369end
370
371local function CMP_REG(a, b, op)
372	-- Fold compile-time expressions
373	if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
374		code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER
375	else
376		-- Comparison against compile-time string or stack memory
377		if V[b].const and type(V[b].const) == 'string' then
378			return CMP_STR(a, V[b].const, op)
379		end
380		-- The 0xFFFF target here has no significance, it's just a placeholder for
381		-- compiler to replace it's absolute offset to LJ bytecode insn with a relative
382		-- offset in BPF program code, verifier will accept only programs with valid JMP targets
383		local a_reg, b_reg = vreg(a), vreg(b)
384		emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0)
385		code.seen_cmp = code.pc-1
386	end
387end
388
389local function CMP_IMM(a, b, op)
390	local c = V[a].const
391	if c and not is_proxy(c) then -- Fold compile-time expressions
392		code.seen_cmp = const_expr[op](c, b) and ALWAYS or NEVER
393	else
394		-- Convert imm32 to number
395		if type(b) == 'string' then
396			if     #b == 1 then b = b:byte()
397			elseif cdef.isptr(V[a].type) then
398				-- String comparison between stack/constant string
399				return CMP_STR(a, b, op)
400			elseif #b <= 4 then
401				-- Convert to u32 with network byte order
402				local imm = ffi.new('uint32_t[1]')
403				ffi.copy(imm, b, #b)
404				b = builtins.hton(imm[0])
405			else error('NYI: compare register with string, where #string > sizeof(u32)') end
406		end
407		-- The 0xFFFF target here has no significance, it's just a placeholder for
408		-- compiler to replace it's absolute offset to LJ bytecode insn with a relative
409		-- offset in BPF program code, verifier will accept only programs with valid JMP targets
410		local reg = vreg(a)
411		emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b)
412		code.seen_cmp = code.pc-1
413		-- Remember NULL pointer checks as BPF prohibits pointer comparisons
414		-- and repeated checks wouldn't pass the verifier, only comparisons
415		-- against constants are checked.
416		if op == 'JEQ' and tonumber(b) == 0 and V[a].source then
417			local pos = V[a].source:find('_or_null', 1, true)
418			if pos then
419				code.seen_null_guard = a
420			end
421		-- Inverse NULL pointer check (if a ~= nil)
422		elseif op == 'JNE' and tonumber(b) == 0 and V[a].source then
423			local pos = V[a].source:find('_or_null', 1, true)
424			if pos then
425				code.seen_null_guard = a
426				code.seen_null_guard_inverse = true
427			end
428		end
429	end
430end
431
432local function ALU_IMM(dst, a, b, op)
433	-- Fold compile-time expressions
434	if V[a].const and not is_proxy(V[a].const) then
435			assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
436			vset(dst, nil, const_expr[op](V[a].const, b))
437	-- Now we need to materialize dissected value at DST, and add it
438	else
439		vcopy(dst, a)
440		local dst_reg = vreg(dst)
441		if cdef.isptr(V[a].type) then
442			vderef(dst_reg, dst_reg, V[a])
443			V[dst].type = V[a].const.__dissector
444		else
445			V[dst].type = V[a].type
446		end
447		emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b)
448	end
449end
450
451local function ALU_REG(dst, a, b, op)
452	-- Fold compile-time expressions
453	if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
454		assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
455		assert(cdef.isimmconst(V[b]), 'VAR '..b..' must be numeric')
456		if type(op) == 'string' then op = const_expr[op] end
457		vcopy(dst, a)
458		V[dst].const = op(V[a].const, V[b].const)
459	else
460		local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations
461		if b and cdef.isptr(V[b].type) then
462			-- We have to allocate a temporary register for dereferencing to preserve
463			-- pointer in source variable that MUST NOT be altered
464			reg_alloc(stackslots, 2)
465			vderef(2, src_reg, V[b])
466			src_reg = 2
467		end
468		vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B
469		local dst_reg = vreg(dst)
470		if cdef.isptr(V[a].type) then
471			vderef(dst_reg, dst_reg, V[a])
472			V[dst].type = V[a].const.__dissector
473		end
474		emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0)
475		V[stackslots].reg = nil  -- Free temporary registers
476	end
477end
478
479local function ALU_IMM_NV(dst, a, b, op)
480	-- Do DST = IMM(a) op VAR(b) where we can't invert because
481	-- the registers are u64 but immediates are u32, so complement
482	-- arithmetics wouldn't work
483	vset(stackslots+1, nil, a)
484	ALU_REG(dst, stackslots+1, b, op)
485end
486
487local function LD_ABS(dst, w, off)
488	assert(off, 'LD_ABS called without offset')
489	if w < 8 then
490		local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
491		emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off)
492		if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
493			emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8)
494		end
495	elseif w == 8 then
496		-- LD_ABS|IND prohibits DW, we need to do two W loads and combine them
497		local tmp_reg = vreg(stackslots, 0, true, builtins.width_type(w)) -- Reserve R0
498		emit(BPF.LD + BPF.ABS + const_width[4], tmp_reg, 0, 0, off + 4)
499		if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
500			emit(BPF.ALU + BPF.END + BPF.TO_BE, tmp_reg, 0, 0, 32)
501		end
502		ALU_IMM(stackslots, stackslots, 32, 'LSH')
503		local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0, spill tmp variable
504		emit(BPF.LD + BPF.ABS + const_width[4], dst_reg, 0, 0, off)
505		if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
506			emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, 32)
507		end
508		ALU_REG(dst, dst, stackslots, 'OR')
509		V[stackslots].reg = nil -- Free temporary registers
510	else
511		assert(w < 8, 'NYI: only LD_ABS of 1/2/4/8 is supported')
512	end
513end
514
515local function LD_IND(dst, src, w, off)
516	local src_reg = vreg(src) -- Must materialize first in case dst == src
517	local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
518	emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0)
519	if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
520		emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8)
521	end
522end
523
524local function LD_MEM(dst, src, w, off)
525	local src_reg = vreg(src) -- Must materialize first in case dst == src
526	local dst_reg = vreg(dst, nil, true, builtins.width_type(w)) -- Reserve R0
527	emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, off or 0, 0)
528end
529
530-- @note: This is specific now as it expects registers reserved
531local function LD_IMM_X(dst_reg, src_type, imm, w)
532	if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
533		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm))
534		-- Must shift in two steps as bit.lshift supports [0..31]
535		emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.lshift(bit.lshift(imm, 16), 16)))
536	else
537		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm)
538	end
539end
540
541local function BUILTIN(func, ...)
542	local builtin_export = {
543		-- Compiler primitives (work with variable slots, emit instructions)
544		V=V, vreg=vreg, vset=vset, vcopy=vcopy, vderef=vderef, valloc=valloc, emit=emit,
545		reg_alloc=reg_alloc, reg_spill=reg_spill, tmpvar=stackslots, const_width=const_width,
546		-- Extensions and helpers (use with care)
547		LD_IMM_X = LD_IMM_X,
548	}
549	func(builtin_export, ...)
550end
551
552local function LOAD(dst, src, off, vtype)
553	local base = V[src].const
554	assert(base and base.__dissector, 'NYI: load() on variable that doesn\'t have dissector')
555	assert(V[src].source, 'NYI: load() on variable with unknown source')
556	-- Cast to different type if requested
557	vtype = vtype or base.__dissector
558	local w = ffi.sizeof(vtype)
559	assert(const_width[w], 'NYI: load() supports 1/2/4/8 bytes at a time only, wanted ' .. tostring(w))
560	-- Packet access with a dissector (use BPF_LD)
561	if V[src].source:find('ptr_to_pkt', 1, true) then
562		if base.off then -- Absolute address to payload
563			LD_ABS(dst, w, off + base.off)
564		else -- Indirect address to payload
565			LD_IND(dst, src, w, off)
566		end
567	-- Direct access to first argument (skb fields, pt regs, ...)
568	elseif V[src].source:find('ptr_to_ctx', 1, true) then
569		LD_MEM(dst, src, w, off)
570	-- Direct skb access with a dissector (use BPF_MEM)
571	elseif V[src].source:find('ptr_to_skb', 1, true) then
572		LD_MEM(dst, src, w, off)
573	-- Pointer to map-backed memory (use BPF_MEM)
574	elseif V[src].source:find('ptr_to_map_value', 1, true) then
575		LD_MEM(dst, src, w, off)
576	-- Indirect read using probe (uprobe or kprobe, uses helper)
577	elseif V[src].source:find('ptr_to_probe', 1, true) then
578		BUILTIN(builtins[builtins.probe_read], nil, dst, src, vtype, off)
579		V[dst].source = V[src].source -- Builtin handles everything
580	else
581		error('NYI: load() on variable from ' .. V[src].source)
582	end
583	V[dst].type = vtype
584	V[dst].const = nil -- Dissected value is not constant anymore
585end
586
587local function CALL(a, b, d)
588	assert(b-1 <= 1, 'NYI: CALL with >1 return values')
589	-- Perform either compile-time, helper, or builtin
590	local func = V[a].const
591	-- Gather all arguments and check if they're constant
592	local args, const, nargs = {}, true, d - 1
593	for i = a+1, a+d-1 do
594		table.insert(args, V[i].const)
595		if not V[i].const or is_proxy(V[i].const) then const = false end
596	end
597	local builtin = builtins[func]
598	if not const or nargs == 0 then
599		if builtin and type(builtin) == 'function' then
600			args = {a}
601			for i = a+1, a+nargs do table.insert(args, i) end
602			BUILTIN(builtin, unpack(args))
603		elseif V[a+2] and V[a+2].const then -- var OP imm
604			ALU_IMM(a, a+1, V[a+2].const, builtin)
605		elseif nargs <= 2 then              -- var OP var
606			ALU_REG(a, a+1, V[a+2] and a+2, builtin)
607		else
608			error('NYI: CALL non-builtin with 3 or more arguments')
609		end
610	-- Call on dissector implies slice retrieval
611	elseif type(func) == 'table' and func.__dissector then
612		assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments')
613		assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant')
614		local off = V[a+1].const
615		local vtype = builtins.width_type(V[a+2].const - off)
616		-- Access to packet via packet (use BPF_LD)
617		if V[a].source and V[a].source:find('ptr_to_', 1, true) then
618			LOAD(a, a, off, vtype)
619		else
620			error('NYI: <dissector>.slice(a, b) on non-pointer memory ' .. (V[a].source or 'unknown'))
621		end
622	-- Strict builtins cannot be expanded on compile-time
623	elseif builtins_strict[func] and builtin then
624		args = {a}
625		for i = a+1, a+nargs do table.insert(args, i) end
626		BUILTIN(builtin, unpack(args))
627	-- Attempt compile-time call expansion (expects all argument compile-time known)
628	else
629		assert(const, 'NYI: CALL attempted on constant arguments, but at least one argument is not constant')
630		V[a].const = func(unpack(args))
631	end
632end
633
634local function MAP_INIT(map_var, key, imm)
635	local map = V[map_var].const
636	vreg(map_var, 1, true, ffi.typeof('uint64_t'))
637	-- Reserve R1 and load ptr for process-local map fd
638	LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type))
639	V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation
640	-- Reserve R2 and load R2 = key pointer
641	local key_size = ffi.sizeof(map.key_type)
642	local w = const_width[key_size] or BPF.DW
643	local pod_type = const_width[key_size]
644	local sp = stack_top + key_size -- Must use stack below spill slots
645	-- Store immediate value on stack
646	reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable)
647	local key_base = key and V[key].const
648	imm = imm or key_base
649	if imm and (not key or not is_proxy(key_base)) then
650		assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8')
651		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm)
652	-- Key is in register, spill it
653	elseif V[key].reg and pod_type then
654		if cdef.isptr(V[key].type) then
655			-- There is already pointer in register, dereference before spilling
656			emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0)
657			emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0)
658		else -- Variable in register is POD, spill it on the stack
659			emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0)
660		end
661	-- Key is spilled from register to stack
662	elseif V[key].spill then
663		sp = V[key].spill
664	-- Key is already on stack, write to base-relative address
665	elseif key_base.__base then
666		assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type')
667		sp = key_base.__base
668	else
669		error('VAR '..key..' is neither const-expr/register/stack/spilled')
670	end
671	-- If [FP+K] addressing, emit it
672	if sp then
673		emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0)
674		emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp)
675	end
676end
677
678local function MAP_GET(dst, map_var, key, imm)
679	local map = V[map_var].const
680	MAP_INIT(map_var, key, imm)
681	-- Flag as pointer type and associate dissector for map value type
682	vreg(dst, 0, true, ffi.typeof('uint8_t *'))
683	V[dst].const = {__dissector=map.val_type}
684	V[dst].source = 'ptr_to_map_value_or_null'
685	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem)
686	V[stackslots].reg = nil -- Free temporary registers
687end
688
689local function MAP_DEL(map_var, key, key_imm)
690	-- Set R0, R1 (map fd, preempt R0)
691	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
692	MAP_INIT(map_var, key, key_imm)
693	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem)
694	V[stackslots].reg = nil -- Free temporary registers
695end
696
697local function MAP_SET(map_var, key, key_imm, src)
698	local map = V[map_var].const
699	-- Delete when setting nil
700	if V[src].type == ffi.typeof('void') then
701		return MAP_DEL(map_var, key, key_imm)
702	end
703	-- Set R0, R1 (map fd, preempt R0)
704	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
705	MAP_INIT(map_var, key, key_imm)
706	reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable)
707	emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing
708	-- Reserve R3 for value pointer
709	reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable)
710	local val_size = ffi.sizeof(map.val_type)
711	local w = const_width[val_size] or BPF.DW
712	local pod_type = const_width[val_size]
713	-- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value)
714	local sp = stack_top + ffi.sizeof(map.key_type) + val_size
715	sp = sp + (sp % val_size)
716	local base = V[src].const
717	if base and not is_proxy(base) then
718		assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8')
719		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base)
720	-- Value is in register, spill it
721	elseif V[src].reg and pod_type then
722		-- Value is a pointer, derefernce it and spill it
723		if cdef.isptr(V[src].type) then
724			vderef(3, V[src].reg, V[src])
725			emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
726		else
727			emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0)
728		end
729	-- We get a pointer to spilled register on stack
730	elseif V[src].spill then
731		-- If variable is a pointer, we can load it to R3 directly (save "LEA")
732		if cdef.isptr(V[src].type) then
733			reg_fill(src, 3)
734			-- If variable is a stack pointer, we don't have to check it
735			if base.__base then
736				emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
737				return
738			end
739			vderef(3, V[src].reg, V[src])
740			emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
741		else
742			sp = V[src].spill
743		end
744	-- Value is already on stack, write to base-relative address
745	elseif base.__base then
746		if val_size ~= ffi.sizeof(V[src].type) then
747			local err = string.format('VAR %d type (%s) incompatible with BPF map value type (%s): expected %d, got %d',
748				src, V[src].type, map.val_type, val_size, ffi.sizeof(V[src].type))
749			error(err)
750		end
751		sp = base.__base
752	-- Value is constant, materialize it on stack
753	else
754		error('VAR '.. src ..' is neither const-expr/register/stack/spilled')
755	end
756	emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
757	emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp)
758	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
759	V[stackslots].reg = nil -- Free temporary registers
760end
761
762-- Finally - this table translates LuaJIT bytecode into code emitter actions.
763local BC = {
764	-- Constants
765	KNUM = function(a, _, c, _) -- KNUM
766		if c < 2147483648 then
767			vset(a, nil, c, ffi.typeof('int32_t'))
768		else
769			vset(a, nil, c, ffi.typeof('uint64_t'))
770		end
771	end,
772	KSHORT = function(a, _, _, d) -- KSHORT
773		vset(a, nil, d, ffi.typeof('int16_t'))
774	end,
775	KCDATA = function(a, _, c, _) -- KCDATA
776		-- Coerce numeric types if possible
777		local ct = ffi.typeof(c)
778		if ffi.istype(ct, ffi.typeof('uint64_t')) or ffi.istype(ct, ffi.typeof('int64_t')) then
779			vset(a, nil, c, ct)
780		elseif tonumber(c) ~= nil then
781			-- TODO: this should not be possible
782			vset(a, nil, tonumber(c), ct)
783		else
784			error('NYI: cannot use CDATA constant of type ' .. ct)
785		end
786	end,
787	KPRI = function(a, _, _, d) -- KPRI
788		-- KNIL is 0, must create a special type to identify it
789		local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t')
790		vset(a, nil, (d < 2) and 0 or 1, vtype)
791	end,
792	KSTR = function(a, _, c, _) -- KSTR
793		vset(a, nil, c, ffi.typeof('const char[?]'))
794	end,
795	MOV = function(a, _, _, d) -- MOV var, var
796		vcopy(a, d)
797	end,
798
799	-- Comparison ops
800	-- Note: comparisons are always followed by JMP opcode, that
801	--       will fuse following JMP to JMP+CMP instruction in BPF
802	-- Note:  we're narrowed to integers, so operand/operator inversion is legit
803	ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted)
804	ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d)
805	ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d)
806	ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d)
807	ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d)
808	ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c))
809	ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c))
810	ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c)
811	ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c)
812	IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d)
813	ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d)
814	ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c)
815	-- Binary operations with RHS constants
816	ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end,
817	SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end,
818	MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end,
819	DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end,
820	MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end,
821	-- Binary operations with LHS constants
822	-- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative
823	ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV
824	MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV
825	SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV
826	DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV
827	-- Binary operations between registers
828	ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end,
829	SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end,
830	MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end,
831	DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end,
832	MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end,
833	-- Strings
834	CAT = function(a, b, _, d) -- CAT A = B ~ D
835		assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions')
836		assert(type(V[b].const) == 'string' and type(V[d].const) == 'string',
837			'NYI: CAT only works on compile-time strings')
838		vset(a, nil, V[b].const .. V[d].const)
839	end,
840	-- Tables
841	GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c])
842		if env[c] ~= nil then
843			vset(a, nil, env[c])
844		else error(string.format("undefined global '%s'", c)) end
845	end,
846	UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c])
847		if env[c] ~= nil then
848			vset(a, nil, env[c])
849		else error(string.format("undefined upvalue '%s'", c)) end
850	end,
851	TSETB = function (a, b, _, d) -- TSETB (B[D] = A)
852		assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
853		local vinfo = V[b].const
854		if vinfo.__map then -- BPF map read (constant)
855			return MAP_SET(b, nil, d, a) -- D is literal
856		elseif vinfo.__dissector then
857			assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
858			local w = ffi.sizeof(vinfo.__dissector)
859			-- TODO: support vectorized moves larger than register width
860			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
861			local src_reg, const = vscalar(a, w)
862			-- If changing map value, write to absolute address + offset
863			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
864				local dst_reg = vreg(b)
865				-- Optimization: immediate values (imm32) can be stored directly
866				if type(const) == 'number' then
867					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, d, const)
868				else
869					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, d, 0)
870				end
871			-- Table is already on stack, write to vinfo-relative address
872			elseif vinfo.__base then
873				-- Optimization: immediate values (imm32) can be stored directly
874				if type(const) == 'number' then
875					emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -vinfo.__base + (d * w), const)
876				else
877					emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -vinfo.__base + (d * w), 0)
878				end
879			else
880				error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
881			end
882		elseif vinfo and vinfo and V[a].const then
883			vinfo[V[d].const] = V[a].const
884		else
885			error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
886		end
887	end,
888	TSETV = function (a, b, _, d) -- TSETV (B[D] = A)
889		assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
890		local vinfo = V[b].const
891		if vinfo.__map then -- BPF map read (constant)
892			return MAP_SET(b, d, nil, a) -- D is variable
893		elseif vinfo.__dissector then
894			assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
895			local w = ffi.sizeof(vinfo.__dissector)
896			-- TODO: support vectorized moves larger than register width
897			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
898			local src_reg, const = vscalar(a, w)
899			-- If changing map value, write to absolute address + offset
900			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
901				-- Calculate variable address from two registers
902				local tmp_var = stackslots + 1
903				vset(tmp_var, nil, d)
904				ALU_REG(tmp_var, tmp_var, b, 'ADD')
905				local dst_reg = vreg(tmp_var)
906				V[tmp_var].reg = nil -- Only temporary allocation
907				-- Optimization: immediate values (imm32) can be stored directly
908				if type(const) == 'number' and w < 8 then
909					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, 0, const)
910				else
911					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, 0, 0)
912				end
913			-- Table is already on stack, write to vinfo-relative address
914			elseif vinfo.__base then
915				-- Calculate variable address from two registers
916				local tmp_var = stackslots + 1
917				vcopy(tmp_var, d)                       -- Element position
918				if w > 1 then
919					ALU_IMM(tmp_var, tmp_var, w, 'MUL') -- multiply by element size
920				end
921				local dst_reg = vreg(tmp_var)           -- add R10 (stack pointer)
922				emit(BPF.ALU64 + BPF.ADD + BPF.X, dst_reg, 10, 0, 0)
923				V[tmp_var].reg = nil -- Only temporary allocation
924				-- Optimization: immediate values (imm32) can be stored directly
925				if type(const) == 'number' and w < 8 then
926					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, -vinfo.__base, const)
927				else
928					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, -vinfo.__base, 0)
929				end
930			else
931				error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
932			end
933		elseif vinfo and V[d].const and V[a].const then
934			vinfo[V[d].const] = V[a].const
935		else
936			error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
937		end
938	end,
939	TSETS = function (a, b, c, _) -- TSETS (B[C] = A)
940		assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
941		local base = V[b].const
942		if base.__dissector then
943			local ofs,bpos = ffi.offsetof(base.__dissector, c)
944			assert(not bpos, 'NYI: B[C] = A, where C is a bitfield')
945			local w = builtins.sizeofattr(base.__dissector, c)
946			-- TODO: support vectorized moves larger than register width
947			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
948			local src_reg, const = vscalar(a, w)
949			-- If changing map value, write to absolute address + offset
950			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
951				local dst_reg = vreg(b)
952				-- Optimization: immediate values (imm32) can be stored directly
953				if type(const) == 'number' and w < 8 then
954					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, ofs, const)
955				else
956					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, ofs, 0)
957				end
958			-- Table is already on stack, write to base-relative address
959			elseif base.__base then
960				-- Optimization: immediate values (imm32) can be stored directly
961				if type(const) == 'number' and w < 8 then
962					emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -base.__base + ofs, const)
963				else
964					emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0)
965				end
966			else
967				error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
968			end
969		elseif V[a].const then
970			base[c] = V[a].const
971		else
972			error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
973		end
974	end,
975	TGETB = function (a, b, _, d) -- TGETB (A = B[D])
976		local base = V[b].const
977		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
978		if a ~= b then vset(a) end
979		if base.__map then -- BPF map read (constant)
980			MAP_GET(a, b, nil, d)
981		-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
982		elseif V[b].source and V[b].source:find('ptr_to_') then
983			local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
984			LOAD(a, b, d, vtype)
985		-- Specialise PTR[0] as dereference operator
986		elseif cdef.isptr(V[b].type) and d == 0 then
987			vcopy(a, b)
988			local dst_reg = vreg(a)
989			vderef(dst_reg, dst_reg, V[a])
990			V[a].type = V[a].const.__dissector
991		else
992			error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference')
993		end
994	end,
995	TGETV = function (a, b, _, d) -- TGETV (A = B[D])
996		local base = V[b].const
997		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
998		if a ~= b then vset(a) end
999		if base.__map then -- BPF map read
1000			MAP_GET(a, b, d)
1001		-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
1002		elseif V[b].source and V[b].source:find('ptr_to_') then
1003			local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
1004			LOAD(a, b, d, vtype)
1005		-- Constant dereference
1006		elseif type(V[d].const) == 'number' then
1007			V[a].const = base[V[d].const]
1008		else
1009			error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference')
1010		end
1011	end,
1012	TGETS = function (a, b, c, _) -- TGETS (A = B[C])
1013		local base = V[b].const
1014		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
1015		if a ~= b then vset(a) end
1016		if base.__dissector then
1017			local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
1018			-- Resolve table key using metatable
1019			if not ofs and type(base.__dissector[c]) == 'string' then
1020				c = base.__dissector[c]
1021				ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
1022			end
1023			if not ofs and proto[c] then -- Load new dissector on given offset
1024				BUILTIN(proto[c], a, b, c)
1025			else
1026				-- Loading register from offset is a little bit tricky as there are
1027				-- several data sources and value loading modes with different restrictions
1028				-- such as checking pointer values for NULL compared to using stack.
1029				assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists')
1030				if a ~= b then vset(a) end
1031				-- Dissected value is probably not constant anymore
1032				local new_const = nil
1033				local w, atype = builtins.sizeofattr(base.__dissector, c)
1034				-- [SP+K] addressing using R10 (stack pointer)
1035				-- Doesn't need to be checked for NULL
1036				if base.__base and base.__base > 0 then
1037					if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset
1038						new_const = {__base = base.__base-ofs}
1039					else
1040						local dst_reg = vreg(a, nil, true)
1041						emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0)
1042					end
1043				-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
1044				elseif V[b].source and V[b].source:find('ptr_to_') then
1045					LOAD(a, b, ofs, atype)
1046				else
1047					error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
1048				end
1049				-- Bitfield, must be further narrowed with a bitmask/shift
1050				if bpos then
1051					local mask = 0
1052					for i=bpos+1,bpos+bsize do
1053						mask = bit.bor(mask, bit.lshift(1, w*8-i))
1054					end
1055					emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask)
1056					-- Free optimization: single-bit values need just boolean result
1057					if bsize > 1 then
1058						local shift = w*8-bsize-bpos
1059						if shift > 0 then
1060							emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift)
1061						end
1062					end
1063				end
1064				V[a].type = atype
1065				V[a].const = new_const
1066				V[a].source = V[b].source
1067				-- Track direct access to skb data
1068				-- see https://www.kernel.org/doc/Documentation/networking/filter.txt "Direct packet access"
1069				if ffi.istype(base.__dissector, ffi.typeof('struct sk_buff')) then
1070					-- Direct access to skb uses skb->data and skb->data_end
1071					-- which are encoded as u32, but are actually pointers
1072					if c == 'data' or c == 'data_end' then
1073						V[a].const = {__dissector = ffi.typeof('uint8_t')}
1074						V[a].source = 'ptr_to_skb'
1075					end
1076				end
1077			end
1078		else
1079			V[a].const = base[c]
1080		end
1081	end,
1082	-- Loops and branches
1083	CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES)
1084		-- NYI: Support single result only
1085		CALL(a, b, d+2)
1086	end,
1087	CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1)
1088		CALL(a, b, d)
1089	end,
1090	JMP = function (a, _, c, _) -- JMP
1091		-- Discard unused slots after jump
1092		for i, _ in pairs(V) do
1093			if i >= a and i < stackslots then
1094				V[i] = nil
1095			end
1096		end
1097		-- Cross basic block boundary if the jump target isn't provably unreachable
1098		local val = code.fixup[c] or {}
1099		if code.seen_cmp and code.seen_cmp ~= ALWAYS then
1100			if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup
1101				-- Store previous CMP insn for reemitting after compensation code
1102				local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1])
1103				code.pc = code.pc - 1
1104				-- First branch point, emit compensation code
1105				local Vcomp = Vstate[c]
1106				if not Vcomp then
1107					-- Select scratch register (R0-5) that isn't used as operand
1108					-- in the CMP instruction, as the variable may not be live, after
1109					-- the JMP, but it may be used in the JMP+CMP instruction itself
1110					local tmp_reg = 0
1111					for reg = 0, 5 do
1112						if reg ~= jmpi.dst_reg and reg ~= jmpi.src_reg then
1113							tmp_reg = reg
1114							break
1115						end
1116					end
1117					-- Force materialization of constants at the end of BB
1118					for i, v in pairs(V) do
1119						if not v.reg and cdef.isimmconst(v) then
1120							vreg(i, tmp_reg) -- Load to TMP register (not saved)
1121							reg_spill(i) -- Spill caller-saved registers
1122						end
1123					end
1124					-- Record variable state
1125					Vstate[c] = V
1126					Vcomp = V
1127					V = table_copy(V)
1128				-- Variable state already set, emit specific compensation code
1129				else
1130					bb_end(Vcomp)
1131				end
1132				-- Record pointer NULL check from condition
1133				-- If the condition checks pointer variable against NULL,
1134				-- we can assume it will not be NULL in the fall-through block
1135				if code.seen_null_guard then
1136					local var = code.seen_null_guard
1137					-- The null guard can have two forms:
1138					--   if x == nil then goto
1139					--   if x ~= nil then goto
1140					-- First form guarantees that the variable will be non-nil on the following instruction
1141					-- Second form guarantees that the variable will be non-nil at the jump target
1142					local vinfo = code.seen_null_guard_inverse and Vcomp[var] or V[var]
1143					if vinfo.source then
1144						local pos = vinfo.source:find('_or_null', 1, true)
1145						if pos then
1146							vinfo.source = vinfo.source:sub(1, pos - 1)
1147						end
1148					end
1149				end
1150				-- Reemit CMP insn
1151				emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm)
1152				-- Fuse JMP into previous CMP opcode, mark JMP target for fixup
1153				-- as we don't knot the relative offset in generated code yet
1154				table.insert(val, code.pc-1)
1155				code.fixup[c] = val
1156			end
1157			code.seen_cmp = nil
1158			code.seen_null_guard = nil
1159			code.seen_null_guard_inverse = nil
1160		elseif c == code.bc_pc + 1 then -- luacheck: ignore 542
1161			-- Eliminate jumps to next immediate instruction
1162			-- e.g. 0002    JMP      1 => 0003
1163		else
1164			-- We need to synthesise a condition that's always true, however
1165			-- BPF prohibits pointer arithmetic to prevent pointer leaks
1166			-- so we have to clear out one register and use it for cmp that's always true
1167			local dst_reg = reg_alloc(stackslots)
1168			V[stackslots].reg = nil -- Only temporary allocation
1169			-- First branch point, emit compensation code
1170			local Vcomp = Vstate[c]
1171			if not Vcomp then
1172				-- Force materialization of constants at the end of BB
1173				for i, v in pairs(V) do
1174					if not v.reg and cdef.isimmconst(v) then
1175						vreg(i, dst_reg) -- Load to TMP register (not saved)
1176						reg_spill(i) -- Spill caller-saved registers
1177					end
1178				end
1179				-- Record variable state
1180				Vstate[c] = V
1181				V = table_copy(V)
1182			-- Variable state already set, emit specific compensation code
1183			else
1184				bb_end(Vcomp)
1185			end
1186			emit(BPF.ALU64 + BPF.MOV + BPF.K, dst_reg, 0, 0, 0)
1187			emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 0xffff, 0)
1188			table.insert(val, code.pc-1) -- Fixup JMP target
1189			code.reachable = false -- Code following the JMP is not reachable
1190			code.fixup[c] = val
1191		end
1192	end,
1193	RET1 = function (a, _, _, _) -- RET1
1194		-- Free optimisation: spilled variable will not be filled again
1195		for i, v in pairs(V) do
1196			if i ~= a then v.reg = nil end
1197		end
1198		if V[a].reg ~= 0 then vreg(a, 0) end
1199		-- Convenience: dereference pointer variables
1200		-- e.g. 'return map[k]' will return actual map value, not pointer
1201		if cdef.isptr(V[a].type) then
1202			vderef(0, 0, V[a])
1203		end
1204		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
1205		code.reachable = false
1206	end,
1207	RET0 = function (_, _, _, _) -- RET0
1208		emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
1209		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
1210		code.reachable = false
1211	end,
1212	compile = function ()
1213		return code
1214	end
1215}
1216
1217-- Composite instructions
1218function BC.CALLT(a, _, _, d) -- Tailcall: return A(A+1, ..., A+D-1)
1219	CALL(a, 1, d)
1220	BC.RET1(a)
1221end
1222
1223-- Always initialize R6 with R1 context
1224emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0)
1225-- Register R6 as context variable (first argument)
1226if params and params > 0 then
1227	vset(0, 6, param_types[1] or proto.skb)
1228	assert(V[0].source == V[0].const.source) -- Propagate source annotation from typeinfo
1229end
1230-- Register tmpvars
1231vset(stackslots)
1232vset(stackslots+1)
1233return setmetatable(BC, {
1234	__index = function (_, k, _)
1235		if type(k) == 'number' then
1236			local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6)
1237			error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str))
1238		end
1239	end,
1240	__call = function (t, op, a, b, c, d)
1241		code.bc_pc = code.bc_pc + 1
1242		-- Exitting BB straight through, emit compensation code
1243		if Vstate[code.bc_pc] then
1244			if code.reachable then
1245				-- Instruction is reachable from previous line
1246				-- so we must make the variable allocation consistent
1247				-- with the variable allocation at the jump source
1248				-- e.g. 0001 x:R0 = 5
1249				--      0002 if rand() then goto 0005
1250				--      0003 x:R0 -> x:stack
1251				--      0004 y:R0 = 5
1252				--      0005 x:? = 10 <-- x was in R0 before jump, and stack after jump
1253				bb_end(Vstate[code.bc_pc])
1254			else
1255				-- Instruction isn't reachable from previous line, restore variable layout
1256				-- e.g. RET or condition-less JMP on previous line
1257				V = table_copy(Vstate[code.bc_pc])
1258			end
1259		end
1260		-- Perform fixup of jump targets
1261		-- We need to do this because the number of consumed and emitted
1262		-- bytecode instructions is different
1263		local fixup = code.fixup[code.bc_pc]
1264		if fixup ~= nil then
1265			-- Patch JMP source insn with relative offset
1266			for _,pc in ipairs(fixup) do
1267				code.insn[pc].off = code.pc - 1 - pc
1268			end
1269			code.fixup[code.bc_pc] = nil
1270			code.reachable = true
1271		end
1272		-- Execute
1273		if code.reachable then
1274			assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d))
1275			return t[op](a, b, c, d)
1276		end
1277	end,
1278})
1279end
1280
1281-- Emitted code dump
1282local function dump_mem(cls, ins, _, fuse)
1283	-- This is a very dense MEM instruction decoder without much explanation
1284	-- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
1285	local mode = bit.band(ins.code, 0xe0)
1286	if mode == BPF.XADD then cls = 5 end -- The only mode
1287	local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'}
1288	local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'}
1289	local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)]
1290	local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed
1291	local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off)
1292	local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg
1293	if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end
1294	if mode == BPF.ABS then src = string.format('skb[%d]', ins.imm) end
1295	if mode == BPF.IND then src = string.format('skb[R%d%+d]', ins.src_reg, ins.imm) end
1296	return string.format('%s\t%s\t%s', fuse and '' or name, fuse and '' or dst, src)
1297end
1298
1299local function dump_alu(cls, ins, pc)
1300	local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' }
1301	local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'}
1302	local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns',
1303					'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes',
1304					'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid',
1305					'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop',
1306					'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm',
1307					'perf_event_output', 'skb_load_bytes'}
1308	local op = 0
1309	-- This is a very dense ALU instruction decoder without much explanation
1310	-- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
1311	for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end
1312	local name = (cls == 5) and jmp[op] or alu[op]
1313	local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm
1314	local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or ''
1315	if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end
1316	return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target)
1317end
1318
1319local function dump_string(code, off, hide_counter)
1320	if not code then return end
1321	local cls_map = {
1322		[0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem,
1323		[4] = dump_alu, [5] = dump_alu, [7] = dump_alu,
1324	}
1325	local result = {}
1326	local fused = false
1327	for i = off or 0, code.pc - 1 do
1328		local ins = code.insn[i]
1329		local cls = bit.band(ins.code, 0x07)
1330		local line = cls_map[cls](cls, ins, i, fused)
1331		if hide_counter then
1332			table.insert(result, line)
1333		else
1334			table.insert(result, string.format('%04u\t%s', i, line))
1335		end
1336		fused = string.find(line, 'LDDW', 1)
1337	end
1338	return table.concat(result, '\n')
1339end
1340
1341local function dump(code)
1342	if not code then return end
1343	print(string.format('-- BPF %s:0-%u', code.insn, code.pc))
1344	print(dump_string(code))
1345end
1346
1347local function compile(prog, params)
1348	-- Create code emitter sandbox, include caller locals
1349	local env = { pkt=proto.pkt, eth=proto.pkt, BPF=BPF, ffi=ffi }
1350	-- Include upvalues up to 4 nested scopes back
1351	-- the narrower scope overrides broader scope
1352	for k = 5, 2, -1 do
1353		local i = 1
1354		while true do
1355			local ok, n, v = pcall(debug.getlocal, k, i)
1356			if not ok or not n then break end
1357			env[n] = v
1358			i = i + 1
1359		end
1360	end
1361	setmetatable(env, {
1362		__index = function (_, k)
1363			return proto[k] or builtins[k] or _G[k]
1364		end
1365	})
1366	-- Create code emitter and compile LuaJIT bytecode
1367	if type(prog) == 'string' then prog = loadstring(prog) end
1368	-- Create error handler to print traceback
1369	local funci, pc = bytecode.funcinfo(prog), 0
1370	local E = create_emitter(env, funci.stackslots, funci.params, params or {})
1371	local on_err = function (e)
1372			funci = bytecode.funcinfo(prog, pc)
1373			local from, to = 0, 0
1374			for _ = 1, funci.currentline do
1375				from = to
1376				to = string.find(funci.source, '\n', from+1, true) or 0
1377			end
1378			print(funci.loc..':'..string.sub(funci.source, from+1, to-1))
1379			print('error: '..e)
1380			print(debug.traceback())
1381	end
1382	for _,op,a,b,c,d in bytecode.decoder(prog) do
1383		local ok, _, err = xpcall(E,on_err,op,a,b,c,d)
1384		if not ok then
1385			return nil, err
1386		end
1387	end
1388	return E:compile()
1389end
1390
1391-- BPF map interface
1392local bpf_map_mt = {
1393	__gc = function (map) S.close(map.fd) end,
1394	__len = function(map) return map.max_entries end,
1395	__index = function (map, k)
1396		if type(k) == 'string' then
1397			-- Return iterator
1398			if k == 'pairs' then
1399				return function(t, key)
1400					-- Get next key
1401					local next_key = ffi.new(ffi.typeof(t.key))
1402					local cur_key
1403					if key then
1404						cur_key = t.key
1405						t.key[0] = key
1406					else
1407						cur_key = ffi.new(ffi.typeof(t.key))
1408					end
1409					local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key)
1410					if not ok then return nil, err end
1411					-- Get next value
1412					assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val))
1413					return next_key[0], map.val[0]
1414				end, map, nil
1415			-- Read for perf event map
1416			elseif k == 'reader' then
1417				return function (pmap, pid, cpu, event_type)
1418					-- Caller must either specify PID or CPU
1419					if not pid or pid < 0 then
1420						assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs')
1421						pid = -1
1422					end
1423					-- Create BPF output reader
1424					local pe = S.t.perf_event_attr1()
1425					pe[0].type = 'software'
1426					pe[0].config = 'sw_bpf_output'
1427					pe[0].sample_type = 'raw'
1428					pe[0].sample_period = 1
1429					pe[0].wakeup_events = 1
1430					local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1))
1431					if not reader then return nil, tostring(err) end
1432					-- Register event reader fd in BPF map
1433					assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu))
1434					pmap[cpu] = reader.fd
1435					-- Open memory map and start reading
1436					local ok, err = reader:start()
1437					assert(ok, tostring(err))
1438					ok, err = reader:mmap()
1439					assert(ok, tostring(err))
1440					return cdef.event_reader(reader, event_type)
1441				end
1442			-- Signalise this is a map type
1443			end
1444			return k == '__map'
1445		end
1446		-- Retrieve key
1447		map.key[0] = k
1448		local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val)
1449		if not ok then return nil, err end
1450		return ffi.new(map.val_type, map.val[0])
1451	end,
1452	__newindex = function (map, k, v)
1453		map.key[0] = k
1454		if v == nil then
1455			return S.bpf_map_op(map.fd, S.c.BPF_CMD.MAP_DELETE_ELEM, map.key, nil)
1456		end
1457		map.val[0] = v
1458		return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val)
1459	end,
1460}
1461
1462-- Linux tracing interface
1463local function trace_check_enabled(path)
1464	path = path or '/sys/kernel/debug/tracing'
1465	if S.statfs(path) then return true end
1466	return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?'
1467end
1468
1469-- Tracepoint interface
1470local tracepoint_mt = {
1471	__index = {
1472		bpf = function (t, prog)
1473			if type(prog) ~= 'table' then
1474				-- Create protocol parser with source probe
1475				prog = compile(prog, {proto.type(t.type, {source='ptr_to_probe'})})
1476			end
1477			-- Load the BPF program
1478			local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc)
1479			assert(prog_fd, tostring(err)..': '..tostring(log))
1480			-- Open tracepoint and attach
1481			t.reader:setbpf(prog_fd:getfd())
1482			table.insert(t.progs, prog_fd)
1483			return prog_fd
1484		end,
1485	}
1486}
1487-- Open tracepoint
1488local function tracepoint_open(path, pid, cpu, group_fd)
1489	-- Open tracepoint and compile tracepoint type
1490	local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path))
1491	local tp_type = assert(cdef.tracepoint_type(path))
1492	-- Open tracepoint reader and create interface
1493	local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd))
1494	return setmetatable({tp=tp,type=tp_type,reader=reader,progs={}}, tracepoint_mt)
1495end
1496
1497local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd)
1498	-- Load BPF program
1499	if type(prog) ~= 'table' then
1500		prog = compile(prog, {proto.pt_regs})
1501	end
1502	local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc)
1503	assert(prog_fd, tostring(err)..': '..tostring(log))
1504	-- Open tracepoint and attach
1505	local tp, err = S.perf_probe(ptype, pname, pdef, retprobe)
1506	if not tp then
1507		prog_fd:close()
1508		return nil, tostring(err)
1509	end
1510	local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'})
1511	if not reader then
1512		prog_fd:close()
1513		S.perf_probe(ptype, pname, false)
1514		return nil, tostring(err)
1515	end
1516	local ok, err = reader:setbpf(prog_fd:getfd())
1517	if not ok then
1518		prog_fd:close()
1519		reader:close()
1520		S.perf_probe(ptype, pname, false)
1521		return nil, tostring(err)..' (kernel version should be at least 4.1)'
1522	end
1523	-- Create GC closure for reader to close BPF program
1524	-- and detach probe in correct order
1525	ffi.gc(reader, function ()
1526		prog_fd:close()
1527		reader:close()
1528		S.perf_probe(ptype, pname, false)
1529	end)
1530	return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype}
1531end
1532
1533-- Module interface
1534return setmetatable({
1535	new = create_emitter,
1536	dump = dump,
1537	dump_string = dump_string,
1538	maps = {},
1539	map = function (type, max_entries, key_ctype, val_ctype)
1540		if not key_ctype then key_ctype = ffi.typeof('uint32_t') end
1541		if not val_ctype then val_ctype = ffi.typeof('uint32_t') end
1542		if not max_entries then max_entries = 4096 end
1543		-- Special case for BPF_MAP_STACK_TRACE
1544		if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then
1545			key_ctype = ffi.typeof('int32_t')
1546			val_ctype = ffi.typeof('struct bpf_stacktrace')
1547		end
1548		local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries)
1549		if not fd then return nil, tostring(err) end
1550		local map = setmetatable({
1551			max_entries = max_entries,
1552			key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
1553			val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
1554			map_type = S.c.BPF_MAP[type],
1555			key_type = key_ctype,
1556			val_type = val_ctype,
1557			fd = fd:nogc():getfd(),
1558		}, bpf_map_mt)
1559		return map
1560	end,
1561	socket = function (sock, prog)
1562		-- Expect socket type, if sock is string then assume it's
1563		-- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket
1564		local ok, err
1565		if type(sock) == 'string' then
1566			local iface = assert(S.nl.getlink())[sock]
1567			assert(iface, sock..' is not interface name')
1568			sock, err = S.socket('packet', 'raw')
1569			assert(sock, tostring(err))
1570			ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index}))
1571			assert(ok, tostring(err))
1572		elseif type(sock) == 'number' then
1573			sock = S.t.fd(sock):nogc()
1574		elseif ffi.istype(S.t.fd, sock) then -- luacheck: ignore
1575			-- No cast required
1576		else
1577			return nil, 'socket must either be an fd number, an interface name, or an ljsyscall socket'
1578		end
1579		-- Load program and attach it to socket
1580		if type(prog) ~= 'table' then
1581			prog = compile(prog, {proto.skb})
1582		end
1583		local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc)
1584		assert(prog_fd, tostring(err)..': '..tostring(log))
1585		assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd()))
1586		return prog_fd, err
1587	end,
1588	tracepoint = function(tp, prog, pid, cpu, group_fd)
1589		assert(trace_check_enabled())
1590		-- Return tracepoint instance if no program specified
1591		-- this allows free specialisation of arg0 to tracepoint type
1592		local probe = tracepoint_open(tp, pid, cpu, group_fd)
1593		-- Load the BPF program
1594		if prog then
1595			probe:bpf(prog)
1596		end
1597		return probe
1598	end,
1599	kprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
1600		assert(trace_check_enabled())
1601		-- Open tracepoint and attach
1602		local pname, pdef = tp:match('([^:]+):(.+)')
1603		return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
1604	end,
1605	uprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
1606		assert(trace_check_enabled())
1607		-- Translate symbol to address
1608		local obj, sym_want = tp:match('([^:]+):(.+)')
1609		if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end
1610		-- Resolve Elf object (no support for anything else)
1611		local elf = require('bpf.elf').open(obj)
1612		local sym = elf:resolve(sym_want)
1613		if not sym then return nil, 'no such symbol' end
1614		sym = sym.st_value - elf:loadaddr()
1615		local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)),
1616		                                         tonumber(ffi.cast('uint32_t', sym)))
1617		-- Convert it to expected uprobe format
1618		local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr)
1619		local pdef = obj..':0x'..sym_addr
1620		return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
1621	end,
1622	tracelog = function(path)
1623		assert(trace_check_enabled())
1624		path = path or '/sys/kernel/debug/tracing/trace_pipe'
1625		return io.open(path, 'r')
1626	end,
1627	ntoh = builtins.ntoh, hton = builtins.hton,
1628}, {
1629	__call = function (_, prog) return compile(prog) end,
1630})
1631