1--[[
2Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15]]
16local ffi = require('ffi')
17local bit = require('bit')
18local has_syscall, S = pcall(require, 'syscall')
19local M = {}
20
-- C declarations registered with the LuaJIT FFI. The declaration string below provides:
--   * `struct bpf`         - static const ints for instruction classes, ld/ldx and
--                            alu/jmp fields, endianness flags, helper flags and the
--                            special ancillary-data offsets
--   * `struct bpf_cmd`     - static const ints numbering the eBPF commands
--   * `struct bpf_func_id` - static const ints numbering the eBPF helper functions
--   * `BPF_MAX_STACK_DEPTH` and `struct bpf_stacktrace`, a fixed-size array of
--     `uint64_t` instruction pointers
-- NOTE(review): the structs have no data members and appear to serve purely as
-- constant namespaces - confirm against the code that consumes them.
ffi.cdef [[
struct bpf {
	/* Instruction classes */
	static const int LD   = 0x00;
	static const int LDX  = 0x01;
	static const int ST   = 0x02;
	static const int STX  = 0x03;
	static const int ALU  = 0x04;
	static const int JMP  = 0x05;
	static const int ALU64 = 0x07;
	/* ld/ldx fields */
	static const int W    = 0x00;
	static const int H    = 0x08;
	static const int B    = 0x10;
	static const int ABS  = 0x20;
	static const int IND  = 0x40;
	static const int MEM  = 0x60;
	static const int LEN  = 0x80;
	static const int MSH  = 0xa0;
	/* alu/jmp fields */
	static const int ADD  = 0x00;
	static const int SUB  = 0x10;
	static const int MUL  = 0x20;
	static const int DIV  = 0x30;
	static const int OR   = 0x40;
	static const int AND  = 0x50;
	static const int LSH  = 0x60;
	static const int RSH  = 0x70;
	static const int NEG  = 0x80;
	static const int MOD  = 0x90;
	static const int XOR  = 0xa0;
	static const int JA   = 0x00;
	static const int JEQ  = 0x10;
	static const int JGT  = 0x20;
	static const int JGE  = 0x30;
	static const int JSET = 0x40;
	static const int K    = 0x00;
	static const int X    = 0x08;
	static const int JNE  = 0x50;	/* jump != */
	static const int JSGT = 0x60;	/* SGT is signed '>', GT in x86 */
	static const int JSGE = 0x70;	/* SGE is signed '>=', GE in x86 */
	static const int CALL = 0x80;	/* function call */
	static const int EXIT = 0x90;	/* function return */
	/* ld/ldx fields */
	static const int DW    = 0x18;	/* double word */
	static const int XADD  = 0xc0;	/* exclusive add */
	/* alu/jmp fields */
	static const int MOV   = 0xb0;	/* mov reg to reg */
	static const int ARSH  = 0xc0;	/* sign extending arithmetic shift right */
	/* change endianness of a register */
	static const int END   = 0xd0;	/* flags for endianness conversion: */
	static const int TO_LE = 0x00;	/* convert to little-endian */
	static const int TO_BE = 0x08;	/* convert to big-endian */
	/* misc */
	static const int PSEUDO_MAP_FD = 0x01;
	/* helper functions */
	static const int F_CURRENT_CPU    = 0xffffffff;
	static const int F_USER_STACK     = 1 << 8;
	static const int F_FAST_STACK_CMP = 1 << 9;
	static const int F_REUSE_STACKID  = 1 << 10;
	/* special offsets for ancillary data */
	static const int NET_OFF          = -0x100000;
	static const int LL_OFF           = -0x200000;
};
/* eBPF commands */
struct bpf_cmd {
	static const int MAP_CREATE       = 0;
	static const int MAP_LOOKUP_ELEM  = 1;
	static const int MAP_UPDATE_ELEM  = 2;
	static const int MAP_DELETE_ELEM  = 3;
	static const int MAP_GET_NEXT_KEY = 4;
	static const int PROG_LOAD        = 5;
	static const int OBJ_PIN          = 6;
	static const int OBJ_GET          = 7;
};
/* eBPF helpers */
struct bpf_func_id {
	static const int unspec               = 0;
	static const int map_lookup_elem      = 1;
	static const int map_update_elem      = 2;
	static const int map_delete_elem      = 3;
	static const int probe_read           = 4;
	static const int ktime_get_ns         = 5;
	static const int trace_printk         = 6;
	static const int get_prandom_u32      = 7;
	static const int get_smp_processor_id = 8;
	static const int skb_store_bytes      = 9;
	static const int l3_csum_replace      = 10;
	static const int l4_csum_replace      = 11;
	static const int tail_call            = 12;
	static const int clone_redirect       = 13;
	static const int get_current_pid_tgid = 14;
	static const int get_current_uid_gid  = 15;
	static const int get_current_comm     = 16;
	static const int get_cgroup_classid   = 17;
	static const int skb_vlan_push        = 18;
	static const int skb_vlan_pop         = 19;
	static const int skb_get_tunnel_key   = 20;
	static const int skb_set_tunnel_key   = 21;
	static const int perf_event_read      = 22;
	static const int redirect             = 23;
	static const int get_route_realm      = 24;
	static const int perf_event_output    = 25;
	static const int skb_load_bytes       = 26;
	static const int get_stackid          = 27;
};
/* BPF_MAP_STACK_TRACE structures and constants */
static const int BPF_MAX_STACK_DEPTH = 127;
struct bpf_stacktrace {
	uint64_t ip[BPF_MAX_STACK_DEPTH];
};
]]
133
-- Compatibility: this module cannot work without ljsyscall and its bpf() wrapper.
-- Two distinct failures are reported:
--   * ljsyscall could not be loaded at all (`S` then holds the error from require)
--   * ljsyscall loaded, but it predates bpf() support
if not has_syscall then
	error("ljsyscall is required but could not be loaded: " .. tostring(S))
elseif not S.bpf then
	error("ljsyscall doesn't support bpf(), must be updated")
end
do
	local strflag = require('syscall.helpers').strflag
	-- Compatibility: ljsyscall<=0.12
	-- Replace the map type table when it lacks LRU_HASH
	if not S.c.BPF_MAP.LRU_HASH then
		S.c.BPF_MAP = strflag {
			UNSPEC           = 0,
			HASH             = 1,
			ARRAY            = 2,
			PROG_ARRAY       = 3,
			PERF_EVENT_ARRAY = 4,
			PERCPU_HASH      = 5,
			PERCPU_ARRAY     = 6,
			STACK_TRACE      = 7,
			CGROUP_ARRAY     = 8,
			LRU_HASH         = 9,
			LRU_PERCPU_HASH  = 10,
			LPM_TRIE         = 11,
			ARRAY_OF_MAPS    = 12,
			HASH_OF_MAPS     = 13,
			DEVMAP           = 14,
			SOCKMAP          = 15,
			CPUMAP           = 16,
		}
	end
	-- Replace the program type table when it lacks TRACEPOINT
	if not S.c.BPF_PROG.TRACEPOINT then
		S.c.BPF_PROG = strflag {
			UNSPEC           = 0,
			SOCKET_FILTER    = 1,
			KPROBE           = 2,
			SCHED_CLS        = 3,
			SCHED_ACT        = 4,
			TRACEPOINT       = 5,
			XDP              = 6,
			PERF_EVENT       = 7,
			CGROUP_SKB       = 8,
			CGROUP_SOCK      = 9,
			LWT_IN           = 10,
			LWT_OUT          = 11,
			LWT_XMIT         = 12,
			SOCK_OPS         = 13,
			SK_SKB           = 14,
			CGROUP_DEVICE    = 15,
			SK_MSG           = 16,
			RAW_TRACEPOINT   = 17,
			CGROUP_SOCK_ADDR = 18,
		}
	end
end
185
-- Compatibility: metatype for stacktrace
-- Stateless iterator step over the `ip` array of a stack trace.
-- Receives the previous index `i` and yields the next index together with
-- `t.ip[index]`; yields nothing once the index reaches `#t` or the entry
-- is not greater than zero.
local function stacktrace_iter(t, i)
	local idx = i + 1
	if idx >= #t then
		return
	end
	local addr = t.ip[idx]
	if addr > 0 then
		return idx, addr
	end
end
do
	local stacktrace_mt = {}

	-- Length is the number of slots in the `ip` array (total size / element size)
	function stacktrace_mt.__len(trace)
		return ffi.sizeof(trace.ip) / ffi.sizeof(trace.ip[0])
	end

	-- ipairs() starts the iterator at -1, so the first index produced is 0
	function stacktrace_mt.__ipairs(trace)
		return stacktrace_iter, trace, -1
	end

	ffi.metatype('struct bpf_stacktrace', stacktrace_mt)
end
197
-- Reflect cdata type
-- Returns the text found between '<' and '>' in the string form of the
-- value's ctype, or nil when `v` is not cdata.
function M.typename(v)
	if type(v) ~= 'cdata' then
		return nil
	end
	local ctype_str = tostring(ffi.typeof(v))
	return string.match(ctype_str, '<([^>]+)')
end
203
-- Reflect if cdata type can be pointer (accepts array or pointer)
-- Decides on the last character of the type name: '*' always counts,
-- ']' counts unless `noarray` is set. Values for which M.typename()
-- yields nothing are passed through as-is (nil).
function M.isptr(v, noarray)
	local name = M.typename(v)
	if not name then
		return name
	end
	local tail = string.sub(name, -1)
	if tail == '*' then
		return true
	end
	return not noarray and tail == ']'
end
213
-- Return true if variable is a non-nil constant that can be used as immediate value
-- e.g. result of KSHORT and KNUM
-- Accepted combinations of `v.const` and `v.type`:
--   * a Lua number constant whose type is not `void`
--   * a cdata constant whose type is `uint64_t` or `int64_t`
--     (Lua numbers are at most 52 bits, so wider values arrive as cdata)
function M.isimmconst(v)
	local kind = type(v.const)
	if kind == 'number' then
		return not ffi.istype(v.type, ffi.typeof('void'))
	end
	if kind == 'cdata' then
		return ffi.istype(v.type, ffi.typeof('uint64_t'))
			or ffi.istype(v.type, ffi.typeof('int64_t'))
	end
	return false
end
221
-- Return the running kernel version encoded as 0xXXYYZZ (major, minor, sublevel).
-- We have no better way to extract current kernel hex-string other
-- than parsing headers, compiling a helper function or reading /proc.
-- `kernel.version` is tried first; it is freeform, so when it carries no
-- 'X.Y.Z' triplet the value of `kernel.osrelease` is used instead.
-- Raises an error when neither sysctl yields a version triplet.
function M.osversion()
	-- Returns the three numeric captures of the first 'X.Y.Z' in the sysctl
	-- value, or nil when the sysctl is unreadable or holds no such triplet.
	-- The dots are escaped: a bare '.' matches any character, which lets
	-- build timestamps such as '15 15:02' pass for a version.
	local function read_triplet(name)
		local value = S.sysctl(name)
		if type(value) ~= 'string' then
			return nil
		end
		return value:match('(%d+)%.(%d+)%.(%d+)')
	end

	local major, minor, sublevel = read_triplet('kernel.version')
	if not major then
		major, minor, sublevel = read_triplet('kernel.osrelease')
	end
	if not major then
		error('cannot determine kernel version from kernel.version or kernel.osrelease')
	end
	-- The sublevel occupies a single byte; the kernel clamps it to 255 in
	-- LINUX_VERSION_CODE, and without clamping it would spill into the minor byte
	sublevel = math.min(tonumber(sublevel), 255)
	return bit.bor(bit.lshift(tonumber(major), 16), bit.lshift(tonumber(minor), 8), sublevel)
end
236
-- Wrap `reader` in an object that can wait for and iterate over sample events.
-- `reader` must expose an `fd` field and a `next` method; `event_type` is an
-- optional C type name (string) describing the binary layout of the payload.
-- The returned table carries `reader` and `type` fields plus these methods:
--   block() - select() on the reader's file descriptor for readability
--   next(k) - advance to the next PERF_RECORD.SAMPLE frame
--   read()  - iterator triple for use in a generic `for`
function M.event_reader(reader, event_type)
	-- Caller can specify event message binary format
	if event_type then
		assert(type(event_type) == 'string' and ffi.typeof(event_type), 'not a valid type for event reader')
		-- From here on keep the pointer-to-type, which is what frames are cast to
		event_type = ffi.typeof(event_type .. '*')
	end

	local methods = {}

	function methods.block(_ --[[self]])
		return S.select { readfds = {reader.fd} }
	end

	function methods.next(_ --[[self]], k)
		local len, ev = reader:next(k)
		-- Skip every frame that is not a sample
		while ev and ev.type ~= S.c.PERF_RECORD.SAMPLE do
			len, ev = reader:next(len)
		end
		if ev and event_type then
			-- Frames are a header followed by variable-length data: step over
			-- the perf_event_header and one uint32_t, then cast the rest to
			-- the requested type
			local payload = ffi.cast('char *', ev) + ffi.sizeof('struct perf_event_header') + ffi.sizeof('uint32_t')
			ev = ffi.cast(event_type, payload)
		end
		return len, ev
	end

	function methods.read(self)
		return self.next, self, nil
	end

	local wrapper = {reader = reader, type = event_type}
	return setmetatable(wrapper, {__index = methods})
end
266
-- Build an anonymous C struct definition string for tracepoint `tp`.
-- Reads /sys/kernel/debug/tracing/events/<tp>/format and joins every
-- 'field:<declaration>;' entry, in file order, into 'struct { ... }'.
-- Raises an error when the format file cannot be opened.
function M.tracepoint_type(tp)
	-- Slurp the whole format description
	local path = '/sys/kernel/debug/tracing/events/'..tp..'/format'
	local handle = assert(io.open(path, 'r'))
	local text = handle:read('*a')
	handle:close()
	-- Collect field declarations, each keeping its terminating ';'
	local decls = {}
	for decl in text:gmatch('field:([^;]+;)') do
		decls[#decls + 1] = decl
	end
	return ('struct { %s }'):format(table.concat(decls))
end
279
280return M
281