1--[[
2Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15]]
16local ffi = require('ffi')
17local BPF = ffi.typeof('struct bpf')
18
-- C declarations shared by the dissectors in this module.
-- `struct sk_buff` is the layout behind the exported `skb` type; the packed
-- structs that follow describe protocol headers whose sizes and field offsets
-- are queried with ffi.sizeof / ffi.offsetof further down in this file.
-- The text between the long brackets is handed verbatim to the FFI C parser.
ffi.cdef [[
struct sk_buff {
	uint32_t len;
	uint32_t pkt_type;
	uint32_t mark;
	uint32_t queue_mapping;
	uint32_t protocol;
	uint32_t vlan_present;
	uint32_t vlan_tci;
	uint32_t vlan_proto;
	uint32_t priority;
	uint32_t ingress_ifindex;
	uint32_t ifindex;
	uint32_t tc_index;
	uint32_t cb[5];
	uint32_t hash;
	uint32_t tc_classid;
	uint32_t data;
	uint32_t data_end;
	uint32_t napi_id;

	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
	uint32_t family;
	uint32_t remote_ip4;	/* Stored in network byte order */
	uint32_t local_ip4;	/* Stored in network byte order */
	uint32_t remote_ip6[4];	/* Stored in network byte order */
	uint32_t local_ip6[4];	/* Stored in network byte order */
	uint32_t remote_port;	/* Stored in network byte order */
	uint32_t local_port;	/* stored in host byte order */
	/* ... here. */

	uint32_t data_meta;
};

struct net_off_t {
	uint8_t  ver:4;
} __attribute__((packed));

struct eth_t {
	uint8_t  dst[6];
	uint8_t  src[6];
	uint16_t type;
} __attribute__((packed));

struct dot1q_t {
	uint16_t pri:3;
	uint16_t cfi:1;
	uint16_t vlanid:12;
	uint16_t type;
} __attribute__((packed));

struct arp_t {
	uint16_t htype;
	uint16_t ptype;
	uint8_t  hlen;
	uint8_t  plen;
	uint16_t oper;
	uint8_t  sha[6];
	uint32_t spa;
	uint8_t  tha[6];
	uint32_t tpa;
} __attribute__((packed));

struct ip_t {
	uint8_t  ver:4;
	uint8_t  hlen:4;
	uint8_t  tos;
	uint16_t tlen;
	uint16_t identification;
	uint16_t ffo_unused:1;
	uint16_t df:1;
	uint16_t mf:1;
	uint16_t foffset:13;
	uint8_t  ttl;
	uint8_t  proto;
	uint16_t hchecksum;
	uint32_t src;
	uint32_t dst;
} __attribute__((packed));

struct icmp_t {
	uint8_t  type;
	uint8_t  code;
	uint16_t checksum;
} __attribute__((packed));

struct ip6_t {
	uint32_t ver:4;
	uint32_t priority:8;
	uint32_t flow_label:20;
	uint16_t payload_len;
	uint8_t  next_header;
	uint8_t  hop_limit;
	uint64_t src_hi;
	uint64_t src_lo;
	uint64_t dst_hi;
	uint64_t dst_lo;
} __attribute__((packed));

struct ip6_opt_t {
	uint8_t  next_header;
	uint8_t  ext_len;
	uint8_t  pad[6];
} __attribute__((packed));

struct icmp6_t {
	uint8_t  type;
	uint8_t  code;
	uint16_t checksum;
} __attribute__((packed));

struct udp_t {
	uint16_t src_port;
	uint16_t dst_port;
	uint16_t length;
	uint16_t crc;
} __attribute__((packed));

struct tcp_t {
	uint16_t src_port;
	uint16_t dst_port;
	uint32_t seq_num;
	uint32_t ack_num;
	uint8_t  offset:4;
	uint8_t  reserved:4;
	uint8_t  flag_cwr:1;
	uint8_t  flag_ece:1;
	uint8_t  flag_urg:1;
	uint8_t  flag_ack:1;
	uint8_t  flag_psh:1;
	uint8_t  flag_rst:1;
	uint8_t  flag_syn:1;
	uint8_t  flag_fin:1;
	uint16_t rcv_wnd;
	uint16_t cksum;
	uint16_t urg_ptr;
} __attribute__((packed));

struct vxlan_t {
	uint32_t rsv1:4;
	uint32_t iflag:1;
	uint32_t rsv2:3;
	uint32_t rsv3:24;
	uint32_t key:24;
	uint32_t rsv4:8;
} __attribute__((packed));
]]
166
167
-- Architecture-specific ptrace register layout
-- `parm_to_reg` maps symbolic names (parm1..parm5, ret, fp) to field names of
-- `struct pt_regs`. Only 'x64' is populated here; on any other architecture
-- the struct is declared empty and the map stays empty, so every symbolic
-- register lookup fails.
local S = require('syscall')
local arch = S.abi.arch
local parm_to_reg = {}
if arch == 'x64' then
	ffi.cdef [[
	struct pt_regs {
		unsigned long r15;
		unsigned long r14;
		unsigned long r13;
		unsigned long r12;
		unsigned long bp;
		unsigned long bx;
		unsigned long r11;
		unsigned long r10;
		unsigned long r9;
		unsigned long r8;
		unsigned long ax;
		unsigned long cx;
		unsigned long dx;
		unsigned long si;
		unsigned long di;
		unsigned long orig_ax;
		unsigned long ip;
		unsigned long cs;
		unsigned long flags;
		unsigned long sp;
		unsigned long ss;
	};]]
	-- NOTE(review): presumably follows the x86-64 function-call convention
	-- (arguments in di, si, dx, cx, r8) — confirm against the kernel headers.
	parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
else
	-- Placeholder so that 'struct pt_regs' is always a declared type
	ffi.cdef 'struct pt_regs {};'
end
-- Resolve symbolic register names (parm1, ret, fp, ...) through the
-- architecture map; an unknown name raises 'no such register: <name>'.
ffi.metatype('struct pt_regs', {
		__index = function (_ --[[self]], name)
			local reg = parm_to_reg[name]
			return assert(reg, 'no such register: '..name)
		end,
})
207
208local M = {}
209
-- Dissector interface
-- Derive variable `dst` from `src` as the protocol layer named by `field`.
--   type  : value stored as the new variable's `__dissector`
--   e     : emitter/state object; uses `e.V` and `e.vcopy`
--   dst   : name of the variable to create
--   src   : name of the parent variable
--   field : key looked up on the parent's `__dissector` to find the handler
--           that advances the offset to the child layer
local function dissector(type, e, dst, src, field)
	-- Keep a handle on the parent descriptor before anything is copied
	local parent = e.V[src].const
	-- Create new dissector variable
	e.vcopy(dst, src)
	-- Start the child from the parent's offset and dissector
	local origin_off = e.V[src].const.off
	local origin_dissector = e.V[src].const.__dissector
	e.V[dst].const = {off = origin_off, __dissector = origin_dissector}
	-- Let the parent's handler compute the child's offset
	local advance = parent.__dissector[field]
	advance(e, dst)
	-- From here on the variable dissects as the child type
	e.V[dst].const.__dissector = type
end
220M.dissector = dissector
221
-- Get current effective offset, load field value at an offset relative to it and
-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
--   e     : emitter/state object; uses e.V, e.vreg, e.emit, e.const_width, e.tmpvar
--   var   : name of the dissector variable to advance
--   type  : ctype of the length field (its size selects the load width)
--   off   : byte offset of the length field relative to the current offset
--           (optional, defaults to 0)
--   mask  : optional mask ANDed with the loaded value
--   shift : optional shift count; positive shifts left, negative shifts right
-- After the call the variable's offset is no longer a compile-time constant
-- (`const.off` is cleared) and lives in the variable's register instead.
local function next_offset(e, var, type, off, mask, shift)
	-- Apply the default up front: `d.off + off or 0` would parse as
	-- `(d.off + off) or 0` and raise on a nil `off`.
	off = off or 0
	local d = e.V[var].const
	local width = e.const_width[ffi.sizeof(type)]
	-- Materialize relative offset value in R0
	local dst_reg, tmp_reg
	if d.off then
		-- Constant base offset: load from an absolute position
		dst_reg = e.vreg(var, 0, true)
		tmp_reg = dst_reg -- Use target register to avoid copy
		e.emit(BPF.LD + BPF.ABS + width, tmp_reg, 0, 0, d.off + off)
	else
		-- Base offset already lives in a register: load indirectly through it
		tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
		dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
		e.emit(BPF.LD + BPF.IND + width, tmp_reg, dst_reg, 0, off)
	end
	-- Finalize relative offset
	if mask then
		e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
	end
	if shift and shift ~= 0 then
		local op = BPF.LSH
		if shift < 0 then
			op = BPF.RSH
			shift = -shift
		end
		e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
	end
	-- Add to base offset to turn it into effective address
	if dst_reg ~= tmp_reg then
		e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
	else
		e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
	end
	-- Discard temporary allocations
	d.off = nil
	e.V[e.tmpvar].reg = nil
end
259
-- Advance dissector variable `var` by a fixed number of bytes `off`.
-- While the offset is still a known constant it is simply bumped; once it
-- lives in a register an ADD instruction is emitted instead.
local function next_skip(e, var, off)
	local desc = e.V[var].const
	if desc.off then
		desc.off = desc.off + off
		return
	end
	local reg = e.vreg(var)
	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, off)
end
269
-- Move the constant offset of `dst` past the fixed-size Ethernet header,
-- which is where the IP header begins.
local function skip_eth(e, dst)
	local desc = e.V[dst].const
	local eth_len = ffi.sizeof('struct eth_t')
	desc.off = desc.off + eth_len
end
275
-- Export types
-- Build a type descriptor: tag table `t` (or a fresh table when omitted)
-- with the ctype resolved from `typestr` and return it.
function M.type(typestr, t)
	local desc = t or {}
	desc.__dissector = ffi.typeof(typestr)
	return desc
end
-- Root descriptors: entry points a program starts dissecting from
M.skb = M.type('struct sk_buff', {source = 'ptr_to_ctx'})
M.pt_regs = M.type('struct pt_regs', {source = 'ptr_to_probe'})
-- skb needs special accessors
M.pkt = M.type('struct eth_t', {off = 0, source = 'ptr_to_pkt'})

-- Layer constructors: each one forwards its arguments to `dissector`,
-- prepending the ctype of the layer being entered.
do
	local function layer(ctype)
		return function (...)
			return dissector(ffi.typeof(ctype), ...)
		end
	end
	-- M.eth     = layer('struct eth_t')
	M.dot1q   = layer('struct dot1q_t')
	M.arp     = layer('struct arp_t')
	M.icmp    = layer('struct icmp_t')
	M.ip      = layer('struct ip_t')
	M.icmp6   = layer('struct icmp6_t')
	M.ip6     = layer('struct ip6_t')
	M.ip6_opt = layer('struct ip6_opt_t')
	M.udp     = layer('struct udp_t')
	M.tcp     = layer('struct tcp_t')
	M.vxlan   = layer('struct vxlan_t')
	M.data    = layer('uint8_t')
	M.net_off = layer('struct net_off_t')
end
298
-- Metatables
-- Ethernet: `ip`/`ip6` step over the fixed Ethernet header, `net_off`
-- advances by BPF.NET_OFF instead.
do
	local function to_net_off(e, dst)
		next_skip(e, dst, BPF.NET_OFF)
	end
	local eth_next = {
		ip = skip_eth,
		ip6 = skip_eth,
		net_off = to_net_off,
	}
	ffi.metatype(ffi.typeof('struct eth_t'), {__index = eth_next})
end
309
-- Network-offset pseudo layer: entering `ip`/`ip6` from it needs no
-- further offset adjustment, so both handlers do nothing.
do
	local function stay() end
	ffi.metatype(ffi.typeof('struct net_off_t'), {
		__index = {ip = stay, ip6 = stay},
	})
end
316
-- IPv4: every supported payload starts right after the variable-length header.
do
	-- Skip IP header length (stored as number of words)
	-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
	-- Mask first nibble and shift by 2 (multiplication by 4)
	local function skip_ip_header(e, dst)
		next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2)
	end
	ffi.metatype(ffi.typeof('struct ip_t'), {
		__index = {
			icmp = skip_ip_header,
			udp  = skip_ip_header,
			tcp  = skip_ip_header,
		}
	})
end
327
-- IPv6: every supported next layer starts right after the fixed header.
do
	-- Skip fixed IPv6 header length (40 bytes)
	-- The caller must check the value of `next_header` to skip any extension headers
	local function skip_ip6_header(e, dst)
		next_skip(e, dst, ffi.sizeof('struct ip6_t'))
	end
	ffi.metatype(ffi.typeof('struct ip6_t'), {
		__index = {
			icmp6   = skip_ip6_header,
			udp     = skip_ip6_header,
			tcp     = skip_ip6_header,
			ip6_opt = skip_ip6_header,
		}
	})
end
338
local ip6_opt_ext_len_off = ffi.offsetof('struct ip6_opt_t', 'ext_len')
-- IPv6 extension header: step over the whole header to reach the next layer.
do
	-- The `ext_len` field counts 8-octet units and does not include the first
	-- 8 octets of the header (RFC 8200), so the distance to the next header is
	-- (ext_len << 3) + 8 rather than the raw field value.
	-- NOTE(review): the Fragment and Authentication headers encode their length
	-- differently; callers dissecting those need separate handling — confirm.
	local function skip_ext_header(e, dst)
		-- Variable part: ext_len * 8 (no mask, shift left by 3)
		next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off, nil, 3)
		-- Fixed part: the first 8 octets that `ext_len` does not count
		next_skip(e, dst, ffi.sizeof('struct ip6_opt_t'))
	end
	ffi.metatype(ffi.typeof('struct ip6_opt_t'), {
		__index = {
			icmp6   = skip_ext_header,
			udp     = skip_ext_header,
			tcp     = skip_ext_header,
			ip6_opt = skip_ext_header,
		}
	})
end
349
-- TCP: the payload follows the variable-length header.
do
	-- Skip TCP header length (stored as number of words)
	-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
	-- The mask keeps the upper nibble of the byte and the negative shift
	-- moves it right by 2 bits.
	local function skip_tcp_header(e, dst)
		local u8 = ffi.typeof('uint8_t')
		local len_at = ffi.offsetof('struct tcp_t', 'offset')
		next_offset(e, dst, u8, len_at, 0xf0, -2)
	end
	ffi.metatype(ffi.typeof('struct tcp_t'), {
		__index = {data = skip_tcp_header}
	})
end
359
-- UDP: the payload follows the fixed-size header.
do
	-- Skip UDP header length (8 octets)
	local function skip_udp_header(e, dst)
		next_skip(e, dst, ffi.sizeof('struct udp_t'))
	end
	ffi.metatype(ffi.typeof('struct udp_t'), {
		__index = {data = skip_udp_header}
	})
end
368
-- Constants
-- Symbolic protocol constants exported as `M.c`, grouped per layer:
-- `M.c.eth` holds EtherType values, `M.c.ip` holds reserved addresses,
-- type-of-service bits, fragmentation flags, TTL values and protocol numbers.
-- NOTE(review): protocol number 45 has no entry in the list below.
M.c = {
	eth = { -- Constants http://standards.ieee.org/regauth/ethertype
		ip     = 0x0800, -- IP (v4) protocol
		ip6    = 0x86dd, -- IP (v6) protocol
		arp    = 0x0806, -- Address resolution protocol
		revarp = 0x8035, -- Reverse addr resolution protocol
		vlan   = 0x8100, -- IEEE 802.1Q VLAN tagging
	},
	ip = {
		-- Reserved Addresses
		addr_any         = 0x00000000, -- 0.0.0.0
		addr_broadcast   = 0xffffffff, -- 255.255.255.255
		addr_loopback    = 0x7f000001, -- 127.0.0.1
		addr_mcast_all   = 0xe0000001, -- 224.0.0.1
		addr_mcast_local = 0xe00000ff, -- 224.0.0.255
		-- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
		tos_default      = 0x00, -- default
		tos_lowdelay     = 0x10, -- low delay
		tos_throughput   = 0x08, -- high throughput
		tos_reliability  = 0x04, -- high reliability
		tos_lowcost      = 0x02, -- low monetary cost - XXX
		tos_ect          = 0x02, -- ECN-capable transport
		tos_ce           = 0x01, -- congestion experienced
		-- Fragmentation flags (ip_off)
		rf = 0x8000, -- reserved
		df = 0x4000, -- don't fragment
		mf = 0x2000, -- more fragments (not last frag)
		offmask  = 0x1fff, -- mask for fragment offset
		-- Time-to-live (ip_ttl), seconds
		ttl_default = 64,  -- default ttl, RFC 1122, RFC 1340
		ttl_max     = 255, -- maximum ttl
		-- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
		proto_ip      = 0,  -- dummy for IP
		proto_hopopts = 0,  -- IPv6 hop-by-hop options
		proto_icmp    = 1,  -- ICMP
		proto_igmp    = 2,  -- IGMP
		proto_ggp     = 3,  -- gateway-gateway protocol
		proto_ipip    = 4,  -- IP in IP
		proto_st      = 5,  -- ST datagram mode
		proto_tcp     = 6,  -- TCP
		proto_cbt     = 7,  -- CBT
		proto_egp     = 8,  -- exterior gateway protocol
		proto_igp     = 9,  -- interior gateway protocol
		proto_bbnrcc  = 10,  -- BBN RCC monitoring
		proto_nvp     = 11,  -- Network Voice Protocol
		proto_pup     = 12,  -- PARC universal packet
		proto_argus   = 13,  -- ARGUS
		proto_emcon   = 14,  -- EMCON
		proto_xnet    = 15,  -- Cross Net Debugger
		proto_chaos   = 16,  -- Chaos
		proto_udp     = 17,  -- UDP
		proto_mux     = 18,  -- multiplexing
		proto_dcnmeas = 19,  -- DCN measurement
		proto_hmp     = 20,  -- Host Monitoring Protocol
		proto_prm     = 21,  -- Packet Radio Measurement
		proto_idp     = 22,  -- Xerox NS IDP
		proto_trunk1  = 23,  -- Trunk-1
		proto_trunk2  = 24,  -- Trunk-2
		proto_leaf1   = 25,  -- Leaf-1
		proto_leaf2   = 26,  -- Leaf-2
		proto_rdp     = 27,  -- "Reliable Datagram" proto
		proto_irtp    = 28,  -- Inet Reliable Transaction
		proto_tp      = 29,  -- ISO TP class 4
		proto_netblt  = 30,  -- Bulk Data Transfer
		proto_mfpnsp  = 31,  -- MFE Network Services
		proto_meritinp= 32,  -- Merit Internodal Protocol
		proto_sep     = 33,  -- Sequential Exchange proto
		proto_3pc     = 34,  -- Third Party Connect proto
		proto_idpr    = 35,  -- Interdomain Policy Route
		proto_xtp     = 36,  -- Xpress Transfer Protocol
		proto_ddp     = 37,  -- Datagram Delivery Proto
		proto_cmtp    = 38,  -- IDPR Ctrl Message Trans
		proto_tppp    = 39,  -- TP++ Transport Protocol
		proto_il      = 40,  -- IL Transport Protocol
		proto_ip6     = 41,  -- IPv6
		proto_sdrp    = 42,  -- Source Demand Routing
		proto_routing = 43,  -- IPv6 routing header
		proto_fragment= 44,  -- IPv6 fragmentation header
		proto_rsvp    = 46,  -- Reservation protocol
		proto_gre     = 47,  -- General Routing Encap
		proto_mhrp    = 48,  -- Mobile Host Routing
		proto_ena     = 49,  -- ENA
		proto_esp     = 50,  -- Encap Security Payload
		proto_ah      = 51,  -- Authentication Header
		proto_inlsp   = 52,  -- Integrated Net Layer Sec
		proto_swipe   = 53,  -- SWIPE
		proto_narp    = 54,  -- NBMA Address Resolution
		proto_mobile  = 55,  -- Mobile IP, RFC 2004
		proto_tlsp    = 56,  -- Transport Layer Security
		proto_skip    = 57,  -- SKIP
		proto_icmp6   = 58,  -- ICMP for IPv6
		proto_none    = 59,  -- IPv6 no next header
		proto_dstopts = 60,  -- IPv6 destination options
		proto_anyhost = 61,  -- any host internal proto
		proto_cftp    = 62,  -- CFTP
		proto_anynet  = 63,  -- any local network
		proto_expak   = 64,  -- SATNET and Backroom EXPAK
		proto_kryptolan = 65,  -- Kryptolan
		proto_rvd     = 66,  -- MIT Remote Virtual Disk
		proto_ippc    = 67,  -- Inet Pluribus Packet Core
		proto_distfs  = 68,  -- any distributed fs
		proto_satmon  = 69,  -- SATNET Monitoring
		proto_visa    = 70,  -- VISA Protocol
		proto_ipcv    = 71,  -- Inet Packet Core Utility
		proto_cpnx    = 72,  -- Comp Proto Net Executive
		proto_cphb    = 73,  -- Comp Protocol Heart Beat
		proto_wsn     = 74,  -- Wang Span Network
		proto_pvp     = 75,  -- Packet Video Protocol
		proto_brsatmon= 76,  -- Backroom SATNET Monitor
		proto_sunnd   = 77,  -- SUN ND Protocol
		proto_wbmon   = 78,  -- WIDEBAND Monitoring
		proto_wbexpak = 79,  -- WIDEBAND EXPAK
		proto_eon     = 80,  -- ISO CNLP
		proto_vmtp    = 81,  -- Versatile Msg Transport
		proto_svmtp   = 82,  -- Secure VMTP
		proto_vines   = 83,  -- VINES
		proto_ttp     = 84,  -- TTP
		proto_nsfigp  = 85,  -- NSFNET-IGP
		proto_dgp     = 86,  -- Dissimilar Gateway Proto
		proto_tcf     = 87,  -- TCF
		proto_eigrp   = 88,  -- EIGRP
		proto_ospf    = 89,  -- Open Shortest Path First
		proto_spriterpc= 90,  -- Sprite RPC Protocol
		proto_larp    = 91,  -- Locus Address Resolution
		proto_mtp     = 92,  -- Multicast Transport Proto
		proto_ax25    = 93,  -- AX.25 Frames
		proto_ipipencap= 94,  -- yet-another IP encap
		proto_micp    = 95,  -- Mobile Internet Ctrl
		proto_sccsp   = 96,  -- Semaphore Comm Sec Proto
		proto_etherip = 97,  -- Ethernet in IPv4
		proto_encap   = 98,  -- encapsulation header
		proto_anyenc  = 99,  -- private encryption scheme
		proto_gmtp    = 100,  -- GMTP
		proto_ifmp    = 101,  -- Ipsilon Flow Mgmt Proto
		proto_pnni    = 102,  -- PNNI over IP
		proto_pim     = 103,  -- Protocol Indep Multicast
		proto_aris    = 104,  -- ARIS
		proto_scps    = 105,  -- SCPS
		proto_qnx     = 106,  -- QNX
		proto_an      = 107,  -- Active Networks
		proto_ipcomp  = 108,  -- IP Payload Compression
		proto_snp     = 109,  -- Sitara Networks Protocol
		proto_compaqpeer= 110,  -- Compaq Peer Protocol
		proto_ipxip   = 111,  -- IPX in IP
		proto_vrrp    = 112,  -- Virtual Router Redundancy
		proto_pgm     = 113,  -- PGM Reliable Transport
		proto_any0hop = 114,  -- 0-hop protocol
		proto_l2tp    = 115,  -- Layer 2 Tunneling Proto
		proto_ddx     = 116,  -- D-II Data Exchange (DDX)
		proto_iatp    = 117,  -- Interactive Agent Xfer
		proto_stp     = 118,  -- Schedule Transfer Proto
		proto_srp     = 119,  -- SpectraLink Radio Proto
		proto_uti     = 120,  -- UTI
		proto_smp     = 121,  -- Simple Message Protocol
		proto_sm      = 122,  -- SM
		proto_ptp     = 123,  -- Performance Transparency
		proto_isis    = 124,  -- ISIS over IPv4
		proto_fire    = 125,  -- FIRE
		proto_crtp    = 126,  -- Combat Radio Transport
		proto_crudp   = 127,  -- Combat Radio UDP
		proto_sscopmce= 128,  -- SSCOPMCE
		proto_iplt    = 129,  -- IPLT
		proto_sps     = 130,  -- Secure Packet Shield
		proto_pipe    = 131,  -- Private IP Encap in IP
		proto_sctp    = 132,  -- Stream Ctrl Transmission
		proto_fc      = 133,  -- Fibre Channel
		proto_rsvpign = 134,  -- RSVP-E2E-IGNORE
		proto_raw     = 255,  -- Raw IP packets
		proto_reserved= 255,  -- Reserved
	},
}
541
542return M