Need help in XDP program failing to load with error "R7 offset is outside of the packet"

I have written an XDP program that looks at incoming SCTP packets.

An SCTP packet can contain multiple chunks. I am specifically interested in DATA chunks, which carry the application-layer payload. The complication is that a single SCTP packet can contain many DATA chunks. Below is a Wireshark capture that represents what I am dealing with.

Snap showing multiple chunks in one SCTP packet

I need to look at every DATA chunk and run some logic. In my use case, I have noticed that the total number of chunks bundled into one SCTP packet never exceeds 32.

So, accordingly, I used a loop in the XDP program with a hard-coded bound of 32. The program compiles fine, but when I try to load it into the kernel in native mode, the verifier says "R7 offset is outside of the packet". I then tried reducing the number of iterations to 16; again the program compiles fine, and this time it also loads successfully.

I then went back to 32 (just this change and nothing else) and it fails again. I can't figure out why. Any help in this regard will be appreciated.

I should also mention that the logic I need to run for every chunk is a bit lengthy. Maybe that is the cause of the issue?

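Following is the code (shown here with the loop bound set to 16, the variant that loads; changing the bound to 32 triggers the error):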

#define AMF_CPUS 4
#define INV_RET_U32 4294967295
#define INV_RET_U16 65535
#define INV_RET_U8 255
#define DATA_CHUNK 0

/*
 * Header cursor to keep track of current parsing position.
 *
 * The parsers take a pointer to this struct and move `pos` forward past
 * whatever they consumed, so the next parser starts where the previous
 * one stopped.
 */
struct hdr_cursor {
  void *pos; /* current read position inside the packet buffer */
};

// Read the one-byte chunk type located at `data`.
//
// Returns INV_RET_U8 when that byte would lie beyond `data_end`.
// Note that INV_RET_U8 (255) is also a representable type value, so a chunk
// whose type byte is 255 cannot be told apart from the out-of-bounds case.
static __always_inline __u8 parse_sctp_chunk_type(void *data, void *data_end) {
  const __u8 *type_field = data;

  // Bounds check must precede the load so the access stays inside the packet.
  if ((void *)(type_field + 1) > data_end)
    return INV_RET_U8;

  return *type_field;
}

// Read the 16-bit length field of the chunk starting at `data`, converted
// with bpf_ntohs().
//
// The field sits at byte offset 2; the first 4 bytes of the chunk must lie
// within the packet, otherwise INV_RET_U16 is returned.
static __always_inline __u16 parse_sctp_chunk_size(void *data, void *data_end) {
  // Verify the first four bytes are readable before touching the length field.
  if (data + 4 > data_end)
    return INV_RET_U16;

  const __u16 *len_field = (__u16 *)(data + 2);
  return bpf_ntohs(*len_field);
}

/*
 * Skip the SCTP common header at nh->pos and walk the chunks that follow it.
 *
 * For each chunk the type and length are read (both bounds-checked against
 * data_end), the length is rounded up to a multiple of 4 to account for
 * padding, and the cursor is advanced to the next chunk. DATA chunks are the
 * hook point for per-chunk processing.
 *
 * nh:       parsing cursor; on entry points at the SCTP common header, and is
 *           advanced past the header and every chunk that was fully walked.
 * data_end: one past the last valid byte of the packet.
 *
 * Returns INV_RET_U32 on every path in the current code (malformed packet,
 * end of packet reached, or chunk limit exhausted).
 */
static __always_inline __u32 parse_sctp_hdr(struct hdr_cursor *nh,
                                            void *data_end) {
  enum {
    CHUNK_HDR_LEN = 4,    /* type (1) + flags (1) + length (2) */
    CHUNK_LEN_LIMIT = 512, /* largest chunk length accepted */
    /* NOTE(review): raising this bound to 32 was reported to make the
     * verifier reject the program ("R7 offset is outside of the packet"). */
    CHUNK_LOOP_LIMIT = 16
  };

  struct sctphdr *sctph = nh->pos;
  int hdrsize = sizeof(*sctph);

  /* The whole common header must be inside the packet. */
  if (sctph + 1 > data_end)
    return INV_RET_U32;

  nh->pos += hdrsize;

#pragma clang loop unroll(full)
  for (int i = 0; i < CHUNK_LOOP_LIMIT; ++i) {
    __u8 type = parse_sctp_chunk_type(nh->pos, data_end);
    if (type == INV_RET_U8)
      return INV_RET_U32;

    __u16 size = parse_sctp_chunk_size(nh->pos, data_end);
    /*
     * The length field counts the chunk header itself, so anything below
     * CHUNK_HDR_LEN is malformed. Rejecting it also guarantees forward
     * progress: a length of 0 would otherwise leave the cursor in place and
     * the same chunk would be re-parsed on every remaining iteration.
     * INV_RET_U16 (65535) is caught by the upper bound.
     */
    if (size < CHUNK_HDR_LEN || size > CHUNK_LEN_LIMIT)
      return INV_RET_U32;

    /* Adjust for padding: round up to the next multiple of 4. */
    size = (__u16)((size + 3) & ~3);

    if (type == DATA_CHUNK) {
        // Run logic
    }

    /* Advance only if the next chunk starts strictly inside the packet. */
    if (nh->pos + size < data_end)
      nh->pos += size;
    else
      return INV_RET_U32;
  }
  return INV_RET_U32;
}

/*
 * XDP entry point: parse Ethernet -> IPv4 -> SCTP and walk the SCTP chunks.
 *
 * Every packet is passed up the stack (XDP_PASS) regardless of what the
 * parsers report; the result of parse_sctp_hdr() is not used.
 */
SEC("parse_sctp")
int xdp_parse_sctp(struct xdp_md *ctx) {
  void *pkt_start = (void *)(long)ctx->data;
  void *data_end = (void *)(long)ctx->data_end;

  /* Cursor shared by the parsers; each one advances it past its header. */
  struct hdr_cursor nh;
  __u32 eth_proto, l4_proto;

  nh.pos = pkt_start;

  eth_proto = parse_ethhdr(&nh, data_end);
  if (bpf_ntohs(eth_proto) == ETH_P_IP) {
    l4_proto = parse_iphdr(&nh, data_end);
    /* Only SCTP payloads are inspected further. */
    if (l4_proto == IPPROTO_SCTP)
      parse_sctp_hdr(&nh, data_end);
  }

  return XDP_PASS;
}

/* License string placed in the "license" section of the object file. */
char _license[] SEC("license") = "GPL";

Here are a few last lines of the dump of the log that eBPF verifier spits out while loading the above program:

Full log is available here.

; if (nh->pos + size < data_end)
2937: (57) r2 &= 65535
2938: (bf) r7 = r8
2939: (0f) r7 += r2
last_idx 2939 first_idx 2931
regs=4 stack=0 before 2938: (bf) r7 = r8
regs=4 stack=0 before 2937: (57) r2 &= 65535
regs=4 stack=0 before 2936: (0f) r2 += r1
regs=6 stack=0 before 2935: (79) r2 = *(u64 *)(r10 -40)
regs=2 stack=10 before 2934: (79) r1 = *(u64 *)(r10 -32)
regs=0 stack=18 before 2933: (6b) *(u16 *)(r8 +8) = r1
regs=0 stack=18 before 2932: (dc) r1 = be16 r1
regs=0 stack=18 before 2931: (57) r1 &= 3
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_r=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=pkt(id=65,off=37,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R6=pkt_end(id=0,off=0,imm=0) R7=pkt(id=65,off=27,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_r=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32_r=mmmmmmmm fp-40_r=invP fp-48=pkt
parent didn't have regs=0 stack=10 marks
last_idx 2835 first_idx 2828
regs=0 stack=10 before 2835: (05) goto pc+95
regs=0 stack=10 before 2834: (2d) if r2 > r6 goto pc+99
regs=0 stack=10 before 2833: (07) r2 += 11
regs=0 stack=10 before 2832: (bf) r2 = r8
regs=0 stack=10 before 2831: (73) *(u8 *)(r7 +0) = r2
regs=0 stack=10 before 2830: (47) r2 |= 4
regs=0 stack=10 before 2829: (71) r2 = *(u8 *)(r7 +0)
regs=0 stack=10 before 2828: (3d) if r2 >= r6 goto pc+3
 R0_w=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_rw=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_rw=pkt(id=65,off=28,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R6_r=pkt_end(id=0,off=0,imm=0) R7_r=pkt(id=65,off=27,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_r=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32_r=mmmmmmmm fp-40_r=invP fp-48=pkt
parent didn't have regs=0 stack=10 marks
last_idx 2827 first_idx 2817
regs=0 stack=10 before 2827: (79) r2 = *(u64 *)(r10 -24)
regs=0 stack=10 before 2826: (61) r1 = *(u32 *)(r10 -4)
regs=0 stack=10 before 2825: (15) if r1 == 0x0 goto pc+35
regs=0 stack=10 before 2824: (69) r1 = *(u16 *)(r0 +0)
regs=0 stack=10 before 2823: (15) if r0 == 0x0 goto pc+37
regs=0 stack=10 before 2822: (85) call bpf_map_lookup_elem#1
regs=0 stack=10 before 2820: (18) r1 = 0xffffb27245d41000
regs=0 stack=10 before 2819: (07) r2 += -4
regs=0 stack=10 before 2818: (bf) r2 = r10
regs=0 stack=10 before 2817: (15) if r2 == 0xffff goto pc-2758
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R2_rw=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R3=inv(id=0,umin_value=1,umax_value=4,var_off=(0x0; 0x7)) R6_r=pkt_end(id=0,off=0,imm=0) R7_r=pkt(id=65,off=27,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_r=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8_r=mmmm???? fp-24_r=pkt fp-32_r=mmmmmmmm fp-40_r=invP fp-48=pkt
parent didn't have regs=0 stack=10 marks
last_idx 2816 first_idx 2808
regs=0 stack=10 before 2816: (63) *(u32 *)(r10 -4) = r2
regs=0 stack=10 before 2815: (57) r2 &= 65535
regs=0 stack=10 before 2814: (4f) r2 |= r1
regs=0 stack=10 before 2813: (67) r2 <<= 8
regs=0 stack=10 before 2812: (71) r2 = *(u8 *)(r2 +27)
regs=0 stack=10 before 2811: (71) r1 = *(u8 *)(r2 +28)
regs=0 stack=10 before 2810: (55) if r1 != 0xa00 goto pc-2753
regs=0 stack=10 before 2809: (69) r1 = *(u16 *)(r2 +23)
regs=0 stack=10 before 2808: (2d) if r1 > r6 goto pc-2751
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_rw=pkt(id=65,off=55,r=44,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R2_rw=pkt(id=65,off=26,r=44,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R3=inv(id=0,umin_value=1,umax_value=4,var_off=(0x0; 0x7)) R6_r=pkt_end(id=0,off=0,imm=0) R7_r=pkt(id=65,off=27,r=44,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_r=pkt(id=65,off=26,r=44,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8_r=mmmm???? fp-24_rw=pkt fp-32_r=mmmmmmmm fp-40_r=invP fp-48_w=pkt
parent didn't have regs=0 stack=10 marks
last_idx 2807 first_idx 2798
regs=0 stack=10 before 2807: (7b) *(u64 *)(r10 -48) = r1
regs=0 stack=10 before 2806: (07) r1 += 29
regs=0 stack=10 before 2805: (bf) r1 = r2
regs=0 stack=10 before 2804: (bf) r2 = r8
regs=0 stack=10 before 2803: (15) if r9 == 0xf goto pc+32
regs=0 stack=10 before 2802: (71) r9 = *(u8 *)(r2 +17)
regs=0 stack=10 before 2801: (7b) *(u64 *)(r10 -24) = r1
regs=0 stack=10 before 2800: (07) r1 += 2
regs=0 stack=10 before 2799: (bf) r1 = r2
regs=0 stack=10 before 2798: (2d) if r1 > r6 goto pc-2739
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_rw=pkt(id=65,off=44,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R2_rw=pkt(id=65,off=26,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R3_w=inv(id=0,umin_value=1,umax_value=4,var_off=(0x0; 0x7)) R6_r=pkt_end(id=0,off=0,imm=0) R7_r=pkt(id=65,off=27,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_r=pkt(id=65,off=26,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=invP(id=0,umax_value=516,var_off=(0x0; 0xffff),s32_max_value=65535,u32_max_value=65535) R10=fp0 fp-8_r=mmmm???? fp-24=pkt fp-32_r=mmmmmmmm fp-40_rw=invP fp-48=pkt
parent didn't have regs=0 stack=10 marks
last_idx 2797 first_idx 2790
regs=0 stack=10 before 2797: (07) r1 += 18
regs=0 stack=10 before 2796: (bf) r1 = r2
regs=0 stack=10 before 2795: (bf) r2 = r8
regs=0 stack=10 before 2794: (55) if r1 != 0x0 goto pc+139
regs=0 stack=10 before 2793: (7b) *(u64 *)(r10 -40) = r3
regs=8 stack=0 before 2792: (1f) r3 -= r2
regs=c stack=0 before 2791: (b7) r3 = 4
regs=4 stack=0 before 2790: (15) if r2 == 0x0 goto pc+3
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_rw=inv(id=66,umax_value=255,var_off=(0x0; 0xff)) R2_rw=invP(id=0,umax_value=3,var_off=(0x0; 0x3)) R3_w=invP0 R6_r=pkt_end(id=0,off=0,imm=0) R7_r=pkt(id=65,off=27,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_r=pkt(id=65,off=26,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=invP(id=0,umax_value=516,var_off=(0x0; 0xffff),s32_max_value=65535,u32_max_value=65535) R10=fp0 fp-8_r=mmmm???? fp-24=pkt fp-32_rw=mmmmmmmm fp-40_w=00000000 fp-48=pkt
parent didn't have regs=4 stack=0 marks
last_idx 2789 first_idx 2778
regs=4 stack=0 before 2789: (7b) *(u64 *)(r10 -40) = r3
regs=4 stack=0 before 2788: (b7) r3 = 0
regs=4 stack=0 before 2787: (57) r2 &= 3
regs=4 stack=0 before 2786: (79) r2 = *(u64 *)(r10 -32)
regs=0 stack=8 before 2785: (25) if r2 > 0x200 goto pc-2726
regs=0 stack=8 before 2784: (7b) *(u64 *)(r10 -32) = r2
regs=4 stack=0 before 2783: (dc) r2 = be16 r2
regs=5 stack=0 before 2782: (69) r2 = *(u16 *)(r8 +2)
regs=1 stack=0 before 2781: (15) if r2 == 0xff goto pc-2722
regs=1 stack=0 before 2780: (bf) r2 = r1
regs=1 stack=0 before 2779: (71) r1 = *(u8 *)(r8 +0)
regs=1 stack=0 before 2778: (2d) if r1 > r6 goto pc-2719
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_rw=pkt(id=65,off=30,r=27,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R2_w=pkt(id=61,off=26,r=55,umin_value=20,umax_value=7800,var_off=(0x0; 0x7fffffff),s32_min_value=0,u32_max_value=2147483647) R6_r=pkt_end(id=0,off=0,imm=0) R7_rw=pkt(id=65,off=27,r=27,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8_rw=pkt(id=65,off=26,r=27,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9_w=invP(id=0,umax_value=516,var_off=(0x0; 0xffff),s32_max_value=65535,u32_max_value=65535) R10=fp0 fp-8_r=mmmm???? fp-24=pkt fp-32=pkt fp-40=inv fp-48=pkt
parent already had regs=0 stack=0 marks
; if (nh->pos + size < data_end)
2940: (3d) if r7 >= r6 goto pc-2881
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_w=inv(id=0) R2_w=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R6=pkt_end(id=0,off=0,imm=0) R7_w=pkt(id=68,off=26,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R8=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32=mmmmmmmm fp-40=inv fp-48=pkt
2941: (bf) r1 = r7
2942: (07) r1 += 1
2943: (7b) *(u64 *)(r10 -24) = r1
2944: (2d) if r1 > r6 goto pc-2885
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_w=pkt(id=68,off=27,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R2_w=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R6=pkt_end(id=0,off=0,imm=0) R7_w=pkt(id=68,off=26,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R8=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24_w=pkt fp-32=mmmmmmmm fp-40=inv fp-48=pkt
2945: (bf) r1 = r7
2946: (07) r1 += 4
; if (type == INV_RET_U8)
2947: (2d) if r1 > r6 goto pc-2888
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1=pkt(id=68,off=30,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R2=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R6=pkt_end(id=0,off=0,imm=0) R7=pkt(id=68,off=26,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R8=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32=mmmmmmmm fp-40=inv fp-48=pkt
2948: (71) r1 = *(u8 *)(r7 +0)
invalid access to packet, off=26 size=1, R7(id=68,off=26,r=0)
R7 offset is outside of the packet
processed 1225 insns (limit 1000000) max_states_per_insn 0 total_states 107 peak_states 107 mark_read 7

libbpf: -- END LOG --
libbpf: failed to load program 'xdp_cpu_redirect_stream'
libbpf: failed to load object 'xdp_stream_nat_new.o'
ERR: loading BPF-OBJ file(xdp_stream_nat_new.o) (-4007): Unknown error 4007
ERR: loading file: xdp_stream_nat_new.o

Solution 1:

TL;DR. You are hitting a corner-case limitation of the verifier. Changing the end of the for loop to the following may help.

#define MAX_PACKET_OFF 0xffff
...
nh->pos += size;
if (nh->pos > MAX_PACKET_OFF)
     return INV_RET_U32;
if (nh->pos >= data_end)
    return INV_RET_U32;

The full explanation is a bit long, see below.


Verifier error explanation

2945: (bf) r1 = r7
2946: (07) r1 += 4
2947: (2d) if r1 > r6 goto pc-2888
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1=pkt(id=68,off=30,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R2=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R6=pkt_end(id=0,off=0,imm=0) R7=pkt(id=68,off=26,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R8=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32=mmmmmmmm fp-40=inv fp-48=pkt
2948: (71) r1 = *(u8 *)(r7 +0)
invalid access to packet, off=26 size=1, R7(id=68,off=26,r=0)
R7 offset is outside of the packet

The verifier errors because it thinks R7 is outside the packet's known bounds. It tells us you're trying to make an access of size 1B at offset 26 into the packet pointer, but the packet has a known size of 0 (r=0, for range=0).

Maximum packet size limitation

That's weird because you did check the packet bounds. On instruction 2947, the packet pointer R1 is compared to R6, the pointer to the end of the packet. So following that check, the known minimum size of R1 should be updated, but it remains 0 (r=0).

That is happening because you are hitting a corner-case limitation of the verifier:

if (dst_reg->umax_value > MAX_PACKET_OFF ||
    dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
    /* Risk of overflow.  For instance, ptr + (1<<63) may be less
     * than pkt_end, but that's because it's also less than pkt.
     */
    return;

As explained in the comment, this check is here to prevent overflows. Since R1's unsigned maximum value is 73851 (umax_value=73851), the condition is true and the packet's known size is not updated.

A way to prevent this from happening might be to ensure there's an additional bounds check on R1. For example:

#define MAX_PACKET_OFF 0xffff
...
if (nh->pos + size > MAX_PACKET_OFF)
     return INV_RET_U32;

Why is R1's unsigned maximum value so high?

R1 comes from R7, which is initialized on those instructions:

2934: (79) r1 = *(u64 *)(r10 -32)
2935: (79) r2 = *(u64 *)(r10 -40)
2936: (0f) r2 += r1
; if (nh->pos + size < data_end)
2937: (57) r2 &= 65535
2938: (bf) r7 = r8
2939: (0f) r7 += r2
; if (nh->pos + size < data_end)
2940: (3d) if r7 >= r6 goto pc-2881
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_w=inv(id=0) R2_w=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R6=pkt_end(id=0,off=0,imm=0) R7_w=pkt(id=68,off=26,r=0,umin_value=20,umax_value=73851,var_off=(0x0; 0xffffffff)) R8=pkt(id=65,off=26,r=55,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32=mmmmmmmm fp-40=inv fp-48=pkt

Two values are retrieved from the stack, at offsets -32 and -40. Added together, those two values hold the variable size. Since size is a __u16, the sum is ANDed with 65535 (the maximum __u16 value). So the verifier identifies R2 as having a maximum value of 65535.

When R2 is added to R7 (a copy of R8, whose maximum value is 8316), R7's maximum value becomes 8316 + 65535 = 73851, which is larger than MAX_PACKET_OFF = 65535.

Shouldn't the verifier understand that size <= 516?

The following code ensures size will never be larger than 516 (512 + 4 in the worst case):

__u16 size = parse_sctp_chunk_size(nh->pos, data_end);
if (size > 512)
  return INV_RET_U32;

//Adjust for padding
size += (size % 4) == 0 ? 0 : 4 - size % 4;

So why is the verifier losing track of that?

Part of variable size is saved on the stack, at offset -32, here:

2782: (69) r2 = *(u16 *)(r8 +2)
2783: (dc) r2 = be16 r2
2784: (7b) *(u64 *)(r10 -32) = r2
; if (size > 512)
2785: (25) if r2 > 0x200 goto pc-2726
 R0=map_value(id=0,off=0,ks=4,vs=2,imm=0) R1_w=inv(id=66,umax_value=255,var_off=(0x0; 0xff)) R2_w=inv(id=0,umax_value=512,var_off=(0x0; 0xffffffff)) R6=pkt_end(id=0,off=0,imm=0) R7=pkt(id=65,off=27,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R8=pkt(id=65,off=26,r=30,umin_value=20,umax_value=8316,var_off=(0x0; 0xffffffff)) R9=invP(id=0,umax_value=516,var_off=(0x0; 0xffff),s32_max_value=65535,u32_max_value=65535) R10=fp0 fp-8=mmmm???? fp-24=pkt fp-32_w=mmmmmmmm fp-40=inv fp-48=pkt

Unfortunately, the value is saved on the stack before the comparison with 512 happens. Therefore, the verifier doesn't know that the value saved on the stack is at most 512. We can see that from fp-32_w=mmmmmmmm: each m stands for MISC, meaning the value could be anything from the verifier's point of view.

I believe this limitation of the verifier was removed in recent Linux versions.


Why does the issue only appear with 32 iterations?

I suspect that the variable size is only saved on the stack if the program becomes really large. As long as the variable is not saved on the stack, the verifier doesn't lose track of its maximum value 516.