summaryrefslogtreecommitdiff
path: root/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp
blob: 00dbfc9937d3f3a296b262561946c265ca0dded5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
/* $Id$ */
/** @file
 * IPRT - I/O queue, Linux io_uring interface I/O file provider.
 */

/*
 * Copyright (C) 2019-2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
 * in the VirtualBox distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 *
 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
 */

/** @page pg_rtioqueue_linux     RTIoQueue - Linux io_uring implementation notes
 * @internal
 *
 * The io_uring interface is the most recent interface added to the Linux kernel
 * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
 * thus not available on most systems as of writing this backend (July 2019).
 * It supersedes the old async I/O interface and does away with some of its restrictions, such as
 * having to disable caching for the file.
 * The interface is centered around a submission and completion queue to queue multiple new
 * requests for the kernel to process and get notified about completions to reduce the amount
 * of context switches to an absolute minimum. It also offers advanced features like
 * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
 * even more.
 *
 * The first implementation will only make use of the basic features and more advanced features
 * will be added later.
 * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
 * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
 * while still keeping a consistent platform independent API which allows efficient implementations on
 * other hosts when they come up.
 *
 * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
 * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
 * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
 *     * http://kernel.dk/io_uring.pdf
 *     * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#define LOG_GROUP RTLOGGROUP_IOQUEUE
#include <iprt/ioqueue.h>

#include <iprt/assertcompile.h>
#include <iprt/asm.h>
#include <iprt/errcore.h>
#include <iprt/file.h>
#include <iprt/log.h>
#include <iprt/mem.h>
#include <iprt/string.h>

#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>

#include "internal/ioqueue.h"


/*********************************************************************************************************************************
*   Defined Constants And Macros                                                                                                 *
*********************************************************************************************************************************/

/** The syscall number of io_uring_setup(). */
#define LNX_IOURING_SYSCALL_SETUP     425
/** The syscall number of io_uring_enter(). */
#define LNX_IOURING_SYSCALL_ENTER     426
/** The syscall number of io_uring_register(). */
#define LNX_IOURING_SYSCALL_REGISTER  427
/** The eventfd2() syscall number (x86-64 value); not part of io_uring but used for kicking waiters. */
#define LNX_SYSCALL_EVENTFD2          290


/*********************************************************************************************************************************
*   Structures and Typedefs                                                                                                      *
*********************************************************************************************************************************/

/**
 * Linux io_uring completion event (one entry of the completion queue ring).
 *
 * The layout is shared with the kernel, the size is verified at compile time.
 */
typedef struct LNXIOURINGCQE
{
    /** Opaque user data associated with the completed request
     * (see LNXIOURINGSQE::u64User). */
    uint64_t                    u64User;
    /** The status code of the request. Non-negative values are treated as success
     * and as the number of bytes transferred, negative values as a negated errno. */
    int32_t                     rcLnx;
    /** Some flags which are not used as of now. */
    uint32_t                    fFlags;
} LNXIOURINGCQE;
AssertCompileSize(LNXIOURINGCQE, 16);
/** Pointer to a Linux io_uring completion event. */
typedef LNXIOURINGCQE *PLNXIOURINGCQE;
/** Pointer to a constant Linux io_uring completion event. */
typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;


/**
 * Linux io_uring submission queue entry.
 *
 * The layout is shared with the kernel, the size is verified at compile time.
 */
typedef struct LNXIOURINGSQE
{
    /** The opcode for the request, see LNX_IOURING_OPC_*. */
    uint8_t                     u8Opc;
    /** Common flags for the request, see LNX_IOURING_SQE_F_*. */
    uint8_t                     u8Flags;
    /** Assigned I/O priority. */
    uint16_t                    u16IoPrio;
    /** The file descriptor the request is for. */
    int32_t                     i32Fd;
    /** The start offset into the file for the request. */
    uint64_t                    u64OffStart;
    /** Buffer pointer or pointer to the I/O vector array depending on the opcode. */
    uint64_t                    u64AddrBufIoVec;
    /** Size of the buffer in bytes or number of I/O vectors. */
    uint32_t                    u32BufIoVecSz;
    /** Opcode dependent data. */
    union
    {
        /** Flags for read/write requests. */
        uint32_t                u32KrnlRwFlags;
        /** Flags for fsync() like requests, see LNX_IOURING_OPC_FSYNC_*. */
        uint32_t                u32FsyncFlags;
        /** Flags for poll() like requests. */
        uint16_t                u16PollFlags;
        /** Flags for sync_file_range() like requests. */
        uint32_t                u32SyncFileRangeFlags;
        /** Flags for requests requiring a msg structure. */
        uint32_t                u32MsgFlags;
    } uOpc;
    /** Opaque user data associated with the request and returned during completion
     * (see LNXIOURINGCQE::u64User). */
    uint64_t                    u64User;
    /** Request type dependent data. */
    union
    {
        /** Fixed buffer index if indicated by the request flags. */
        uint16_t                u16FixedBufIdx;
        /** Padding to align the structure to 64 bytes. */
        uint64_t                au64Padding[3];
    } uReq;
} LNXIOURINGSQE;
AssertCompileSize(LNXIOURINGSQE, 64);
/** Pointer to a Linux io_uring submission queue entry. */
typedef LNXIOURINGSQE *PLNXIOURINGSQE;
/** Pointer to a constant Linux io_uring submission queue entry. */
typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;


/**
 * Linux io_uring SQ ring offsets structure.
 *
 * Part of LNXIOURINGPARAMS. Every u32Off* member is a byte offset which is added
 * to the start of the mapped SQ ring in order to locate the respective field.
 */
typedef struct LNXIOURINGSQ
{
    /** Offset of the head counter.
     * NOTE(review): in io_uring the kernel consumes requests at the head - confirm. */
    uint32_t                    u32OffHead;
    /** Offset of the tail counter, this provider fills in new requests at the tail
     * (see RTIOQUEUEPROVINT::idxSqTail). */
    uint32_t                    u32OffTail;
    /** Offset of the mask to apply to the head and tail counters to retrieve the index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the SQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the flags set asynchronously by the kernel. */
    uint32_t                    u32OffFlags;
    /** Offset of the counter of dropped requests. */
    uint32_t                    u32OffDroppedReqs;
    /** Offset of the indirection array holding the indices of the real SQ entries. */
    uint32_t                    u32OffArray;
    /** Reserved. */
    uint32_t                    u32Rsvd0;
    /** Reserved. */
    uint64_t                    u64Rsvd1;
} LNXIOURINGSQ;
AssertCompileSize(LNXIOURINGSQ, 40);
/** Pointer to a Linux io_uring SQ ring offsets structure. */
typedef LNXIOURINGSQ *PLNXIOURINGSQ;
/** Pointer to a constant Linux io_uring SQ ring offsets structure. */
typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;


/**
 * Linux io_uring CQ ring offsets structure.
 *
 * Part of LNXIOURINGPARAMS. Every u32Off* member is a byte offset which is added
 * to the start of the mapped CQ ring in order to locate the respective field.
 */
typedef struct LNXIOURINGCQ
{
    /** Offset of the head counter, this provider reads completion events starting
     * at the head and advances it afterwards. */
    uint32_t                    u32OffHead;
    /** Offset of the tail counter, completion events are pending while it differs
     * from the head. NOTE(review): presumably advanced by the kernel - confirm. */
    uint32_t                    u32OffTail;
    /** Offset of the mask to apply to the head and tail counters to retrieve the index. */
    uint32_t                    u32OffRingMask;
    /** Offset of the number of entries in the CQ ring. */
    uint32_t                    u32OffRingEntries;
    /** Offset of the counter of CQ overflows which happened. */
    uint32_t                    u32OffOverflowCnt;
    /** Offset of the array of completion events (LNXIOURINGCQE). */
    uint32_t                    u32OffCqes;
    /** Reserved. */
    uint64_t                    au64Rsvd0[2];
} LNXIOURINGCQ;
AssertCompileSize(LNXIOURINGCQ, 40);
/** Pointer to a Linux io_uring CQ ring offsets structure. */
typedef LNXIOURINGCQ *PLNXIOURINGCQ;
/** Pointer to a constant Linux io_uring CQ ring offsets structure. */
typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;


/**
 * Linux io_uring parameters passed to io_uring_setup().
 *
 * This provider passes the structure zeroed and reads the entry counts and the
 * ring offsets back after a successful call in order to map the rings.
 */
typedef struct LNXIOURINGPARAMS
{
    /** Number of SQ entries, must be power of 2. */
    uint32_t                    u32SqEntriesCnt;
    /** Number of CQ entries, must be power of 2. */
    uint32_t                    u32CqEntriesCnt;
    /** Flags for the ring, see LNX_IOURING_SETUP_F_*. */
    uint32_t                    u32Flags;
    /** Affinity of the kernel side SQ polling thread if enabled. */
    uint32_t                    u32SqPollCpu;
    /** Milliseconds after which the kernel side SQ polling thread goes to sleep
     * if there are no requests to process. */
    uint32_t                    u32SqPollIdleMs;
    /** Reserved. */
    uint32_t                    au32Rsvd0[5];
    /** Offsets returned for the submission queue. */
    LNXIOURINGSQ                SqOffsets;
    /** Offsets returned for the completion queue. */
    LNXIOURINGCQ                CqOffsets;
} LNXIOURINGPARAMS;
/** Pointer to Linux io_uring parameters. */
typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
/** Pointer to constant Linux io_uring parameters. */
typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;


/** @name LNXIOURINGSQE::u8Opc defined opcodes.
 * @{ */
/** Opcode to profile the interface, does nothing. */
#define LNX_IOURING_OPC_NOP             0
/** preadv() like request. */
#define LNX_IOURING_OPC_READV           1
/** pwritev() like request. */
#define LNX_IOURING_OPC_WRITEV          2
/** fsync() like request. */
#define LNX_IOURING_OPC_FSYNC           3
/** Read request using a fixed preset buffer. */
#define LNX_IOURING_OPC_READ_FIXED      4
/** Write request using a fixed preset buffer. */
#define LNX_IOURING_OPC_WRITE_FIXED     5
/** Add file descriptor to pollset. */
#define LNX_IOURING_OPC_POLL_ADD        6
/** Remove file descriptor from pollset. */
#define LNX_IOURING_OPC_POLL_REMOVE     7
/** sync_file_range() like request. */
#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
/** sendmsg() like request. */
#define LNX_IOURING_OPC_SENDMSG         9
/** recvmsg() like request. */
#define LNX_IOURING_OPC_RECVMSG         10
/** @} */


/** @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
 * @{ */
/** Use fdatasync() like semantics: flush the file data but skip metadata not required to read it back. */
#define LNX_IOURING_OPC_FSYNC_DATASYNC  RT_BIT_32(0)
/** @} */


/** @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
 * @{ */
/** The I/O context is polled. */
#define LNX_IOURING_SETUP_F_IOPOLL      RT_BIT_32(0)
/** The kernel should poll the submission queue. */
#define LNX_IOURING_SETUP_F_SQPOLL      RT_BIT_32(1)
/** Sets the CPU affinity of the kernel thread polling the submission queue. */
#define LNX_IOURING_SETUP_F_SQAFF       RT_BIT_32(2)
/** @} */


/** @name Flags for LNXIOURINGSQE::u8Flags.
 * @{ */
/** The file descriptor was registered before use. */
#define LNX_IOURING_SQE_F_FIXED_FILE    RT_BIT(0)
/** Complete all active requests before issuing the request with the flag set. */
#define LNX_IOURING_SQE_F_IO_DRAIN      RT_BIT(1)
/** Links the request with the flag set to the next one. */
#define LNX_IOURING_SQE_F_IO_LINK       RT_BIT(2)
/** @} */


/** @name Magic mmap offsets to map submission and completion queues.
 * @{ */
/** Used to map the submission queue. */
#define LNX_IOURING_MMAP_OFF_SQ         UINT64_C(0)
/** Used to map the completion queue. */
#define LNX_IOURING_MMAP_OFF_CQ         UINT64_C(0x8000000)
/** Used to map the submission queue entries array. */
#define LNX_IOURING_MMAP_OFF_SQES       UINT64_C(0x10000000)
/** @} */


/** @name Flags used for the SQ ring structure.
 * @{ */
/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP           RT_BIT_32(0)
/** @} */


/** @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
 * @{ */
/** Retrieve completion events for the completion queue. */
#define LNX_IOURING_ENTER_F_GETEVENTS               RT_BIT_32(0)
/** Wakes the suspended kernel thread processing the requests. */
#define LNX_IOURING_ENTER_F_SQ_WAKEUP               RT_BIT_32(1)
/** @} */


/** @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
 * @{ */
/** Register a fixed set of buffers. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER   0
/** Unregisters a fixed set of buffers registered previously. */
#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
/** Register a fixed set of files. */
#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER     2
/** Unregisters a fixed set of files registered previously. */
#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER   3
/** Register an eventfd associated with the I/O ring. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER   4
/** Unregisters an eventfd registered previously. */
#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
/** @} */


/**
 * SQ ring structure.
 *
 * Filled in by the queue initialization from the mapped SQ ring using the offsets
 * returned in LNXIOURINGPARAMS::SqOffsets.
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUESQ
{
    /** Pointer to the head counter. */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter. */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index (copied once during init). */
    uint32_t                    fRingMask;
    /** Number of entries in the ring (copied once during init). */
    uint32_t                    cEntries;
    /** Pointer to the global flags, see LNX_IOURING_SQ_RING_F_*. */
    volatile uint32_t           *pfFlags;
    /** Pointer to the indirection array used for indexing the real SQ entries. */
    volatile uint32_t           *paidxSqes;
} RTIOQUEUESQ;


/**
 * CQ ring structure.
 *
 * Filled in by the queue initialization from the mapped CQ ring using the offsets
 * returned in LNXIOURINGPARAMS::CqOffsets.
 *
 * @note Some members of this structure point to memory shared with the kernel,
 *       hence the volatile keyword.
 */
typedef struct RTIOQUEUECQ
{
    /** Pointer to the head counter, advanced by this provider after consuming events. */
    volatile uint32_t           *pidxHead;
    /** Pointer to the tail counter. */
    volatile uint32_t           *pidxTail;
    /** Mask to apply for the counters to get to the index (copied once during init). */
    uint32_t                    fRingMask;
    /** Number of entries in the ring (copied once during init). */
    uint32_t                    cEntries;
    /** Pointer to the completion entry ring. */
    volatile LNXIOURINGCQE      *paCqes;
} RTIOQUEUECQ;


/**
 * Internal I/O queue provider instance data.
 *
 * Set up by the queue init callback and torn down by the queue destroy callback.
 */
typedef struct RTIOQUEUEPROVINT
{
    /** The io_uring file descriptor. */
    int                         iFdIoCtx;
    /** The eventfd file descriptor registered with the ring. */
    int                         iFdEvt;
    /** The submission queue. */
    RTIOQUEUESQ                 Sq;
    /** The currently uncommitted tail for the SQ, masked with the SQ ring mask
     * to select the entry for the next request. */
    uint32_t                    idxSqTail;
    /** Number of uncommitted SQEs. */
    uint32_t                    cSqesToCommit;
    /** The completion queue. */
    RTIOQUEUECQ                 Cq;
    /** Pointer to the mapped SQES entries. */
    PLNXIOURINGSQE              paSqes;
    /** Pointer to the iovec structure used for non S/G requests, one entry
     * per SQ entry. */
    struct iovec                *paIoVecs;
    /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
    void                        *pvMMapSqRing;
    /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
    void                        *pvMMapCqRing;
    /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
    void                        *pvMMapSqes;
    /** Size of the mapped SQ ring, used for unmapping. */
    size_t                      cbMMapSqRing;
    /** Size of the mapped CQ ring, used for unmapping. */
    size_t                      cbMMapCqRing;
    /** Size of the mapped SQ entries array, used for unmapping. */
    size_t                      cbMMapSqes;
    /** Flag whether the waiter was woken up externally. */
    volatile bool               fExtIntr;
} RTIOQUEUEPROVINT;
/** Pointer to the internal I/O queue provider instance data. */
typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;


/*********************************************************************************************************************************
*   Internal Functions                                                                                                           *
*********************************************************************************************************************************/

/**
 * Thin wrapper around the io_uring_setup() syscall.
 *
 * @returns IPRT status code, converted from errno when the syscall fails.
 * @param   cEntries            Number of entries for submission and completion queues.
 * @param   pParams             Additional parameters for the I/O ring, updated with
 *                              the return values on success.
 * @param   piFdIoCtx           Where to store the file descriptor of the I/O ring on
 *                              success, left untouched on failure.
 */
DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
{
    int rc = VINF_SUCCESS;
    int const iFdNew = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
    if (RT_UNLIKELY(iFdNew == -1))
        rc = RTErrConvertFromErrno(errno);
    else
        *piFdIoCtx = iFdNew;

    return rc;
}


/**
 * Thin wrapper around the io_uring_enter() syscall.
 *
 * @returns IPRT status code, converted from errno when the syscall fails.
 * @param   iFdIoCtx            The I/O ring file descriptor.
 * @param   cToSubmit           Maximum number of requests waiting for processing.
 * @param   cMinComplete        Minimum number of completion events to accumulate before returning.
 * @param   fFlags              Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
 */
DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
                                         uint32_t fFlags)
{
    /* The trailing two arguments are always passed as NULL/0 by this wrapper. */
    int const rcSys = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
                              NULL, 0);
    return RT_UNLIKELY(rcSys == -1) ? RTErrConvertFromErrno(errno) : VINF_SUCCESS;
}


/**
 * Thin wrapper around the io_uring_register() syscall.
 *
 * @returns IPRT status code, converted from errno when the syscall fails.
 * @param   iFdIoCtx            The I/O ring file descriptor.
 * @param   uOpc                Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
 * @param   pvArg               Opaque arguments.
 * @param   cArgs               Number of arguments.
 */
DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
                                            uint32_t cArgs)
{
    int rc = VINF_SUCCESS;
    int const rcSys = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
    if (RT_UNLIKELY(rcSys == -1))
        rc = RTErrConvertFromErrno(errno);

    return rc;
}


/**
 * Maps a region of the I/O ring into the process, translating failures into an
 * IPRT status code.
 *
 * The mapping is always created readable and writable, shared and pre-populated.
 *
 * @returns IPRT status code, converted from errno when mmap() fails.
 * @param   iFdIoCtx            The I/O ring file descriptor.
 * @param   offMmap             The mmap() offset, see LNX_IOURING_MMAP_OFF_*.
 * @param   cbMmap              How much to map.
 * @param   ppv                 Where to store the pointer to the mapping on success,
 *                              left untouched on failure.
 */
DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
{
    void *pvMapping = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
    if (pvMapping == MAP_FAILED)
        return RTErrConvertFromErrno(errno);

    *ppv = pvMapping;
    return VINF_SUCCESS;
}


/**
 * eventfd2() syscall wrapper.
 *
 * @returns IPRT status code, converted from errno when the syscall fails.
 * @param   uValInit            The initial value of the maintained counter.
 * @param   fFlags              Flags controlling the eventfd behavior.
 * @param   piFdEvt             Where to store the file descriptor of the eventfd object on
 *                              success, left untouched on failure.
 */
DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
{
    /*
     * The eventfd2() syscall number is architecture specific and the hard coded
     * LNX_SYSCALL_EVENTFD2 value is only correct for x86-64. Prefer the number
     * exported by <sys/syscall.h> and keep the hard coded value solely as a
     * fallback for build hosts whose headers lack it.
     */
#ifdef SYS_eventfd2
    int rcLnx = syscall(SYS_eventfd2, uValInit, fFlags);
#else
    int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
#endif
    if (RT_UNLIKELY(rcLnx == -1))
        return RTErrConvertFromErrno(errno);

    *piFdEvt = rcLnx;
    return VINF_SUCCESS;
}


/**
 * Checks the completion event queue for pending events.
 *
 * Converts the completion entries between the CQ head and tail into IPRT
 * completion events, stopping when the ring is drained or the caller's array is
 * full, and publishes the new head afterwards so the consumed slots can be reused.
 *
 * @returns nothing.
 * @param   pThis               The provider instance.
 * @param   paCEvt              Pointer to the array of completion events.
 * @param   cCEvt               Maximum number of completion events the array can hold.
 * @param   pcCEvtSeen          Where to store the number of completion events processed.
 */
static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
                                               uint32_t cCEvt, uint32_t *pcCEvtSeen)
{
    /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
    ASMReadFence();
    uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
    uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
    ASMReadFence();

    uint32_t cCEvtSeen = 0;

    while (   idxCqTail != idxCqHead
           && cCEvtSeen < cCEvt)
    {
        /* Get the index. */
        uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
        volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];

        /* The entry lives in memory shared with the kernel, so read the status exactly once. */
        int32_t const rcLnx = pCqe->rcLnx;

        paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
        if (rcLnx >= 0)
        {
            paCEvt->rcReq    = VINF_SUCCESS;
            paCEvt->cbXfered = (size_t)rcLnx;
        }
        else
        {
            /* Negative values are negated errno codes; don't hand stale transfer sizes to the caller. */
            paCEvt->rcReq    = RTErrConvertFromErrno(-rcLnx);
            paCEvt->cbXfered = 0;
        }

#ifdef RT_STRICT /* poison */
        memset((void *)pCqe, 0xff, sizeof(*pCqe));
#endif

        paCEvt++;
        cCEvtSeen++;
        idxCqHead++;
    }

    *pcCEvtSeen = cCEvtSeen;

    /* Paranoia strikes again. */
    ASMWriteFence();
    ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
    ASMWriteFence();
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported}
 *
 * Probes for support by creating a small I/O ring and registering an eventfd
 * with it; everything created for the probe is closed again before returning.
 */
static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
{
    /*
     * Try to create a simple I/O ring first.
     * The common code/public API already checked for the proper handle type.
     */
    LNXIOURINGPARAMS Params;
    RT_ZERO(Params);

    int iFdIoCtx = 0;
    int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
    if (!RT_SUCCESS(rc))
        return false;

    /*
     * Check that we can register an eventfd descriptor to get notified about
     * completion events while being able to kick the waiter externally out of the wait.
     */
    bool fSupported = false;
    int iFdEvt = 0;
    rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
    if (RT_SUCCESS(rc))
    {
        rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
                                         &iFdEvt, 1 /*cArgs*/);
        fSupported = RT_SUCCESS(rc);

        int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
    }

    /* The probe ring is not needed any longer, regardless of the outcome. */
    int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
    return fSupported;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit}
 *
 * Creates the I/O ring, allocates the per request I/O vectors, registers an
 * eventfd with the ring and maps the SQ ring, the CQ ring and the SQ entries
 * array. Everything acquired so far is released again when a step fails and
 * the status code of the failing step is returned.
 *
 * @note fFlags and cCqEntries are currently ignored.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
                                                               uint32_t cSqEntries, uint32_t cCqEntries)
{
    RT_NOREF(fFlags, cCqEntries);

    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    LNXIOURINGPARAMS Params;
    RT_ZERO(Params);

    pThis->cSqesToCommit = 0;
    pThis->fExtIntr      = false;

    int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
    if (RT_SUCCESS(rc))
    {
        /* Calculate the sizes of the mappings from the values returned by the kernel. */
        pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
        pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
        pThis->cbMMapSqes   = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);

        /* One I/O vector per SQ entry. */
        pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
        if (RT_LIKELY(pThis->paIoVecs))
        {
            rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
            if (RT_SUCCESS(rc))
            {
                rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
                if (RT_SUCCESS(rc))
                {
                    /* Map the rings into userspace. */
                    rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
                    if (RT_SUCCESS(rc))
                    {
                        rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
                        if (RT_SUCCESS(rc))
                        {
                            rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
                            if (RT_SUCCESS(rc))
                            {
                                /* Resolve the SQ ring members using the offsets returned by the kernel. */
                                uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;

                                pThis->Sq.pidxHead  = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
                                pThis->Sq.pidxTail  = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
                                pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
                                pThis->Sq.cEntries  = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
                                pThis->Sq.pfFlags   = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
                                pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
                                pThis->idxSqTail    = *pThis->Sq.pidxTail;

                                pThis->paSqes       = (PLNXIOURINGSQE)pThis->pvMMapSqes;

                                /* Same for the CQ ring. */
                                pbTmp = (uint8_t *)pThis->pvMMapCqRing;

                                pThis->Cq.pidxHead  = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
                                pThis->Cq.pidxTail  = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
                                pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
                                pThis->Cq.cEntries  = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
                                pThis->Cq.paCqes    = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
                                return VINF_SUCCESS;
                            }

                            munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
                        }

                        munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
                    }

                    /*
                     * Use a separate status variable for the cleanup, otherwise the status of
                     * the failed mmap() gets overwritten and the caller is told everything went fine.
                     */
                    int rc2 = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
                    AssertRC(rc2); RT_NOREF(rc2);
                }

                close(pThis->iFdEvt);
            }

            RTMemFree(pThis->paIoVecs);
        }
        else
            rc = VERR_NO_MEMORY; /* rc still holds the success status of the ring setup here. */

        int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
    }

    return rc;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy}
 *
 * Unmaps the rings and the SQ entries array, unregisters and closes the eventfd,
 * closes the I/O ring, frees the I/O vector array and finally wipes the
 * instance data.
 */
static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;

    int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
    rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
    rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);

    int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
    AssertRC(rc);

    close(pThis->iFdEvt);
    close(pThis->iFdIoCtx);
    RTMemFree(pThis->paIoVecs);

    /* Wipe the instance data itself; RT_ZERO(pThis) would merely clear the local pointer variable. */
    RT_ZERO(*pThis);
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister}
 *
 * Currently a no-op which always reports success.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
{
    /** @todo Add support for fixed file sets later. */
    RT_NOREF(pHandle);
    RT_NOREF(hIoQueueProv);
    return VINF_SUCCESS;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister}
 *
 * Currently a no-op which always reports success.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
{
    /** @todo Add support for fixed file sets later. */
    RT_NOREF(pHandle);
    RT_NOREF(hIoQueueProv);
    return VINF_SUCCESS;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare}
 *
 * Fills in the submission queue entry at the locally tracked tail position
 * for a read, write or sync request and advances the local tail.  Nothing is
 * published to the shared ring here; that is done by the commit callback.
 *
 * Returns VINF_SUCCESS on success and VERR_INVALID_PARAMETER for an unknown
 * operation, in which case the local tail is left unchanged.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
                                                                uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
                                                                void *pvUser)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    RT_NOREF(fReqFlags);

    /* The SQE array and the I/O vector array are indexed by the same slot number. */
    uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
    PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
    struct iovec *pIoVec = &pThis->paIoVecs[idx];

    /* Every request gets a single-segment I/O vector describing the caller's buffer. */
    pIoVec->iov_base = pvBuf;
    pIoVec->iov_len  = cbBuf;

    pSqe->u8Flags         = 0;
    pSqe->u16IoPrio       = 0;
    pSqe->i32Fd           = (int32_t)RTFileToNative(pHandle->u.hFile);
    pSqe->u64OffStart     = off;
    pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
    pSqe->u32BufIoVecSz   = 1;
    pSqe->u64User         = (uint64_t)(uintptr_t)pvUser;

    switch (enmOp)
    {
        case RTIOQUEUEOP_READ:
            pSqe->u8Opc               = LNX_IOURING_OPC_READV;
            pSqe->uOpc.u32KrnlRwFlags = 0;
            break;
        case RTIOQUEUEOP_WRITE:
            pSqe->u8Opc               = LNX_IOURING_OPC_WRITEV;
            pSqe->uOpc.u32KrnlRwFlags = 0;
            break;
        case RTIOQUEUEOP_SYNC:
            pSqe->u8Opc              = LNX_IOURING_OPC_FSYNC;
            pSqe->uOpc.u32FsyncFlags = 0;
            /*
             * A sync request carries no data buffer.  The io_uring fsync
             * opcode expects the address field to be zero (the kernel rejects
             * the request otherwise) and interprets the length field as the
             * size of the range to sync, so neither may carry the I/O vector
             * values set up above.
             */
            pSqe->u64AddrBufIoVec    = 0;
            pSqe->u32BufIoVecSz      = 0;
            break;
        default:
            AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
                                  VERR_INVALID_PARAMETER);
    }

    /* Make the ring's index array entry point at the SQE just filled in, then account for it. */
    pThis->Sq.paidxSqes[idx] = idx;
    pThis->idxSqTail++;
    pThis->cSqesToCommit++;
    return VINF_SUCCESS;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit}
 *
 * Writes the locally tracked submission queue tail to the shared ring and
 * passes the number of pending requests to rtIoQueueLnxIoURingEnter().  The
 * pending counter is reported back and reset only when that call succeeds.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;

    /* Publish the new tail index, fenced on both sides. */
    ASMWriteFence();
    ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
    ASMWriteFence();

    int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
    if (!RT_SUCCESS(rc))
        return rc; /* Leave the pending count untouched on failure. */

    *pcReqsCommitted     = pThis->cSqesToCommit;
    pThis->cSqesToCommit = 0;
    return rc;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait}
 *
 * Collects completion events into @a paCEvt, blocking on the event file
 * descriptor until at least @a cMinWait events were gathered, an error
 * occurred or the wait was interrupted through the wakeup callback.  The
 * number of events gathered is always stored in @a pcCEvt.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
    RT_NOREF(fFlags);

    /*
     * Harvest whatever is already sitting in the completion queue; this may
     * spare us a context switch.
     */
    uint32_t cCollected = 0;
    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCollected);

    int rc = VINF_SUCCESS;
    while (   RT_SUCCESS(rc)
           && cCollected < cMinWait)
    {
        /*
         * Block in read() on the event file descriptor.  It returns when
         * completion events are pending or when somebody woke us up externally.
         */
        uint64_t uEvtCnt = 0; /**< The counter value returned upon a successful read(). */
        ssize_t cbRead = read(pThis->iFdEvt, &uEvtCnt, sizeof(uEvtCnt));
        if (cbRead == -1)
        {
            rc = RTErrConvertFromErrno(errno);
            continue;
        }
        if (cbRead != (ssize_t)sizeof(uEvtCnt))
        {
            AssertMsgFailed(("Unexpected read() -> 0\n"));
            continue;
        }

        /* Append the newly completed events behind the ones gathered so far. */
        uint32_t cNew = 0;
        rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCollected], cCEvt - cCollected, &cNew);
        cCollected += cNew;

        /* Clear the external interrupt flag and report if it was set. */
        if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
            rc = VERR_INTERRUPTED;
    }

    *pcCEvt = cCollected;
    return rc;
}


/**
 * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup}
 *
 * Sets the external interrupt flag and, if it was not set before, writes to
 * the event file descriptor the waiter reads from.
 */
static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
{
    PRTIOQUEUEPROVINT pThis = hIoQueueProv;

    /* The flag was already set, so no additional write is issued. */
    if (ASMAtomicXchgBool(&pThis->fExtIntr, true))
        return VINF_SUCCESS;

    const uint64_t uIncrement = 1;
    ssize_t cbWritten = write(pThis->iFdEvt, &uIncrement, sizeof(uIncrement));
    Assert(cbWritten == -1 || cbWritten == sizeof(uIncrement));

    return cbWritten == -1 ? RTErrConvertFromErrno(errno) : VINF_SUCCESS;
}


/**
 * Async file I/O queue provider virtual method table.
 *
 * Wires the io_uring based callbacks of this file into the generic I/O queue
 * provider interface for file handles (RTHANDLETYPE_FILE).  The initializers
 * are positional, each one is preceded by the name of the member it fills.
 * No pfnReqPrepareSg callback is provided (NULL).
 */
RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
{
    /** uVersion */
    RTIOQUEUEPROVVTABLE_VERSION,
    /** pszId */
    "LnxIoURingFile",
    /** cbIoQueueProv */
    sizeof(RTIOQUEUEPROVINT),
    /** enmHnd */
    RTHANDLETYPE_FILE,
    /** fFlags */
    0,
    /** pfnIsSupported */
    rtIoQueueLnxIoURingFileProv_IsSupported,
    /** pfnQueueInit  */
    rtIoQueueLnxIoURingFileProv_QueueInit,
    /** pfnQueueDestroy */
    rtIoQueueLnxIoURingFileProv_QueueDestroy,
    /** pfnHandleRegister */
    rtIoQueueLnxIoURingFileProv_HandleRegister,
    /** pfnHandleDeregister */
    rtIoQueueLnxIoURingFileProv_HandleDeregister,
    /** pfnReqPrepare */
    rtIoQueueLnxIoURingFileProv_ReqPrepare,
    /** pfnReqPrepareSg */
    NULL,
    /** pfnCommit */
    rtIoQueueLnxIoURingFileProv_Commit,
    /** pfnEvtWait */
    rtIoQueueLnxIoURingFileProv_EvtWait,
    /** pfnEvtWaitWakeup */
    rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
    /** uEndMarker */
    RTIOQUEUEPROVVTABLE_VERSION
};