hwcontext_vulkan: rewrite queue picking system for the new API

This allows us to support different video ops on different queues,
as well as any other arbitrary queues we need.
This commit is contained in:
Lynne 2024-07-16 17:14:42 +02:00
parent bedfabc437
commit 8790a30882
No known key found for this signature in database
GPG Key ID: A2FEA5F03F034464

View File

@ -1028,16 +1028,22 @@ end:
} }
/* Picks the least used qf with the fewest unneeded flags, or -1 if none found */ /* Picks the least used qf with the fewest unneeded flags, or -1 if none found */
static inline int pick_queue_family(VkQueueFamilyProperties *qf, uint32_t num_qf, static inline int pick_queue_family(VkQueueFamilyProperties2 *qf, uint32_t num_qf,
VkQueueFlagBits flags) VkQueueFlagBits flags)
{ {
int index = -1; int index = -1;
uint32_t min_score = UINT32_MAX; uint32_t min_score = UINT32_MAX;
for (int i = 0; i < num_qf; i++) { for (int i = 0; i < num_qf; i++) {
const VkQueueFlagBits qflags = qf[i].queueFlags; VkQueueFlagBits qflags = qf[i].queueFamilyProperties.queueFlags;
/* Per the spec, reporting transfer caps is optional for these 2 types */
if ((flags & VK_QUEUE_TRANSFER_BIT) &&
(qflags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)))
qflags |= VK_QUEUE_TRANSFER_BIT;
if (qflags & flags) { if (qflags & flags) {
uint32_t score = av_popcount(qflags) + qf[i].timestampValidBits; uint32_t score = av_popcount(qflags) + qf[i].queueFamilyProperties.timestampValidBits;
if (score < min_score) { if (score < min_score) {
index = i; index = i;
min_score = score; min_score = score;
@ -1046,7 +1052,36 @@ static inline int pick_queue_family(VkQueueFamilyProperties *qf, uint32_t num_qf
} }
if (index > -1) if (index > -1)
qf[index].timestampValidBits++; qf[index].queueFamilyProperties.timestampValidBits++;
return index;
}
static inline int pick_video_queue_family(VkQueueFamilyProperties2 *qf,
VkQueueFamilyVideoPropertiesKHR *qf_vid, uint32_t num_qf,
VkVideoCodecOperationFlagBitsKHR flags)
{
int index = -1;
uint32_t min_score = UINT32_MAX;
for (int i = 0; i < num_qf; i++) {
const VkQueueFlagBits qflags = qf[i].queueFamilyProperties.queueFlags;
const VkQueueFlagBits vflags = qf_vid[i].videoCodecOperations;
if (!(qflags & (VK_QUEUE_VIDEO_ENCODE_BIT_KHR | VK_QUEUE_VIDEO_DECODE_BIT_KHR)))
continue;
if (vflags & flags) {
uint32_t score = av_popcount(vflags) + qf[i].queueFamilyProperties.timestampValidBits;
if (score < min_score) {
index = i;
min_score = score;
}
}
}
if (index > -1)
qf[index].queueFamilyProperties.timestampValidBits++;
return index; return index;
} }
@ -1054,12 +1089,12 @@ static inline int pick_queue_family(VkQueueFamilyProperties *qf, uint32_t num_qf
static int setup_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd) static int setup_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd)
{ {
uint32_t num; uint32_t num;
float *weights;
VkQueueFamilyProperties *qf = NULL;
VulkanDevicePriv *p = ctx->hwctx; VulkanDevicePriv *p = ctx->hwctx;
AVVulkanDeviceContext *hwctx = &p->p; AVVulkanDeviceContext *hwctx = &p->p;
FFVulkanFunctions *vk = &p->vkctx.vkfn; FFVulkanFunctions *vk = &p->vkctx.vkfn;
int graph_index, comp_index, tx_index, enc_index, dec_index;
VkQueueFamilyProperties2 *qf = NULL;
VkQueueFamilyVideoPropertiesKHR *qf_vid = NULL;
/* First get the number of queue families */ /* First get the number of queue families */
vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &num, NULL); vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &num, NULL);
@ -1069,118 +1104,155 @@ static int setup_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd)
} }
/* Then allocate memory */ /* Then allocate memory */
qf = av_malloc_array(num, sizeof(VkQueueFamilyProperties)); qf = av_malloc_array(num, sizeof(VkQueueFamilyProperties2));
if (!qf) if (!qf)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
qf_vid = av_malloc_array(num, sizeof(VkQueueFamilyVideoPropertiesKHR));
if (!qf_vid)
return AVERROR(ENOMEM);
for (uint32_t i = 0; i < num; i++) {
qf_vid[i] = (VkQueueFamilyVideoPropertiesKHR) {
.sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_VIDEO_PROPERTIES_KHR,
};
qf[i] = (VkQueueFamilyProperties2) {
.sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2,
.pNext = &qf_vid[i],
};
}
/* Finally retrieve the queue families */ /* Finally retrieve the queue families */
vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &num, qf); vk->GetPhysicalDeviceQueueFamilyProperties2(hwctx->phys_dev, &num, qf);
av_log(ctx, AV_LOG_VERBOSE, "Queue families:\n"); av_log(ctx, AV_LOG_VERBOSE, "Queue families:\n");
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
av_log(ctx, AV_LOG_VERBOSE, " %i:%s%s%s%s%s%s%s (queues: %i)\n", i, av_log(ctx, AV_LOG_VERBOSE, " %i:%s%s%s%s%s%s%s (queues: %i)\n", i,
((qf[i].queueFlags) & VK_QUEUE_GRAPHICS_BIT) ? " graphics" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_GRAPHICS_BIT) ? " graphics" : "",
((qf[i].queueFlags) & VK_QUEUE_COMPUTE_BIT) ? " compute" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_COMPUTE_BIT) ? " compute" : "",
((qf[i].queueFlags) & VK_QUEUE_TRANSFER_BIT) ? " transfer" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_TRANSFER_BIT) ? " transfer" : "",
((qf[i].queueFlags) & VK_QUEUE_VIDEO_ENCODE_BIT_KHR) ? " encode" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_VIDEO_ENCODE_BIT_KHR) ? " encode" : "",
((qf[i].queueFlags) & VK_QUEUE_VIDEO_DECODE_BIT_KHR) ? " decode" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_VIDEO_DECODE_BIT_KHR) ? " decode" : "",
((qf[i].queueFlags) & VK_QUEUE_SPARSE_BINDING_BIT) ? " sparse" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_SPARSE_BINDING_BIT) ? " sparse" : "",
((qf[i].queueFlags) & VK_QUEUE_PROTECTED_BIT) ? " protected" : "", ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_PROTECTED_BIT) ? " protected" : "",
qf[i].queueCount); qf[i].queueFamilyProperties.queueCount);
/* We use this field to keep a score of how many times we've used that /* We use this field to keep a score of how many times we've used that
* queue family in order to make better choices. */ * queue family in order to make better choices. */
qf[i].timestampValidBits = 0; qf[i].queueFamilyProperties.timestampValidBits = 0;
} }
hwctx->nb_qf = 0;
/* Pick each queue family to use */ /* Pick each queue family to use */
graph_index = pick_queue_family(qf, num, VK_QUEUE_GRAPHICS_BIT); #define PICK_QF(type, vid_op) \
comp_index = pick_queue_family(qf, num, VK_QUEUE_COMPUTE_BIT); do { \
tx_index = pick_queue_family(qf, num, VK_QUEUE_TRANSFER_BIT); uint32_t i; \
enc_index = pick_queue_family(qf, num, VK_QUEUE_VIDEO_ENCODE_BIT_KHR); uint32_t idx; \
dec_index = pick_queue_family(qf, num, VK_QUEUE_VIDEO_DECODE_BIT_KHR); \
if (vid_op) \
idx = pick_video_queue_family(qf, qf_vid, num, vid_op); \
else \
idx = pick_queue_family(qf, num, type); \
\
if (idx == -1) \
continue; \
\
for (i = 0; i < hwctx->nb_qf; i++) { \
if (hwctx->qf[i].idx == idx) { \
hwctx->qf[i].flags |= type; \
hwctx->qf[i].video_caps |= vid_op; \
break; \
} \
} \
if (i == hwctx->nb_qf) { \
hwctx->qf[i].idx = idx; \
hwctx->qf[i].num = qf[idx].queueFamilyProperties.queueCount; \
hwctx->qf[i].flags = type; \
hwctx->qf[i].video_caps = vid_op; \
hwctx->nb_qf++; \
} \
} while (0)
/* Signalling the transfer capabilities on a queue family is optional */ PICK_QF(VK_QUEUE_GRAPHICS_BIT, VK_VIDEO_CODEC_OPERATION_NONE_KHR);
if (tx_index < 0) { PICK_QF(VK_QUEUE_COMPUTE_BIT, VK_VIDEO_CODEC_OPERATION_NONE_KHR);
tx_index = pick_queue_family(qf, num, VK_QUEUE_COMPUTE_BIT); PICK_QF(VK_QUEUE_TRANSFER_BIT, VK_VIDEO_CODEC_OPERATION_NONE_KHR);
if (tx_index < 0)
tx_index = pick_queue_family(qf, num, VK_QUEUE_GRAPHICS_BIT); PICK_QF(VK_QUEUE_VIDEO_ENCODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_ENCODE_H264_BIT_KHR);
PICK_QF(VK_QUEUE_VIDEO_DECODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR);
PICK_QF(VK_QUEUE_VIDEO_ENCODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_ENCODE_H265_BIT_KHR);
PICK_QF(VK_QUEUE_VIDEO_DECODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR);
PICK_QF(VK_QUEUE_VIDEO_DECODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_DECODE_AV1_BIT_KHR);
av_free(qf);
av_free(qf_vid);
#undef PICK_QF
cd->pQueueCreateInfos = av_malloc_array(hwctx->nb_qf,
sizeof(VkDeviceQueueCreateInfo));
if (!cd->pQueueCreateInfos)
return AVERROR(ENOMEM);
for (uint32_t i = 0; i < hwctx->nb_qf; i++) {
int dup = 0;
float *weights = NULL;
VkDeviceQueueCreateInfo *pc;
for (uint32_t j = 0; j < cd->queueCreateInfoCount; j++) {
if (hwctx->qf[i].idx == cd->pQueueCreateInfos[j].queueFamilyIndex) {
dup = 1;
break;
}
}
if (dup)
continue;
weights = av_malloc_array(hwctx->qf[i].num, sizeof(float));
if (!weights) {
for (uint32_t j = 0; j < cd->queueCreateInfoCount; j++)
av_free((void *)cd->pQueueCreateInfos[i].pQueuePriorities);
av_free((void *)cd->pQueueCreateInfos);
return AVERROR(ENOMEM);
} }
for (uint32_t j = 0; j < hwctx->qf[i].num; j++)
weights[j] = 1.0;
pc = (VkDeviceQueueCreateInfo *)cd->pQueueCreateInfos;
pc[cd->queueCreateInfoCount++] = (VkDeviceQueueCreateInfo) {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = hwctx->qf[i].idx,
.queueCount = hwctx->qf[i].num,
.pQueuePriorities = weights,
};
}
/* Setup deprecated fields */
hwctx->queue_family_index = -1; hwctx->queue_family_index = -1;
hwctx->queue_family_comp_index = -1; hwctx->queue_family_comp_index = -1;
hwctx->queue_family_tx_index = -1; hwctx->queue_family_tx_index = -1;
hwctx->queue_family_encode_index = -1; hwctx->queue_family_encode_index = -1;
hwctx->queue_family_decode_index = -1; hwctx->queue_family_decode_index = -1;
#define SETUP_QUEUE(qf_idx) \ #define SET_OLD_QF(field, nb_field, type) \
if (qf_idx > -1) { \ do { \
int fidx = qf_idx; \ if (field < 0 && hwctx->qf[i].flags & type) { \
int qc = qf[fidx].queueCount; \ field = hwctx->qf[i].idx; \
VkDeviceQueueCreateInfo *pc; \ nb_field = hwctx->qf[i].num; \
\
if (fidx == graph_index) { \
hwctx->queue_family_index = fidx; \
hwctx->nb_graphics_queues = qc; \
graph_index = -1; \
} \ } \
if (fidx == comp_index) { \ } while (0)
hwctx->queue_family_comp_index = fidx; \
hwctx->nb_comp_queues = qc; \ for (uint32_t i = 0; i < hwctx->nb_qf; i++) {
comp_index = -1; \ SET_OLD_QF(hwctx->queue_family_index, hwctx->nb_graphics_queues, VK_QUEUE_GRAPHICS_BIT);
} \ SET_OLD_QF(hwctx->queue_family_comp_index, hwctx->nb_comp_queues, VK_QUEUE_COMPUTE_BIT);
if (fidx == tx_index) { \ SET_OLD_QF(hwctx->queue_family_tx_index, hwctx->nb_tx_queues, VK_QUEUE_TRANSFER_BIT);
hwctx->queue_family_tx_index = fidx; \ SET_OLD_QF(hwctx->queue_family_encode_index, hwctx->nb_encode_queues, VK_QUEUE_VIDEO_ENCODE_BIT_KHR);
hwctx->nb_tx_queues = qc; \ SET_OLD_QF(hwctx->queue_family_decode_index, hwctx->nb_decode_queues, VK_QUEUE_VIDEO_DECODE_BIT_KHR);
tx_index = -1; \
} \
if (fidx == enc_index) { \
hwctx->queue_family_encode_index = fidx; \
hwctx->nb_encode_queues = qc; \
enc_index = -1; \
} \
if (fidx == dec_index) { \
hwctx->queue_family_decode_index = fidx; \
hwctx->nb_decode_queues = qc; \
dec_index = -1; \
} \
\
pc = av_realloc((void *)cd->pQueueCreateInfos, \
sizeof(*pc) * (cd->queueCreateInfoCount + 1)); \
if (!pc) { \
av_free(qf); \
return AVERROR(ENOMEM); \
} \
cd->pQueueCreateInfos = pc; \
pc = &pc[cd->queueCreateInfoCount]; \
\
weights = av_malloc(qc * sizeof(float)); \
if (!weights) { \
av_free(qf); \
return AVERROR(ENOMEM); \
} \
\
memset(pc, 0, sizeof(*pc)); \
pc->sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; \
pc->queueFamilyIndex = fidx; \
pc->queueCount = qc; \
pc->pQueuePriorities = weights; \
\
for (int i = 0; i < qc; i++) \
weights[i] = 1.0f / qc; \
\
cd->queueCreateInfoCount++; \
} }
SETUP_QUEUE(graph_index) #undef SET_OLD_QF
SETUP_QUEUE(comp_index)
SETUP_QUEUE(tx_index)
SETUP_QUEUE(enc_index)
SETUP_QUEUE(dec_index)
#undef SETUP_QUEUE
av_free(qf);
return 0; return 0;
} }