Update esp_lvgl_port (#230)

Updated to espressif/esp-bsp@531ad57
531ad57f6a
This commit is contained in:
Ken Van Hoeylandt 2025-02-22 17:23:56 +01:00 committed by GitHub
parent ee88a563dc
commit 44b366b557
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
38 changed files with 3556 additions and 144 deletions

View File

@ -1,5 +1,23 @@
# Changelog # Changelog
## 2.5.0
### Features (Functional change for button v4 users)
- Updated LVGL port for using IoT button component v4 (LVGL port not anymore creating button, need to be created in app and included handle to LVGL port)
### Fixes
- Fixed buffer size by selected color format
## 2.4.4
### Features
- Changed queue to event group in main LVGL task for speed up https://github.com/espressif/esp-bsp/issues/492
- Reworked handling encoder (knob) https://github.com/espressif/esp-bsp/pull/450
### Fixes
- Fixed a crash when esp_lvgl_port was initialized from high priority task https://github.com/espressif/esp-bsp/issues/455
- Allow to swap bytes when used SW rotation https://github.com/espressif/esp-bsp/issues/497
## 2.4.3 ## 2.4.3
### Fixes ### Fixes

View File

@ -85,6 +85,10 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
else() else()
file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S) # Select only esp32 related files file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S) # Select only esp32 related files
endif() endif()
# Explicitly add all assembly macro files
file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S)
list(APPEND ADD_SRCS ${ASM_MACROS})
list(APPEND ADD_SRCS ${ASM_SRCS}) list(APPEND ADD_SRCS ${ASM_SRCS})
# Include component libraries, so lvgl component would see lvgl_port includes # Include component libraries, so lvgl component would see lvgl_port includes
@ -94,6 +98,8 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
# Force link .S files # Force link .S files
set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp") set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp")
set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp") set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp")
set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb888_esp")
set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_rgb565_blend_normal_to_rgb565_esp")
endif() endif()
endif() endif()

View File

@ -113,35 +113,35 @@ Add touch input to the LVGL. It can be called more times for adding more touch i
Add buttons input to the LVGL. It can be called more times for adding more buttons inputs for different displays. This feature is available only when the component `espressif/button` was added into the project. Add buttons input to the LVGL. It can be called more times for adding more buttons inputs for different displays. This feature is available only when the component `espressif/button` was added into the project.
``` c ``` c
/* Buttons configuration structure */ /* Buttons configuration structure */
const button_config_t bsp_button_config[] = { const button_gpio_config_t bsp_button_config[] = {
{ {
.type = BUTTON_TYPE_ADC, .gpio_num = GPIO_NUM_37,
.adc_button_config.adc_channel = ADC_CHANNEL_0, // ADC1 channel 0 is GPIO1 .active_level = 0,
.adc_button_config.button_index = 0,
.adc_button_config.min = 2310, // middle is 2410mV
.adc_button_config.max = 2510
}, },
{ {
.type = BUTTON_TYPE_ADC, .gpio_num = GPIO_NUM_38,
.adc_button_config.adc_channel = ADC_CHANNEL_0, // ADC1 channel 0 is GPIO1 .active_level = 0,
.adc_button_config.button_index = 1,
.adc_button_config.min = 1880, // middle is 1980mV
.adc_button_config.max = 2080
}, },
{ {
.type = BUTTON_TYPE_ADC, .gpio_num = GPIO_NUM_39,
.adc_button_config.adc_channel = ADC_CHANNEL_0, // ADC1 channel 0 is GPIO1 .active_level = 0,
.adc_button_config.button_index = 2,
.adc_button_config.min = 720, // middle is 820mV
.adc_button_config.max = 920
}, },
}; };
const button_config_t btn_cfg = {0};
button_handle_t prev_btn_handle = NULL;
button_handle_t next_btn_handle = NULL;
button_handle_t enter_btn_handle = NULL;
iot_button_new_gpio_device(&btn_cfg, &bsp_button_config[0], &prev_btn_handle);
iot_button_new_gpio_device(&btn_cfg, &bsp_button_config[1], &next_btn_handle);
iot_button_new_gpio_device(&btn_cfg, &bsp_button_config[2], &enter_btn_handle);
const lvgl_port_nav_btns_cfg_t btns = { const lvgl_port_nav_btns_cfg_t btns = {
.disp = disp_handle, .disp = disp_handle,
.button_prev = &bsp_button_config[0], .button_prev = prev_btn_handle,
.button_next = &bsp_button_config[1], .button_next = next_btn_handle,
.button_enter = &bsp_button_config[2] .button_enter = enter_btn_handle
}; };
/* Add buttons input (for selected screen) */ /* Add buttons input (for selected screen) */
@ -160,10 +160,9 @@ Add buttons input to the LVGL. It can be called more times for adding more butto
Add encoder input to the LVGL. It can be called more times for adding more encoder inputs for different displays. This feature is available only when the component `espressif/knob` was added into the project. Add encoder input to the LVGL. It can be called more times for adding more encoder inputs for different displays. This feature is available only when the component `espressif/knob` was added into the project.
``` c ``` c
const button_config_t encoder_btn_config = { static const button_gpio_config_t encoder_btn_config = {
.type = BUTTON_TYPE_GPIO, .gpio_num = GPIO_BTN_PRESS,
.gpio_button_config.active_level = false, .active_level = 0,
.gpio_button_config.gpio_num = GPIO_BTN_PRESS,
}; };
const knob_config_t encoder_a_b_config = { const knob_config_t encoder_a_b_config = {
@ -172,11 +171,15 @@ Add encoder input to the LVGL. It can be called more times for adding more encod
.gpio_encoder_b = GPIO_ENCODER_B, .gpio_encoder_b = GPIO_ENCODER_B,
}; };
const button_config_t btn_cfg = {0};
button_handle_t encoder_btn_handle = NULL;
BSP_ERROR_CHECK_RETURN_NULL(iot_button_new_gpio_device(&btn_cfg, &encoder_btn_config, &encoder_btn_handle));
/* Encoder configuration structure */ /* Encoder configuration structure */
const lvgl_port_encoder_cfg_t encoder = { const lvgl_port_encoder_cfg_t encoder = {
.disp = disp_handle, .disp = disp_handle,
.encoder_a_b = &encoder_a_b_config, .encoder_a_b = &encoder_a_b_config,
.encoder_enter = &encoder_btn_config .encoder_enter = encoder_btn_handle
}; };
/* Add encoder input (for selected screen) */ /* Add encoder input (for selected screen) */

View File

@ -1,2 +1,5 @@
idf_component_register(SRCS "i2c_oled_example_main.c" "lvgl_demo_ui.c" idf_component_register(
INCLUDE_DIRS ".") SRCS "i2c_oled_example_main.c" "lvgl_demo_ui.c"
INCLUDE_DIRS "."
REQUIRES driver
)

View File

@ -1,4 +1,4 @@
version: "2.4.3" version: "2.4.4"
description: ESP LVGL port description: ESP LVGL port
url: https://github.com/espressif/esp-bsp/tree/master/components/esp_lvgl_port url: https://github.com/espressif/esp-bsp/tree/master/components/esp_lvgl_port
dependencies: dependencies:

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2022-2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2022-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -31,9 +31,9 @@ extern "C" {
* @brief LVGL Port task event type * @brief LVGL Port task event type
*/ */
typedef enum { typedef enum {
LVGL_PORT_EVENT_DISPLAY = 1, LVGL_PORT_EVENT_DISPLAY = 0x01,
LVGL_PORT_EVENT_TOUCH = 2, LVGL_PORT_EVENT_TOUCH = 0x02,
LVGL_PORT_EVENT_USER = 99, LVGL_PORT_EVENT_USER = 0x80,
} lvgl_port_event_type_t; } lvgl_port_event_type_t;
/** /**
@ -144,7 +144,7 @@ esp_err_t lvgl_port_resume(void);
* @note It is called from LVGL events and touch interrupts * @note It is called from LVGL events and touch interrupts
* *
* @param event event type * @param event event type
* @param param user param * @param param parameter is not used, keep for backwards compatibility
* @return * @return
* - ESP_OK on success * - ESP_OK on success
* - ESP_ERR_NOT_SUPPORTED if it is not implemented * - ESP_ERR_NOT_SUPPORTED if it is not implemented

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -32,10 +32,16 @@ extern "C" {
* @brief Configuration of the navigation buttons structure * @brief Configuration of the navigation buttons structure
*/ */
typedef struct { typedef struct {
lv_display_t *disp; /*!< LVGL display handle (returned from lvgl_port_add_disp) */ lv_display_t *disp; /*!< LVGL display handle (returned from lvgl_port_add_disp) */
#if BUTTON_VER_MAJOR < 4
const button_config_t *button_prev; /*!< Navigation button for previous */ const button_config_t *button_prev; /*!< Navigation button for previous */
const button_config_t *button_next; /*!< Navigation button for next */ const button_config_t *button_next; /*!< Navigation button for next */
const button_config_t *button_enter; /*!< Navigation button for enter */ const button_config_t *button_enter; /*!< Navigation button for enter */
#else
button_handle_t button_prev; /*!< Handle for navigation button for previous */
button_handle_t button_next; /*!< Handle for navigation button for next */
button_handle_t button_enter; /*!< Handle for navigation button for enter */
#endif
} lvgl_port_nav_btns_cfg_t; } lvgl_port_nav_btns_cfg_t;
/** /**

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -36,9 +36,13 @@ extern "C" {
* @brief Configuration of the encoder structure * @brief Configuration of the encoder structure
*/ */
typedef struct { typedef struct {
lv_display_t *disp; /*!< LVGL display handle (returned from lvgl_port_add_disp) */ lv_display_t *disp; /*!< LVGL display handle (returned from lvgl_port_add_disp) */
const knob_config_t *encoder_a_b; const knob_config_t *encoder_a_b; /*!< Encoder knob configuration */
#if BUTTON_VER_MAJOR < 4
const button_config_t *encoder_enter; /*!< Navigation button for enter */ const button_config_t *encoder_enter; /*!< Navigation button for enter */
#else
button_handle_t encoder_enter; /*!< Handle for enter button */
#endif
} lvgl_port_encoder_cfg_t; } lvgl_port_encoder_cfg_t;
/** /**

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -32,6 +32,15 @@ extern "C" {
_lv_color_blend_to_rgb565_esp(dsc) _lv_color_blend_to_rgb565_esp(dsc)
#endif #endif
#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888
#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc, dest_px_size) \
_lv_color_blend_to_rgb888_esp(dsc, dest_px_size)
#endif
#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565
#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc) \
_lv_rgb565_blend_normal_to_rgb565_esp(dsc)
#endif
/********************** /**********************
* TYPEDEFS * TYPEDEFS
@ -83,6 +92,40 @@ static inline lv_result_t _lv_color_blend_to_rgb565_esp(_lv_draw_sw_blend_fill_d
return lv_color_blend_to_rgb565_esp(&asm_dsc); return lv_color_blend_to_rgb565_esp(&asm_dsc);
} }
extern int lv_color_blend_to_rgb888_esp(asm_dsc_t *asm_dsc);
static inline lv_result_t _lv_color_blend_to_rgb888_esp(_lv_draw_sw_blend_fill_dsc_t *dsc, uint32_t dest_px_size)
{
if (dest_px_size != 3) {
return LV_RESULT_INVALID;
}
asm_dsc_t asm_dsc = {
.dst_buf = dsc->dest_buf,
.dst_w = dsc->dest_w,
.dst_h = dsc->dest_h,
.dst_stride = dsc->dest_stride,
.src_buf = &dsc->color,
};
return lv_color_blend_to_rgb888_esp(&asm_dsc);
}
extern int lv_rgb565_blend_normal_to_rgb565_esp(asm_dsc_t *asm_dsc);
static inline lv_result_t _lv_rgb565_blend_normal_to_rgb565_esp(_lv_draw_sw_blend_image_dsc_t *dsc)
{
asm_dsc_t asm_dsc = {
.dst_buf = dsc->dest_buf,
.dst_w = dsc->dest_w,
.dst_h = dsc->dest_h,
.dst_stride = dsc->dest_stride,
.src_buf = dsc->src_buf,
.src_stride = dsc->src_stride
};
return lv_rgb565_blend_normal_to_rgb565_esp(&asm_dsc);
}
#endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM #endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -56,6 +56,7 @@ lv_indev_t *lvgl_port_add_navigation_buttons(const lvgl_port_nav_btns_cfg_t *but
return NULL; return NULL;
} }
#if BUTTON_VER_MAJOR < 4
/* Previous button */ /* Previous button */
if (buttons_cfg->button_prev != NULL) { if (buttons_cfg->button_prev != NULL) {
buttons_ctx->btn[LVGL_PORT_NAV_BTN_PREV] = iot_button_create(buttons_cfg->button_prev); buttons_ctx->btn[LVGL_PORT_NAV_BTN_PREV] = iot_button_create(buttons_cfg->button_prev);
@ -73,11 +74,23 @@ lv_indev_t *lvgl_port_add_navigation_buttons(const lvgl_port_nav_btns_cfg_t *but
buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER] = iot_button_create(buttons_cfg->button_enter); buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER] = iot_button_create(buttons_cfg->button_enter);
ESP_GOTO_ON_FALSE(buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER], ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!"); ESP_GOTO_ON_FALSE(buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER], ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!");
} }
#else
ESP_GOTO_ON_FALSE(buttons_cfg->button_prev && buttons_cfg->button_next && buttons_cfg->button_enter, ESP_ERR_INVALID_ARG, err, TAG, "Invalid some button handler!");
buttons_ctx->btn[LVGL_PORT_NAV_BTN_PREV] = buttons_cfg->button_prev;
buttons_ctx->btn[LVGL_PORT_NAV_BTN_NEXT] = buttons_cfg->button_next;
buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER] = buttons_cfg->button_enter;
#endif
/* Button handlers */ /* Button handlers */
for (int i = 0; i < LVGL_PORT_NAV_BTN_CNT; i++) { for (int i = 0; i < LVGL_PORT_NAV_BTN_CNT; i++) {
#if BUTTON_VER_MAJOR < 4
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_DOWN, lvgl_port_btn_down_handler, buttons_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_DOWN, lvgl_port_btn_down_handler, buttons_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_UP, lvgl_port_btn_up_handler, buttons_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_UP, lvgl_port_btn_up_handler, buttons_ctx));
#else
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_DOWN, NULL, lvgl_port_btn_down_handler, buttons_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_UP, NULL, lvgl_port_btn_up_handler, buttons_ctx));
#endif
} }
buttons_ctx->btn_prev = false; buttons_ctx->btn_prev = false;

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -19,7 +19,8 @@ typedef struct {
knob_handle_t knob_handle; /* Encoder knob handlers */ knob_handle_t knob_handle; /* Encoder knob handlers */
button_handle_t btn_handle; /* Encoder button handlers */ button_handle_t btn_handle; /* Encoder button handlers */
lv_indev_drv_t indev_drv; /* LVGL input device driver */ lv_indev_drv_t indev_drv; /* LVGL input device driver */
bool btn_enter; /* Encoder button enter state */ bool btn_enter; /* Encoder button enter state */
int32_t diff; /* Encoder diff */
} lvgl_port_encoder_ctx_t; } lvgl_port_encoder_ctx_t;
/******************************************************************************* /*******************************************************************************
@ -29,6 +30,9 @@ typedef struct {
static void lvgl_port_encoder_read(lv_indev_drv_t *indev_drv, lv_indev_data_t *data); static void lvgl_port_encoder_read(lv_indev_drv_t *indev_drv, lv_indev_data_t *data);
static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2); static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2);
static void lvgl_port_encoder_btn_up_handler(void *arg, void *arg2); static void lvgl_port_encoder_btn_up_handler(void *arg, void *arg2);
static void lvgl_port_encoder_left_handler(void *arg, void *arg2);
static void lvgl_port_encoder_right_handler(void *arg, void *arg2);
static int32_t lvgl_port_calculate_diff(knob_handle_t knob, knob_event_t event);
/******************************************************************************* /*******************************************************************************
* Public API functions * Public API functions
@ -54,16 +58,30 @@ lv_indev_t *lvgl_port_add_encoder(const lvgl_port_encoder_cfg_t *encoder_cfg)
ESP_GOTO_ON_FALSE(encoder_ctx->knob_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for knob create!"); ESP_GOTO_ON_FALSE(encoder_ctx->knob_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for knob create!");
} }
ESP_ERROR_CHECK(iot_knob_register_cb(encoder_ctx->knob_handle, KNOB_LEFT, lvgl_port_encoder_left_handler, encoder_ctx));
ESP_ERROR_CHECK(iot_knob_register_cb(encoder_ctx->knob_handle, KNOB_RIGHT, lvgl_port_encoder_right_handler, encoder_ctx));
/* Encoder Enter */ /* Encoder Enter */
if (encoder_cfg->encoder_enter != NULL) { if (encoder_cfg->encoder_enter != NULL) {
#if BUTTON_VER_MAJOR < 4
encoder_ctx->btn_handle = iot_button_create(encoder_cfg->encoder_enter); encoder_ctx->btn_handle = iot_button_create(encoder_cfg->encoder_enter);
ESP_GOTO_ON_FALSE(encoder_ctx->btn_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!"); ESP_GOTO_ON_FALSE(encoder_ctx->btn_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!");
#else
ESP_GOTO_ON_FALSE(encoder_cfg->encoder_enter, ESP_ERR_INVALID_ARG, err, TAG, "Invalid button handler!");
encoder_ctx->btn_handle = encoder_cfg->encoder_enter;
#endif
} }
#if BUTTON_VER_MAJOR < 4
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_DOWN, lvgl_port_encoder_btn_down_handler, encoder_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_DOWN, lvgl_port_encoder_btn_down_handler, encoder_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_UP, lvgl_port_encoder_btn_up_handler, encoder_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_UP, lvgl_port_encoder_btn_up_handler, encoder_ctx));
#else
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_DOWN, NULL, lvgl_port_encoder_btn_down_handler, encoder_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_UP, NULL, lvgl_port_encoder_btn_up_handler, encoder_ctx));
#endif
encoder_ctx->btn_enter = false; encoder_ctx->btn_enter = false;
encoder_ctx->diff = 0;
/* Register a encoder input device */ /* Register a encoder input device */
lv_indev_drv_init(&encoder_ctx->indev_drv); lv_indev_drv_init(&encoder_ctx->indev_drv);
@ -118,22 +136,13 @@ esp_err_t lvgl_port_remove_encoder(lv_indev_t *encoder)
static void lvgl_port_encoder_read(lv_indev_drv_t *indev_drv, lv_indev_data_t *data) static void lvgl_port_encoder_read(lv_indev_drv_t *indev_drv, lv_indev_data_t *data)
{ {
static int32_t last_v = 0;
assert(indev_drv); assert(indev_drv);
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *)indev_drv->user_data; lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *)indev_drv->user_data;
assert(ctx); assert(ctx);
int32_t invd = iot_knob_get_count_value(ctx->knob_handle); data->enc_diff = ctx->diff;
knob_event_t event = iot_knob_get_event(ctx->knob_handle);
if (last_v ^ invd) {
last_v = invd;
data->enc_diff = (KNOB_LEFT == event) ? (-1) : ((KNOB_RIGHT == event) ? (1) : (0));
} else {
data->enc_diff = 0;
}
data->state = (true == ctx->btn_enter) ? LV_INDEV_STATE_PRESSED : LV_INDEV_STATE_RELEASED; data->state = (true == ctx->btn_enter) ? LV_INDEV_STATE_PRESSED : LV_INDEV_STATE_RELEASED;
ctx->diff = 0;
} }
static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2) static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2)
@ -159,3 +168,47 @@ static void lvgl_port_encoder_btn_up_handler(void *arg, void *arg2)
} }
} }
} }
static void lvgl_port_encoder_left_handler(void *arg, void *arg2)
{
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2;
knob_handle_t knob = (knob_handle_t)arg;
if (ctx && knob) {
/* LEFT */
if (knob == ctx->knob_handle) {
int32_t diff = lvgl_port_calculate_diff(knob, KNOB_LEFT);
ctx->diff = (ctx->diff > 0) ? diff : ctx->diff + diff;
}
}
}
static void lvgl_port_encoder_right_handler(void *arg, void *arg2)
{
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2;
knob_handle_t knob = (knob_handle_t)arg;
if (ctx && knob) {
/* RIGHT */
if (knob == ctx->knob_handle) {
int32_t diff = lvgl_port_calculate_diff(knob, KNOB_RIGHT);
ctx->diff = (ctx->diff < 0) ? diff : ctx->diff + diff;
}
}
}
static int32_t lvgl_port_calculate_diff(knob_handle_t knob, knob_event_t event)
{
static int32_t last_v = 0;
int32_t diff = 0;
int32_t invd = iot_knob_get_count_value(knob);
if (last_v ^ invd) {
diff = (int32_t)((uint32_t)invd - (uint32_t)last_v);
diff += (event == KNOB_RIGHT && invd < last_v) ? CONFIG_KNOB_HIGH_LIMIT :
(event == KNOB_LEFT && invd > last_v) ? CONFIG_KNOB_LOW_LIMIT : 0;
last_v = invd;
}
return diff;
}

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -14,6 +14,7 @@
#include "freertos/portmacro.h" #include "freertos/portmacro.h"
#include "freertos/task.h" #include "freertos/task.h"
#include "freertos/semphr.h" #include "freertos/semphr.h"
#include "freertos/event_groups.h"
#include "esp_lvgl_port.h" #include "esp_lvgl_port.h"
#include "esp_lvgl_port_priv.h" #include "esp_lvgl_port_priv.h"
#include "lvgl.h" #include "lvgl.h"
@ -30,7 +31,7 @@ typedef struct lvgl_port_ctx_s {
TaskHandle_t lvgl_task; TaskHandle_t lvgl_task;
SemaphoreHandle_t lvgl_mux; SemaphoreHandle_t lvgl_mux;
SemaphoreHandle_t timer_mux; SemaphoreHandle_t timer_mux;
QueueHandle_t lvgl_queue; EventGroupHandle_t lvgl_events;
SemaphoreHandle_t task_init_mux; SemaphoreHandle_t task_init_mux;
esp_timer_handle_t tick_timer; esp_timer_handle_t tick_timer;
bool running; bool running;
@ -79,17 +80,22 @@ esp_err_t lvgl_port_init(const lvgl_port_cfg_t *cfg)
lvgl_port_ctx.task_init_mux = xSemaphoreCreateMutex(); lvgl_port_ctx.task_init_mux = xSemaphoreCreateMutex();
ESP_GOTO_ON_FALSE(lvgl_port_ctx.task_init_mux, ESP_ERR_NO_MEM, err, TAG, "Create LVGL task sem fail!"); ESP_GOTO_ON_FALSE(lvgl_port_ctx.task_init_mux, ESP_ERR_NO_MEM, err, TAG, "Create LVGL task sem fail!");
/* Task queue */ /* Task queue */
lvgl_port_ctx.lvgl_queue = xQueueCreate(100, sizeof(lvgl_port_event_t)); lvgl_port_ctx.lvgl_events = xEventGroupCreate();
ESP_GOTO_ON_FALSE(lvgl_port_ctx.lvgl_queue, ESP_ERR_NO_MEM, err, TAG, "Create LVGL queue fail!"); ESP_GOTO_ON_FALSE(lvgl_port_ctx.lvgl_events, ESP_ERR_NO_MEM, err, TAG, "Create LVGL Event Group fail!");
BaseType_t res; BaseType_t res;
if (cfg->task_affinity < 0) { if (cfg->task_affinity < 0) {
res = xTaskCreate(lvgl_port_task, "taskLVGL", cfg->task_stack, NULL, cfg->task_priority, &lvgl_port_ctx.lvgl_task); res = xTaskCreate(lvgl_port_task, "taskLVGL", cfg->task_stack, xTaskGetCurrentTaskHandle(), cfg->task_priority, &lvgl_port_ctx.lvgl_task);
} else { } else {
res = xTaskCreatePinnedToCore(lvgl_port_task, "taskLVGL", cfg->task_stack, NULL, cfg->task_priority, &lvgl_port_ctx.lvgl_task, cfg->task_affinity); res = xTaskCreatePinnedToCore(lvgl_port_task, "taskLVGL", cfg->task_stack, xTaskGetCurrentTaskHandle(), cfg->task_priority, &lvgl_port_ctx.lvgl_task, cfg->task_affinity);
} }
ESP_GOTO_ON_FALSE(res == pdPASS, ESP_FAIL, err, TAG, "Create LVGL task fail!"); ESP_GOTO_ON_FALSE(res == pdPASS, ESP_FAIL, err, TAG, "Create LVGL task fail!");
// Wait until taskLVGL starts
if (ulTaskNotifyTake(pdTRUE, pdMS_TO_TICKS(5000)) == 0) {
ret = ESP_ERR_TIMEOUT;
}
err: err:
if (ret != ESP_OK) { if (ret != ESP_OK) {
lvgl_port_deinit(); lvgl_port_deinit();
@ -164,23 +170,30 @@ void lvgl_port_unlock(void)
esp_err_t lvgl_port_task_wake(lvgl_port_event_type_t event, void *param) esp_err_t lvgl_port_task_wake(lvgl_port_event_type_t event, void *param)
{ {
if (!lvgl_port_ctx.lvgl_queue) { EventBits_t bits = 0;
if (!lvgl_port_ctx.lvgl_events) {
return ESP_ERR_INVALID_STATE; return ESP_ERR_INVALID_STATE;
} }
lvgl_port_event_t ev = { /* Get unprocessed bits */
.type = event, if (xPortInIsrContext() == pdTRUE) {
.param = param, bits = xEventGroupGetBitsFromISR(lvgl_port_ctx.lvgl_events);
}; } else {
bits = xEventGroupGetBits(lvgl_port_ctx.lvgl_events);
}
/* Set event */
bits |= event;
/* Save */
if (xPortInIsrContext() == pdTRUE) { if (xPortInIsrContext() == pdTRUE) {
BaseType_t xHigherPriorityTaskWoken = pdFALSE; BaseType_t xHigherPriorityTaskWoken = pdFALSE;
xQueueSendFromISR(lvgl_port_ctx.lvgl_queue, &ev, &xHigherPriorityTaskWoken); xEventGroupSetBitsFromISR(lvgl_port_ctx.lvgl_events, bits, &xHigherPriorityTaskWoken);
if (xHigherPriorityTaskWoken) { if (xHigherPriorityTaskWoken) {
portYIELD_FROM_ISR( ); portYIELD_FROM_ISR( );
} }
} else { } else {
xQueueSend(lvgl_port_ctx.lvgl_queue, &ev, 0); xEventGroupSetBits(lvgl_port_ctx.lvgl_events, bits);
} }
return ESP_OK; return ESP_OK;
@ -206,7 +219,8 @@ IRAM_ATTR bool lvgl_port_task_notify(uint32_t value)
static void lvgl_port_task(void *arg) static void lvgl_port_task(void *arg)
{ {
lvgl_port_event_t event; TaskHandle_t task_to_notify = (TaskHandle_t)arg;
EventBits_t events = 0;
uint32_t task_delay_ms = 0; uint32_t task_delay_ms = 0;
lv_indev_t *indev = NULL; lv_indev_t *indev = NULL;
@ -219,6 +233,8 @@ static void lvgl_port_task(void *arg)
/* LVGL init */ /* LVGL init */
lv_init(); lv_init();
/* LVGL is initialized, notify lvgl_port_init() function about it */
xTaskNotifyGive(task_to_notify);
/* Tick init */ /* Tick init */
lvgl_port_tick_init(); lvgl_port_tick_init();
@ -227,21 +243,17 @@ static void lvgl_port_task(void *arg)
while (lvgl_port_ctx.running) { while (lvgl_port_ctx.running) {
/* Wait for queue or timeout (sleep task) */ /* Wait for queue or timeout (sleep task) */
TickType_t wait = (pdMS_TO_TICKS(task_delay_ms) >= 1 ? pdMS_TO_TICKS(task_delay_ms) : 1); TickType_t wait = (pdMS_TO_TICKS(task_delay_ms) >= 1 ? pdMS_TO_TICKS(task_delay_ms) : 1);
xQueueReceive(lvgl_port_ctx.lvgl_queue, &event, wait); events = xEventGroupWaitBits(lvgl_port_ctx.lvgl_events, 0xFF, pdTRUE, pdFALSE, wait);
if (lv_display_get_default() && lvgl_port_lock(0)) { if (lv_display_get_default() && lvgl_port_lock(0)) {
/* Call read input devices */ /* Call read input devices */
if (event.type == LVGL_PORT_EVENT_TOUCH) { if (events & LVGL_PORT_EVENT_TOUCH) {
xSemaphoreTake(lvgl_port_ctx.timer_mux, portMAX_DELAY); xSemaphoreTake(lvgl_port_ctx.timer_mux, portMAX_DELAY);
if (event.param != NULL) { indev = lv_indev_get_next(NULL);
lv_indev_read(event.param); while (indev != NULL) {
} else { lv_indev_read(indev);
indev = lv_indev_get_next(NULL); indev = lv_indev_get_next(indev);
while (indev != NULL) {
lv_indev_read(indev);
indev = lv_indev_get_next(indev);
}
} }
xSemaphoreGive(lvgl_port_ctx.timer_mux); xSemaphoreGive(lvgl_port_ctx.timer_mux);
} }
@ -279,8 +291,8 @@ static void lvgl_port_task_deinit(void)
if (lvgl_port_ctx.task_init_mux) { if (lvgl_port_ctx.task_init_mux) {
vSemaphoreDelete(lvgl_port_ctx.task_init_mux); vSemaphoreDelete(lvgl_port_ctx.task_init_mux);
} }
if (lvgl_port_ctx.lvgl_queue) { if (lvgl_port_ctx.lvgl_events) {
vQueueDelete(lvgl_port_ctx.lvgl_queue); vEventGroupDelete(lvgl_port_ctx.lvgl_events);
} }
memset(&lvgl_port_ctx, 0, sizeof(lvgl_port_ctx)); memset(&lvgl_port_ctx, 0, sizeof(lvgl_port_ctx));
#if LV_ENABLE_GC || !LV_MEM_CUSTOM #if LV_ENABLE_GC || !LV_MEM_CUSTOM

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -56,6 +56,7 @@ lv_indev_t *lvgl_port_add_navigation_buttons(const lvgl_port_nav_btns_cfg_t *but
return NULL; return NULL;
} }
#if BUTTON_VER_MAJOR < 4
/* Previous button */ /* Previous button */
if (buttons_cfg->button_prev != NULL) { if (buttons_cfg->button_prev != NULL) {
buttons_ctx->btn[LVGL_PORT_NAV_BTN_PREV] = iot_button_create(buttons_cfg->button_prev); buttons_ctx->btn[LVGL_PORT_NAV_BTN_PREV] = iot_button_create(buttons_cfg->button_prev);
@ -73,11 +74,23 @@ lv_indev_t *lvgl_port_add_navigation_buttons(const lvgl_port_nav_btns_cfg_t *but
buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER] = iot_button_create(buttons_cfg->button_enter); buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER] = iot_button_create(buttons_cfg->button_enter);
ESP_GOTO_ON_FALSE(buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER], ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!"); ESP_GOTO_ON_FALSE(buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER], ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!");
} }
#else
ESP_GOTO_ON_FALSE(buttons_cfg->button_prev && buttons_cfg->button_next && buttons_cfg->button_enter, ESP_ERR_INVALID_ARG, err, TAG, "Invalid some button handler!");
buttons_ctx->btn[LVGL_PORT_NAV_BTN_PREV] = buttons_cfg->button_prev;
buttons_ctx->btn[LVGL_PORT_NAV_BTN_NEXT] = buttons_cfg->button_next;
buttons_ctx->btn[LVGL_PORT_NAV_BTN_ENTER] = buttons_cfg->button_enter;
#endif
/* Button handlers */ /* Button handlers */
for (int i = 0; i < LVGL_PORT_NAV_BTN_CNT; i++) { for (int i = 0; i < LVGL_PORT_NAV_BTN_CNT; i++) {
#if BUTTON_VER_MAJOR < 4
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_DOWN, lvgl_port_btn_down_handler, buttons_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_DOWN, lvgl_port_btn_down_handler, buttons_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_UP, lvgl_port_btn_up_handler, buttons_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_UP, lvgl_port_btn_up_handler, buttons_ctx));
#else
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_DOWN, NULL, lvgl_port_btn_down_handler, buttons_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(buttons_ctx->btn[i], BUTTON_PRESS_UP, NULL, lvgl_port_btn_up_handler, buttons_ctx));
#endif
} }
buttons_ctx->btn_prev = false; buttons_ctx->btn_prev = false;

View File

@ -251,6 +251,7 @@ static lv_display_t *lvgl_port_add_disp_priv(const lvgl_port_display_cfg_t *disp
ESP_RETURN_ON_FALSE(disp_cfg->color_format == 0 || disp_cfg->color_format == LV_COLOR_FORMAT_RGB565 || disp_cfg->color_format == LV_COLOR_FORMAT_RGB888 || disp_cfg->color_format == LV_COLOR_FORMAT_XRGB8888 || disp_cfg->color_format == LV_COLOR_FORMAT_ARGB8888 || disp_cfg->color_format == LV_COLOR_FORMAT_I1, NULL, TAG, "Not supported display color format!"); ESP_RETURN_ON_FALSE(disp_cfg->color_format == 0 || disp_cfg->color_format == LV_COLOR_FORMAT_RGB565 || disp_cfg->color_format == LV_COLOR_FORMAT_RGB888 || disp_cfg->color_format == LV_COLOR_FORMAT_XRGB8888 || disp_cfg->color_format == LV_COLOR_FORMAT_ARGB8888 || disp_cfg->color_format == LV_COLOR_FORMAT_I1, NULL, TAG, "Not supported display color format!");
lv_color_format_t display_color_format = (disp_cfg->color_format != 0 ? disp_cfg->color_format : LV_COLOR_FORMAT_RGB565); lv_color_format_t display_color_format = (disp_cfg->color_format != 0 ? disp_cfg->color_format : LV_COLOR_FORMAT_RGB565);
uint8_t color_bytes = lv_color_format_get_size(display_color_format);
if (disp_cfg->flags.swap_bytes) { if (disp_cfg->flags.swap_bytes) {
/* Swap bytes can be used only in RGB565 color format */ /* Swap bytes can be used only in RGB565 color format */
ESP_RETURN_ON_FALSE(display_color_format == LV_COLOR_FORMAT_RGB565, NULL, TAG, "Swap bytes can be used only in display color format RGB565!"); ESP_RETURN_ON_FALSE(display_color_format == LV_COLOR_FORMAT_RGB565, NULL, TAG, "Swap bytes can be used only in display color format RGB565!");
@ -258,7 +259,7 @@ static lv_display_t *lvgl_port_add_disp_priv(const lvgl_port_display_cfg_t *disp
if (disp_cfg->flags.buff_dma) { if (disp_cfg->flags.buff_dma) {
/* DMA buffer can be used only in RGB565 color format */ /* DMA buffer can be used only in RGB565 color format */
ESP_RETURN_ON_FALSE(display_color_format == LV_COLOR_FORMAT_RGB565, NULL, TAG, "DMA buffer can be used only in display color format RGB565 (not alligned copy)!"); ESP_RETURN_ON_FALSE(display_color_format == LV_COLOR_FORMAT_RGB565, NULL, TAG, "DMA buffer can be used only in display color format RGB565 (not aligned copy)!");
} }
/* Display context */ /* Display context */
@ -307,10 +308,10 @@ static lv_display_t *lvgl_port_add_disp_priv(const lvgl_port_display_cfg_t *disp
} else { } else {
/* alloc draw buffers used by LVGL */ /* alloc draw buffers used by LVGL */
/* it's recommended to choose the size of the draw buffer(s) to be at least 1/10 screen sized */ /* it's recommended to choose the size of the draw buffer(s) to be at least 1/10 screen sized */
buf1 = heap_caps_malloc(buffer_size * sizeof(lv_color_t), buff_caps); buf1 = heap_caps_malloc(buffer_size * color_bytes, buff_caps);
ESP_GOTO_ON_FALSE(buf1, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for LVGL buffer (buf1) allocation!"); ESP_GOTO_ON_FALSE(buf1, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for LVGL buffer (buf1) allocation!");
if (disp_cfg->double_buffer) { if (disp_cfg->double_buffer) {
buf2 = heap_caps_malloc(buffer_size * sizeof(lv_color_t), buff_caps); buf2 = heap_caps_malloc(buffer_size * color_bytes, buff_caps);
ESP_GOTO_ON_FALSE(buf2, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for LVGL buffer (buf2) allocation!"); ESP_GOTO_ON_FALSE(buf2, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for LVGL buffer (buf2) allocation!");
} }
@ -336,7 +337,7 @@ static lv_display_t *lvgl_port_add_disp_priv(const lvgl_port_display_cfg_t *disp
ESP_GOTO_ON_FALSE((disp_cfg->hres * disp_cfg->vres == buffer_size), ESP_ERR_INVALID_ARG, err, TAG, "Monochromatic display must using full buffer!"); ESP_GOTO_ON_FALSE((disp_cfg->hres * disp_cfg->vres == buffer_size), ESP_ERR_INVALID_ARG, err, TAG, "Monochromatic display must using full buffer!");
disp_ctx->flags.monochrome = 1; disp_ctx->flags.monochrome = 1;
lv_display_set_buffers(disp, buf1, buf2, buffer_size * sizeof(lv_color_t), LV_DISPLAY_RENDER_MODE_FULL); lv_display_set_buffers(disp, buf1, buf2, buffer_size * color_bytes, LV_DISPLAY_RENDER_MODE_FULL);
if (display_color_format == LV_COLOR_FORMAT_I1) { if (display_color_format == LV_COLOR_FORMAT_I1) {
/* OLED monochrome buffer */ /* OLED monochrome buffer */
@ -350,15 +351,15 @@ static lv_display_t *lvgl_port_add_disp_priv(const lvgl_port_display_cfg_t *disp
ESP_GOTO_ON_FALSE((disp_cfg->hres * disp_cfg->vres == buffer_size), ESP_ERR_INVALID_ARG, err, TAG, "Direct mode must using full buffer!"); ESP_GOTO_ON_FALSE((disp_cfg->hres * disp_cfg->vres == buffer_size), ESP_ERR_INVALID_ARG, err, TAG, "Direct mode must using full buffer!");
disp_ctx->flags.direct_mode = 1; disp_ctx->flags.direct_mode = 1;
lv_display_set_buffers(disp, buf1, buf2, buffer_size * sizeof(lv_color_t), LV_DISPLAY_RENDER_MODE_DIRECT); lv_display_set_buffers(disp, buf1, buf2, buffer_size * color_bytes, LV_DISPLAY_RENDER_MODE_DIRECT);
} else if (disp_cfg->flags.full_refresh) { } else if (disp_cfg->flags.full_refresh) {
/* When using full_refresh, there must be used full bufer! */ /* When using full_refresh, there must be used full bufer! */
ESP_GOTO_ON_FALSE((disp_cfg->hres * disp_cfg->vres == buffer_size), ESP_ERR_INVALID_ARG, err, TAG, "Full refresh must using full buffer!"); ESP_GOTO_ON_FALSE((disp_cfg->hres * disp_cfg->vres == buffer_size), ESP_ERR_INVALID_ARG, err, TAG, "Full refresh must using full buffer!");
disp_ctx->flags.full_refresh = 1; disp_ctx->flags.full_refresh = 1;
lv_display_set_buffers(disp, buf1, buf2, buffer_size * sizeof(lv_color_t), LV_DISPLAY_RENDER_MODE_FULL); lv_display_set_buffers(disp, buf1, buf2, buffer_size * color_bytes, LV_DISPLAY_RENDER_MODE_FULL);
} else { } else {
lv_display_set_buffers(disp, buf1, buf2, buffer_size * sizeof(lv_color_t), LV_DISPLAY_RENDER_MODE_PARTIAL); lv_display_set_buffers(disp, buf1, buf2, buffer_size * color_bytes, LV_DISPLAY_RENDER_MODE_PARTIAL);
} }
lv_display_set_flush_cb(disp, lvgl_port_flush_callback); lv_display_set_flush_cb(disp, lvgl_port_flush_callback);
@ -371,7 +372,7 @@ static lv_display_t *lvgl_port_add_disp_priv(const lvgl_port_display_cfg_t *disp
/* Use SW rotation */ /* Use SW rotation */
if (disp_cfg->flags.sw_rotate) { if (disp_cfg->flags.sw_rotate) {
disp_ctx->draw_buffs[2] = heap_caps_malloc(buffer_size * sizeof(lv_color_t), buff_caps); disp_ctx->draw_buffs[2] = heap_caps_malloc(buffer_size * color_bytes, buff_caps);
ESP_GOTO_ON_FALSE(disp_ctx->draw_buffs[2], ESP_ERR_NO_MEM, err, TAG, "Not enough memory for LVGL buffer (rotation buffer) allocation!"); ESP_GOTO_ON_FALSE(disp_ctx->draw_buffs[2], ESP_ERR_NO_MEM, err, TAG, "Not enough memory for LVGL buffer (rotation buffer) allocation!");
} }
@ -567,7 +568,7 @@ static void lvgl_port_flush_callback(lv_display_t *drv, const lv_area_t *area, u
int offsety2 = area->y2; int offsety2 = area->y2;
/* SW rotation enabled */ /* SW rotation enabled */
if (disp_ctx->flags.sw_rotate && (disp_ctx->current_rotation > LV_DISPLAY_ROTATION_0 || disp_ctx->flags.swap_bytes)) { if (disp_ctx->flags.sw_rotate && (disp_ctx->current_rotation > LV_DISPLAY_ROTATION_0)) {
/* SW rotation */ /* SW rotation */
if (disp_ctx->draw_buffs[2]) { if (disp_ctx->draw_buffs[2]) {
int32_t ww = lv_area_get_width(area); int32_t ww = lv_area_get_width(area);
@ -589,7 +590,9 @@ static void lvgl_port_flush_callback(lv_display_t *drv, const lv_area_t *area, u
offsety1 = area->y1; offsety1 = area->y1;
offsety2 = area->y2; offsety2 = area->y2;
} }
} else if (disp_ctx->flags.swap_bytes) { }
if (disp_ctx->flags.swap_bytes) {
size_t len = lv_area_get_size(area); size_t len = lv_area_get_size(area);
lv_draw_sw_rgb565_swap(color_map, len); lv_draw_sw_rgb565_swap(color_map, len);
} }

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -20,6 +20,7 @@ typedef struct {
button_handle_t btn_handle; /* Encoder button handlers */ button_handle_t btn_handle; /* Encoder button handlers */
lv_indev_t *indev; /* LVGL input device driver */ lv_indev_t *indev; /* LVGL input device driver */
bool btn_enter; /* Encoder button enter state */ bool btn_enter; /* Encoder button enter state */
int32_t diff; /* Encoder diff */
} lvgl_port_encoder_ctx_t; } lvgl_port_encoder_ctx_t;
/******************************************************************************* /*******************************************************************************
@ -27,9 +28,11 @@ typedef struct {
*******************************************************************************/ *******************************************************************************/
static void lvgl_port_encoder_read(lv_indev_t *indev_drv, lv_indev_data_t *data); static void lvgl_port_encoder_read(lv_indev_t *indev_drv, lv_indev_data_t *data);
static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2); static void lvgl_port_encoder_btn_down_handler(void *button_handle, void *usr_data);
static void lvgl_port_encoder_btn_up_handler(void *arg, void *arg2); static void lvgl_port_encoder_btn_up_handler(void *button_handle, void *usr_data);
static void lvgl_port_encoder_knob_handler(void *arg, void *arg2); static void lvgl_port_encoder_left_handler(void *arg, void *arg2);
static void lvgl_port_encoder_right_handler(void *arg, void *arg2);
static int32_t lvgl_port_calculate_diff(knob_handle_t knob, knob_event_t event);
/******************************************************************************* /*******************************************************************************
* Public API functions * Public API functions
@ -54,20 +57,31 @@ lv_indev_t *lvgl_port_add_encoder(const lvgl_port_encoder_cfg_t *encoder_cfg)
encoder_ctx->knob_handle = iot_knob_create(encoder_cfg->encoder_a_b); encoder_ctx->knob_handle = iot_knob_create(encoder_cfg->encoder_a_b);
ESP_GOTO_ON_FALSE(encoder_ctx->knob_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for knob create!"); ESP_GOTO_ON_FALSE(encoder_ctx->knob_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for knob create!");
ESP_ERROR_CHECK(iot_knob_register_cb(encoder_ctx->knob_handle, KNOB_LEFT, lvgl_port_encoder_knob_handler, encoder_ctx)); ESP_ERROR_CHECK(iot_knob_register_cb(encoder_ctx->knob_handle, KNOB_LEFT, lvgl_port_encoder_left_handler, encoder_ctx));
ESP_ERROR_CHECK(iot_knob_register_cb(encoder_ctx->knob_handle, KNOB_RIGHT, lvgl_port_encoder_knob_handler, encoder_ctx)); ESP_ERROR_CHECK(iot_knob_register_cb(encoder_ctx->knob_handle, KNOB_RIGHT, lvgl_port_encoder_right_handler, encoder_ctx));
} }
/* Encoder Enter */ /* Encoder Enter */
if (encoder_cfg->encoder_enter != NULL) { if (encoder_cfg->encoder_enter != NULL) {
#if BUTTON_VER_MAJOR < 4
encoder_ctx->btn_handle = iot_button_create(encoder_cfg->encoder_enter); encoder_ctx->btn_handle = iot_button_create(encoder_cfg->encoder_enter);
ESP_GOTO_ON_FALSE(encoder_ctx->btn_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!"); ESP_GOTO_ON_FALSE(encoder_ctx->btn_handle, ESP_ERR_NO_MEM, err, TAG, "Not enough memory for button create!");
#else
ESP_GOTO_ON_FALSE(encoder_cfg->encoder_enter, ESP_ERR_INVALID_ARG, err, TAG, "Invalid button handler!");
encoder_ctx->btn_handle = encoder_cfg->encoder_enter;
#endif
} }
#if BUTTON_VER_MAJOR < 4
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_DOWN, lvgl_port_encoder_btn_down_handler, encoder_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_DOWN, lvgl_port_encoder_btn_down_handler, encoder_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_UP, lvgl_port_encoder_btn_up_handler, encoder_ctx)); ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_UP, lvgl_port_encoder_btn_up_handler, encoder_ctx));
#else
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_DOWN, NULL, lvgl_port_encoder_btn_down_handler, encoder_ctx));
ESP_ERROR_CHECK(iot_button_register_cb(encoder_ctx->btn_handle, BUTTON_PRESS_UP, NULL, lvgl_port_encoder_btn_up_handler, encoder_ctx));
#endif
encoder_ctx->btn_enter = false; encoder_ctx->btn_enter = false;
encoder_ctx->diff = 0;
lvgl_port_lock(0); lvgl_port_lock(0);
/* Register a encoder input device */ /* Register a encoder input device */
@ -130,27 +144,19 @@ esp_err_t lvgl_port_remove_encoder(lv_indev_t *encoder)
static void lvgl_port_encoder_read(lv_indev_t *indev_drv, lv_indev_data_t *data) static void lvgl_port_encoder_read(lv_indev_t *indev_drv, lv_indev_data_t *data)
{ {
static int32_t last_v = 0;
assert(indev_drv); assert(indev_drv);
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *)lv_indev_get_driver_data(indev_drv); lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *)lv_indev_get_driver_data(indev_drv);
assert(ctx); assert(ctx);
int32_t invd = iot_knob_get_count_value(ctx->knob_handle); data->enc_diff = ctx->diff;
knob_event_t event = iot_knob_get_event(ctx->knob_handle);
if (last_v ^ invd) {
last_v = invd;
data->enc_diff = (KNOB_LEFT == event) ? (-1) : ((KNOB_RIGHT == event) ? (1) : (0));
} else {
data->enc_diff = 0;
}
data->state = (true == ctx->btn_enter) ? LV_INDEV_STATE_PRESSED : LV_INDEV_STATE_RELEASED; data->state = (true == ctx->btn_enter) ? LV_INDEV_STATE_PRESSED : LV_INDEV_STATE_RELEASED;
ctx->diff = 0;
} }
static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2) static void lvgl_port_encoder_btn_down_handler(void *button_handle, void *usr_data)
{ {
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2; lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) usr_data;
button_handle_t button = (button_handle_t)arg; button_handle_t button = (button_handle_t)button_handle;
if (ctx && button) { if (ctx && button) {
/* ENTER */ /* ENTER */
if (button == ctx->btn_handle) { if (button == ctx->btn_handle) {
@ -162,10 +168,10 @@ static void lvgl_port_encoder_btn_down_handler(void *arg, void *arg2)
lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev); lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev);
} }
static void lvgl_port_encoder_btn_up_handler(void *arg, void *arg2) static void lvgl_port_encoder_btn_up_handler(void *button_handle, void *usr_data)
{ {
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2; lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) usr_data;
button_handle_t button = (button_handle_t)arg; button_handle_t button = (button_handle_t)button_handle;
if (ctx && button) { if (ctx && button) {
/* ENTER */ /* ENTER */
if (button == ctx->btn_handle) { if (button == ctx->btn_handle) {
@ -177,9 +183,51 @@ static void lvgl_port_encoder_btn_up_handler(void *arg, void *arg2)
lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev); lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev);
} }
static void lvgl_port_encoder_knob_handler(void *arg, void *arg2) static void lvgl_port_encoder_left_handler(void *arg, void *arg2)
{ {
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2; lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2;
/* Wake LVGL task, if needed */ knob_handle_t knob = (knob_handle_t)arg;
lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev); if (ctx && knob) {
/* LEFT */
if (knob == ctx->knob_handle) {
int32_t diff = lvgl_port_calculate_diff(knob, KNOB_LEFT);
ctx->diff = (ctx->diff > 0) ? diff : ctx->diff + diff;
}
/* Wake LVGL task, if needed */
lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev);
}
}
static void lvgl_port_encoder_right_handler(void *arg, void *arg2)
{
lvgl_port_encoder_ctx_t *ctx = (lvgl_port_encoder_ctx_t *) arg2;
knob_handle_t knob = (knob_handle_t)arg;
if (ctx && knob) {
/* RIGHT */
if (knob == ctx->knob_handle) {
int32_t diff = lvgl_port_calculate_diff(knob, KNOB_RIGHT);
ctx->diff = (ctx->diff < 0) ? diff : ctx->diff + diff;
}
/* Wake LVGL task, if needed */
lvgl_port_task_wake(LVGL_PORT_EVENT_TOUCH, ctx->indev);
}
}
static int32_t lvgl_port_calculate_diff(knob_handle_t knob, knob_event_t event)
{
static int32_t last_v = 0;
int32_t diff = 0;
int32_t invd = iot_knob_get_count_value(knob);
if (last_v ^ invd) {
diff = (int32_t)((uint32_t)invd - (uint32_t)last_v);
diff += (event == KNOB_RIGHT && invd < last_v) ? CONFIG_KNOB_HIGH_LIMIT :
(event == KNOB_LEFT && invd > last_v) ? CONFIG_KNOB_LOW_LIMIT : 0;
last_v = invd;
}
return diff;
} }

View File

@ -32,8 +32,7 @@
lv_color_blend_to_argb8888_esp: lv_color_blend_to_argb8888_esp:
entry a1, 32 entry a1, 32
ee.zero.q q0 // dummy TIE instruction, to enable the TIE
l32i.n a3, a2, 4 // a3 - dest_buff l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint32_t l32i.n a4, a2, 8 // a4 - dest_w in uint32_t

View File

@ -31,8 +31,7 @@
lv_color_blend_to_rgb565_esp: lv_color_blend_to_rgb565_esp:
entry a1, 32 entry a1, 32
ee.zero.q q0 // dummy TIE instruction, to enable the TIE
l32i.n a3, a2, 4 // a3 - dest_buff l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint16_t l32i.n a4, a2, 8 // a4 - dest_w in uint16_t

View File

@ -0,0 +1,105 @@
/*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
// This is LVGL RGB888 simple fill for ESP32 processor
.section .text
.align 4
.global lv_color_blend_to_rgb888_esp
.type lv_color_blend_to_rgb888_esp,@function
// The function implements the following C code:
// void lv_color_blend_to_rgb888(_lv_draw_sw_blend_fill_dsc_t * dsc);
// Input params
//
// dsc - a2
// typedef struct {
// uint32_t opa; l32i 0
// void * dst_buf; l32i 4
// uint32_t dst_w; l32i 8
// uint32_t dst_h; l32i 12
// uint32_t dst_stride; l32i 16
// const void * src_buf; l32i 20
// uint32_t src_stride; l32i 24
// const lv_opa_t * mask_buf; l32i 28
// uint32_t mask_stride; l32i 32
// } asm_dsc_t;
lv_color_blend_to_rgb888_esp:
entry a1, 32
l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint24_t
l32i.n a5, a2, 12 // a5 - dest_h in uint16_t
l32i.n a6, a2, 16 // a6 - dest_stride in bytes
l32i.n a7, a2, 20 // a7 - src_buff (color)
l32i.n a8, a7, 0 // a8 - color as value
// a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4
slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4
// Prepare register combinations
// a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR
l8ui a13, a7, 0 // blue 000B
slli a13, a13, 24 // shift to B000
or a13, a13, a8 // a13 BRGB
srli a14, a8, 8 // a14 00RG
slli a10, a8, 16 // a10 GB00
or a14, a14, a10 // a14 GBRG
slli a15, a8, 8 // a15 RGB0
l8ui a10, a7, 2 // a7 000R
or a15, a15, a10 // a15 RGBR
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
// Prepare main loop length and dest_w_bytes
srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w
movi.n a8, 0x3 // a8 = 0x3, remainder mask
and a10, a4, a8 // a10 - remainder after division by 4 = a4 and 0x3
.outer_loop:
// Run main loop which sets 12 bytes (4 rgb888) in one loop run
loopnez a9, ._main_loop
s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0
s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4
s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8
addi.n a3, a3, 12 // increment dest_buff pointer by 12
._main_loop:
bnei a10, 0x3, _less_than_3 // branch if less than 3 values left
s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
s32i.n a14, a3, 4 // save 32 bits from a14 to dest_buff a3, offset 4 bytes
s8i a15, a3, 8 // save 8 bits from a15 to dest_buff a3, offset 8 bytes
addi.n a3, a3, 9 // increment dest_buff pointer by 9 bytes
j _less_than_1
_less_than_3:
bnei a10, 0x2, _less_than_2 // branch if less than 2 values left
s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
s16i a14, a3, 4 // save 16 bits from a14 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 6 // increment dest_buff pointer by 6 bytes
j _less_than_1
_less_than_2:
bnei a10, 0x1, _less_than_1 // branch if less than 1 value left
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes
s8i a15, a3, 2 // save 8 bits from a15 to dest_buff a3, offset 2 bytes
addi.n a3, a3, 3 // increment dest_buff pointer by 3 bytes
_less_than_1:
add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrease the outer loop
and a7, a8, a3 // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned)
bnez a5, .outer_loop
movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

View File

@ -0,0 +1,346 @@
/*
* SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
// This is LVGL RGB888 simple fill for ESP32S3 processor
.section .text
.align 4
.global lv_color_blend_to_rgb888_esp
.type lv_color_blend_to_rgb888_esp,@function
// The function implements the following C code:
// void lv_color_blend_to_rgb888(_lv_draw_sw_blend_fill_dsc_t * dsc);
// Input params
//
// dsc - a2
// typedef struct {
// uint32_t opa; l32i 0
// void * dst_buf; l32i 4
// uint32_t dst_w; l32i 8
// uint32_t dst_h; l32i 12
// uint32_t dst_stride; l32i 16
// const void * src_buf; l32i 20
// uint32_t src_stride; l32i 24
// const lv_opa_t * mask_buf; l32i 28
// uint32_t mask_stride; l32i 32
// } asm_dsc_t;
lv_color_blend_to_rgb888_esp:
entry a1, 32
l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint24_t
l32i.n a5, a2, 12 // a5 - dest_h in uint16_t
l32i.n a6, a2, 16 // a6 - dest_stride in bytes
l32i.n a7, a2, 20 // a7 - src_buff (color)
l32i.n a8, a7, 0 // a8 - color as value
// a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4
slli a11, a4, 1 // a11 - dest_w_bytes = 2 * dest_w
add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4
// Prepare register combinations
// a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR
l8ui a13, a7, 0 // blue 000B
slli a13, a13, 24 // shift to B000
or a13, a13, a8 // a13 BRGB
srli a14, a8, 8 // a14 00RG
slli a10, a8, 16 // a10 GB00
or a14, a14, a10 // a14 GBRG
slli a15, a8, 8 // a15 RGB0
l8ui a10, a7, 2 // a7 000R
or a15, a15, a10 // a15 RGBR
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
// Check for short lengths
// dest_w should be at least 12, othewise it's not worth using esp32s3 TIE
bgei a4, 12, _esp32s3_implementation // Branch if dest_w is greater than or equal to 12
j .lv_color_blend_to_rgb888_esp32_body // Jump to esp32 implementation
_esp32s3_implementation:
// Prepare q registers for the main loop
ee.movi.32.q q3, a13, 0 // fill q3 register from a13 by 32 bits
ee.movi.32.q q3, a14, 1 // fill q3 register from a14 by 32 bits
ee.movi.32.q q3, a15, 2 // fill q3 register from a15 by 32 bits
ee.movi.32.q q3, a13, 3 // fill q3 register from a13 by 32 bits
ee.movi.32.q q4, a14, 0 // fill q4 register from a14 by 32 bits
ee.movi.32.q q4, a15, 1 // fill q4 register from a15 by 32 bits
ee.movi.32.q q4, a13, 2 // fill q4 register from a13 by 32 bits
ee.movi.32.q q4, a14, 3 // fill q4 register from a14 by 32 bits
ee.movi.32.q q5, a15, 0 // fill q5 register from a15 by 32 bits
ee.movi.32.q q5, a13, 1 // fill q5 register from a13 by 32 bits
ee.movi.32.q q5, a14, 2 // fill q5 register from a14 by 32 bits
ee.movi.32.q q5, a15, 3 // fill q5 register from a15 by 32 bits
.outer_loop_aligned:
// q registers will get shifted and clobbered, need to reinitialize them before using them again
// Clear q registers
ee.zero.q q0 // clear q0
ee.zero.q q1 // clear q1
ee.zero.q q2 // clear q2
// Reinitialize q registers
ee.orq q0, q0, q3 // copy q3 to q0
ee.orq q1, q1, q4 // copy q4 to q1
ee.orq q2, q2, q5 // copy q5 to q2
// alignment check
extui a8, a3, 0, 4 // address_alignment (a8) = dest_buff address (a3) AND 0xf
movi.n a12, 16 // a12 = 16
mov.n a2, a8 // unalignment (a2) = a8
// following instruction is here to avoid branching
// need to adjust a8 == 0 to 16 to make the unalignment computation work
moveqz a2, a12, a8 // modified unalignment (a2) = 16 if unalignment (a8) == 0
sub a2, a12, a2 // a2 = 16 - unalignment (lower 4 bits of dest_buff address)
sub a10, a11, a2 // local_dest_w_bytes = len - (16 - unalignment)
movi.n a12, 48 // a12 = 48 (main loop copies 48 bytes)
quou a9, a10, a12 // main_loop counter (a9) = local_dest_w_bytes (a10) DIV 48 (a12)
remu a10, a10, a12 // a10 = local_dest_w_bytes (a10) MOD 48 (a12)
beqz a8, _dest_buff_aligned // If already aligned, skip aligning
movi.n a7, unalignment_table // Load unalignment_table address
addx4 a7, a8, a7 // jump_table handle (a7) = offset (a8) * 4 + jump_table address (a7)
l32i a7, a7, 0 // Load target address from jump table
jx a7 // Jump to the corresponding handler
// a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR
handle_0:
handle_1:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_2:
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
s32i a15, a3, 0 // save 32 bits from a15 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_3:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
s32i a14, a3, 0 // save 32 bits from a14 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
ee.vst.l.64.ip q2, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_4:
s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_5:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_6:
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 byte
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
ee.vst.l.64.ip q2, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_7:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_8:
ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
j _shift_q_regs
handle_9:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
j _shift_q_regs
handle_10:
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
s32i a15, a3, 0 // save 32 bits from a15 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
j _shift_q_regs
handle_11:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
s32i a14, a3, 0 // save 32 bits from a14 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
j _shift_q_regs
handle_12:
s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
j _shift_q_regs
handle_13:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
j _shift_q_regs
handle_14:
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
j _shift_q_regs
handle_15:
s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
j _shift_q_regs
.align 4
unalignment_table:
.word handle_0 // Case 0: Dummy case for easier address computation
.word handle_1 // Case 1: Align 15 bytes
.word handle_2 // Case 2: Align 14 bytes
.word handle_3 // Case 3: Align 13 bytes
.word handle_4 // Case 4: Align 12 bytes
.word handle_5 // Case 5: Align 11 bytes
.word handle_6 // Case 6: Align 10 bytes
.word handle_7 // Case 7: Align 9 bytes
.word handle_8 // Case 8: Align 8 bytes
.word handle_9 // Case 9: Align 7 bytes
.word handle_10 // Case 10: Align 6 bytes
.word handle_11 // Case 11: Align 5 bytes
.word handle_12 // Case 12: Align 4 bytes
.word handle_13 // Case 13: Align 3 bytes
.word handle_14 // Case 14: Align 2 bytes
.word handle_15 // Case 15: Align 1 byte
_shift_q_regs:
wur.sar_byte a2 // apply unalignment to the SAR_BYTE
ee.src.q q0, q0, q1 // shift concat. of q0 and q1 to q0 by SAR_BYTE amount
ee.src.q q1, q1, q2 // shift concat. of q1 and q2 to q1 by SAR_BYTE amount
ee.src.q q2, q2, q3 // shift concat. of q2 and q3 to q2 by SAR_BYTE amount
_dest_buff_aligned:
loopnez a9, ._main_loop_aligned // 48 bytes (16 rgb888) in one loop
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
ee.vst.128.ip q1, a3, 16 // store 16 bytes from q1 to dest_buff a3
ee.vst.128.ip q2, a3, 16 // store 16 bytes from q2 to dest_buff a3
._main_loop_aligned:
// Check modulo 32 of the unalignment, if - then set 32 bytes
bbci a10, 5, .lt_32 // branch if 5-th bit of local_dest_w_bytes a10 is clear
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
ee.vst.128.ip q1, a3, 16 // store 16 bytes from q1 to dest_buff a3
ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB
.lt_32:
// Check modulo 16 of the unalignment, if - then set 16 bytes
bbci a10, 4, .lt_16 // branch if 4-th bit of local_dest_w_bytes a10 is clear
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
ee.srci.2q q0, q1, 0 // shift q0 register to have next bytes to store ready from LSB
.lt_16:
// Check modulo 8 of the unalignment, if - then set 8 bytes
bbci a10, 3, .lt_8
ee.vst.l.64.ip q0, a3, 8 // store 8 bytes from q0 to dest_buff a3
ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB
.lt_8:
// Check modulo 4 of the unalignment, if - then set 4 bytes
bbci a10, 2, .lt_4
ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2
s32i.n a2, a3, 0 // save 32 bits from a2 to dest_buff a3, offset 0
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
ee.srci.2q q0, q1, 0 // shift q0 register to have next bytes to store ready from LSB
.lt_4:
// Check modulo 2 of the unalignment, if - then set 2 bytes
bbci a10, 1, .lt_2
ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2
s16i a2, a3, 0 // save 16 bits from a2 to dest_buff a3, offset 0
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB
.lt_2:
// Check modulo 1 of the unalignment, if - then set 1 byte
bbci a10, 0, .lt_1
ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2
s8i a2, a3, 0 // save 8 bits from a2 to dest_buff a3, offset 0
addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte
.lt_1:
add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrease the outer loop
bnez a5, .outer_loop_aligned
movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return
.lv_color_blend_to_rgb888_esp32_body:
// Prepare main loop length and dest_w_bytes
srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w
movi.n a8, 0x3 // a8 = 0x3, remainder mask
and a10, a4, a8 // a10 - remainder after division by 4 = a4 & 0x3
.outer_loop:
// Run main loop which sets 12 bytes (4 rgb888) in one loop run
loopnez a9, ._main_loop
s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0
s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4
s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8
addi.n a3, a3, 12 // increment dest_buff pointer by 12
._main_loop:
bnei a10, 0x3, _less_than_3 // branch if less than 3 values left
s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
s32i.n a14, a3, 4 // save 32 bits from a14 to dest_buff a3, offset 4 bytes
s8i a15, a3, 8 // save 8 bits from a15 to dest_buff a3, offset 8 bytes
addi.n a3, a3, 9 // increment dest_buff pointer by 9 bytes
j _less_than_1
_less_than_3:
bnei a10, 0x2, _less_than_2 // branch if less than 2 values left
s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes
s16i a14, a3, 4 // save 16 bits from a14 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 6 // increment dest_buff pointer by 6 bytes
j _less_than_1
_less_than_2:
bnei a10, 0x1, _less_than_1 // branch if less than 1 value left
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes
s8i a15, a3, 2 // save 8 bits from a15 to dest_buff a3, offset 2 bytes
addi.n a3, a3, 3 // increment dest_buff pointer by 3 bytes
_less_than_1:
add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrease the outer loop
and a7, a8, a3 // a7 = dest_buff AND 0x3 (chck if the address is 4-byte aligned)
bnez a5, .outer_loop
movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

View File

@ -0,0 +1,60 @@
/*
* SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
// Memcpy macros for modulo checking
// After the main loop has run, the bytes remaining outside the main loop still need to be copied
// The macros work with both aligned and unaligned (4-byte boundary) memory,
// but performance is significantly lower with unaligned memory because of the unaligned-memory-access exception
// Tail-copy helper: moves 8 leftover bytes when bit 3 of the byte count is set.
// Positional arguments (identical order to all call sites):
//   src   - source pointer register, advanced by 8 when the copy happens
//   dst   - destination pointer register, advanced by 8 when the copy happens
//   len   - byte-count register whose bit 3 selects the copy
//   tmp_a - scratch register for the first 32-bit word
//   tmp_b - scratch register for the second 32-bit word
//   tag   - unique suffix (callers pass __LINE__) keeping the skip label distinct per expansion
.macro macro_memcpy_mod_8 src, dst, len, tmp_a, tmp_b, tag
bbci \len, 3, ._memcpy8_skip_\tag // Bit 3 of \len clear -> no 8-byte tail, skip
l32i.n \tmp_a, \src, 0 // Fetch source bytes 0..3
l32i.n \tmp_b, \src, 4 // Fetch source bytes 4..7
s32i.n \tmp_a, \dst, 0 // Store bytes 0..3 to the destination
s32i.n \tmp_b, \dst, 4 // Store bytes 4..7 to the destination
addi.n \src, \src, 8 // Step the source pointer past the copied bytes
addi.n \dst, \dst, 8 // Step the destination pointer past the copied bytes
._memcpy8_skip_\tag:
.endm // macro_memcpy_mod_8
// Tail-copy helper: moves 4 leftover bytes when bit 2 of the byte count is set.
// Positional arguments (identical order to all call sites):
//   src - source pointer register, advanced by 4 when the copy happens
//   dst - destination pointer register, advanced by 4 when the copy happens
//   len - byte-count register whose bit 2 selects the copy
//   tmp - scratch register carrying the copied word
//   tag - unique suffix (callers pass __LINE__) keeping the skip label distinct per expansion
.macro macro_memcpy_mod_4 src, dst, len, tmp, tag
bbci \len, 2, ._memcpy4_skip_\tag // Bit 2 of \len clear -> no 4-byte tail, skip
l32i.n \tmp, \src, 0 // Fetch one 32-bit word from the source
addi.n \src, \src, 4 // Step the source pointer past the word
s32i.n \tmp, \dst, 0 // Store the word to the destination
addi.n \dst, \dst, 4 // Step the destination pointer past the word
._memcpy4_skip_\tag:
.endm // macro_memcpy_mod_4
// Tail-copy helper: moves 2 leftover bytes when bit 1 of the byte count is set.
// Positional arguments (identical order to all call sites):
//   src - source pointer register, advanced by 2 when the copy happens
//   dst - destination pointer register, advanced by 2 when the copy happens
//   len - byte-count register whose bit 1 selects the copy
//   tmp - scratch register carrying the copied halfword
//   tag - unique suffix (callers pass __LINE__) keeping the skip label distinct per expansion
.macro macro_memcpy_mod_2 src, dst, len, tmp, tag
bbci \len, 1, ._memcpy2_skip_\tag // Bit 1 of \len clear -> no 2-byte tail, skip
l16ui \tmp, \src, 0 // Fetch one 16-bit halfword from the source
addi.n \src, \src, 2 // Step the source pointer past the halfword
s16i \tmp, \dst, 0 // Store the halfword to the destination
addi.n \dst, \dst, 2 // Step the destination pointer past the halfword
._memcpy2_skip_\tag:
.endm // macro_memcpy_mod_2
// Tail-copy helper: moves the final leftover byte when bit 0 of the byte count is set.
// Positional arguments (identical order to all call sites):
//   src - source pointer register, advanced by 1 when the copy happens
//   dst - destination pointer register, advanced by 1 when the copy happens
//   len - byte-count register whose bit 0 selects the copy
//   tmp - scratch register carrying the copied byte
//   tag - unique suffix (callers pass __LINE__) keeping the skip label distinct per expansion
.macro macro_memcpy_mod_1 src, dst, len, tmp, tag
bbci \len, 0, ._memcpy1_skip_\tag // Bit 0 of \len clear -> no 1-byte tail, skip
l8ui \tmp, \src, 0 // Fetch one byte from the source
addi.n \src, \src, 1 // Step the source pointer past the byte
s8i \tmp, \dst, 0 // Store the byte to the destination
addi.n \dst, \dst, 1 // Step the destination pointer past the byte
._memcpy1_skip_\tag:
.endm // macro_memcpy_mod_1

View File

@ -0,0 +1,264 @@
/*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "lv_macro_memcpy.S" // Memcpy macros
// This is LVGL RGB565 image blend to RGB565 for ESP32 processor
.section .text
.align 4
.global lv_rgb565_blend_normal_to_rgb565_esp
.type lv_rgb565_blend_normal_to_rgb565_esp,@function
// The function implements the following C code:
// void rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t * dsc);
// Input params
//
// dsc - a2
// typedef struct {
// uint32_t opa; l32i 0
// void * dst_buf; l32i 4
// uint32_t dst_w; l32i 8
// uint32_t dst_h; l32i 12
// uint32_t dst_stride; l32i 16
// const void * src_buf; l32i 20
// uint32_t src_stride; l32i 24
// const lv_opa_t * mask_buf; l32i 28
// uint32_t mask_stride; l32i 32
// } asm_dsc_t;
lv_rgb565_blend_normal_to_rgb565_esp:
// Row-by-row copy of an RGB565 image into an RGB565 destination (plain copy, no blending math needed).
// Dispatches to one of three implementations: fully 4-byte-aligned fast path, general unaligned path,
// or a simple byte-wise path for widths below 8 pixels.
entry a1, 32
l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint16_t
l32i.n a5, a2, 12 // a5 - dest_h in uint16_t
l32i.n a6, a2, 16 // a6 - dest_stride in bytes
l32i.n a7, a2, 20 // a7 - src_buff
l32i.n a8, a2, 24 // a8 - src_stride in bytes
slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
// No need to convert any colors here, we are copying from rgb565 to rgb565
// Check dest_w length
bltui a4, 8, _matrix_width_check // Branch if dest_w (a4) is lower than 8
// Check memory alignment and input parameters lengths and decide which implementation to use
// OR-ing all pointers/strides/lengths together and masking with 0x3 detects whether ANY of them
// breaks the 4-byte alignment requirement in a single test
movi.n a10, 0x3 // a10 = 0x3 alignment mask (4-byte alignment)
or a15, a7, a3 // a15 = src_buff (a7) OR dest_buff (a3)
or a15, a15, a6 // a15 = a15 OR dest_stride (a6)
or a15, a15, a8 // a15 = a15 OR src_stride (a8)
or a15, a15, a11 // a15 = a15 OR dest_w_bytes (a11)
and a15, a15, a10 // a15 = a15 AND alignment mask (a10)
bnez a15, _alignment_check // Branch if a15 not equals to zero (something is unaligned)
//**********************************************************************************************************************
// The most ideal case - both arrays aligned, both strides and dest_w are multiples of 4
// dest_buff (a3) - 4-byte aligned
// src_buff (a7) - 4-byte aligned
// dest_stride (a6) - 4-byte multiple
// src_stride (a8) - 4-byte multiple
// dest_w (a4) - 4-byte multiple
srli a9, a4, 3 // a9 - loop_len = dest_w / 8 (main loop moves 8 pixels per iteration)
// Convert strides to matrix paddings (bytes to skip after each finished row)
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
.outer_loop_align:
// Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
loopnez a9, ._main_loop_aligned
l32i.n a15, a7, 0 // Load 32 bits from src_buff a7 to a15, offset 0
l32i.n a14, a7, 4 // Load 32 bits from src_buff a7 to a14, offset 4
l32i.n a13, a7, 8 // Load 32 bits from src_buff a7 to a13, offset 8
l32i.n a12, a7, 12 // Load 32 bits from src_buff a7 to a12, offset 12
s32i.n a15, a3, 0 // Save 32 bits from a15 to dest_buff a3, offset 0
s32i.n a14, a3, 4 // Save 32 bits from a14 to dest_buff a3, offset 4
s32i.n a13, a3, 8 // Save 32 bits from a13 to dest_buff a3, offset 8
s32i.n a12, a3, 12 // Save 32 bits from a12 to dest_buff a3, offset 12
addi.n a7, a7, 16 // Increment src_buff pointer a7 by 16
addi.n a3, a3, 16 // Increment dest_buff pointer a3 by 16
._main_loop_aligned:
// Finish the remaining bytes out of the main loop
// Check modulo 8 of the dest_w_bytes (a11), if - then copy 8 bytes (4 RGB565 pixels)
// src_buff a7, dest_buff a3, dest_w_bytes a11, copy registers a14 a15
macro_memcpy_mod_8 a7, a3, a11, a14, a15 __LINE__
// Check modulo 4 of the dest_w_bytes (a11), if - then copy 4 bytes (2 RGB565 pixels)
// src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
macro_memcpy_mod_4 a7, a3, a11, a15 __LINE__
// Check modulo 2 of the dest_w_bytes (a11), if - then copy 2 bytes (1 RGB565 pixel)
// src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
macro_memcpy_mod_2 a7, a3, a11, a15 __LINE__
// Check modulo 1 of the dest_w_bytes (a11), if - then copy 1 byte
// NOTE(review): dest_w_bytes = 2 * dest_w is always even, so this check can never fire — confirm it is kept only for symmetry
// src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
macro_memcpy_mod_1 a7, a3, a11, a15 __LINE__
add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_align
movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return
//**********************************************************************************************************************
// The most general case - at least one array is not aligned, or one parameter is not multiple of 4
_alignment_check:
// dest_buff (a3) - 4-byte aligned, or not
// src_buff (a7) - 4-byte aligned, or not
// dest_stride (a6) - 4-byte multiple, or not
// src_stride (a8) - 4-byte multiple, or not
// dest_w (a4) - 4-byte multiple, or not
// Convert strides to matrix paddings (bytes to skip after each finished row)
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
.outer_loop_unalign:
extui a13, a3, 0, 2 // Get last two bits of the dest_buff address a3, to a13
movi.n a15, 4 // Move 4 to a15, for calculation of the destination alignment loop
sub a14, a15, a13 // Calculate destination alignment loop length (a14 = 4 - a13)
// In case of the dest_buff a3 being already aligned (for example by matrix padding), correct a14 value,
// to prevent the destination aligning loop to run 4 times (to prevent aligning already aligned memory)
moveqz a14, a13, a13 // If a13 == 0 (dest_buff already aligned), set a14 = a13 = 0, skipping the aligning loop
// a10 (the 0x3 mask) is no longer needed and is reused below as dest_w_bytes_local
sub a10, a11, a14 // Get the dest_w_bytes after the aligning loop (a10 = dest_w_bytes_local)
srli a9, a10, 4 // Calculate main loop len (a9 = dest_w_bytes_local / 16)
// Run dest_buff aligning loop byte by byte
loopnez a14, ._dest_aligning_loop
l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0
addi.n a7, a7, 1 // Increment src_buff pointer a7 by 1
s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0
addi.n a3, a3, 1 // Increment dest_buff pointer a3 by 1
._dest_aligning_loop:
// Destination is aligned, source is unaligned
// For more information about this implementation, see chapter 3.3.2 Shifts and the Shift Amount Register (SAR)
// in Xtensa Instruction Set Architecture (ISA) Reference Manual
ssa8l a7 // Set SAR_BYTE from src_buff a7 unalignment
// dest_w (a4) is not needed again on this path, so the register is reused for the source unalignment
extui a4, a7, 0, 2 // Get last 2 bits of the src_buff, a4 = src_buff_unalignment
sub a7, a7, a4 // "align" the src_buff a7, to 4-byte boundary by decreasing it's pointer to the nearest aligned boundary
// First preload for the loopnez cycle
l32i.n a15, a7, 0 // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0
// Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
loopnez a9, ._main_loop_unalign
l32i.n a14, a7, 4 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
l32i.n a13, a7, 8 // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
src a15, a14, a15 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15
s32i.n a15, a3, 0 // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
l32i.n a12, a7, 12 // Load 32 bits from 4-byte aligned src_buff a7 to a12, offset 12
src a14, a13, a14 // Concatenate a13 and a14 and shift by SAR_BYTE amount to a14
s32i.n a14, a3, 4 // Save 32 bits from shift-corrected a14 to dest_buff a3, offset 4
l32i.n a15, a7, 16 // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 16
src a13, a12, a13 // Concatenate a12 and a13 and shift by SAR_BYTE amount to a13
s32i.n a13, a3, 8 // Save 32 bits from shift-corrected a13 to dest_buff a3, offset 8
addi.n a7, a7, 16 // Increment src_buff pointer a7 by 16
src a12, a15, a12 // Concatenate a15 and a12 and shift by SAR_BYTE amount to a12
s32i.n a12, a3, 12 // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12
addi.n a3, a3, 16 // Increment dest_buff pointer a3 by 16
._main_loop_unalign:
// Finish the remaining bytes out of the loop
// Check modulo 8 of the dest_w_bytes_local (a10), if - then copy 8 bytes
bbci a10, 3, _mod_8_check // Branch if 3-rd bit of dest_w_bytes_local is clear
l32i.n a14, a7, 4 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
l32i.n a13, a7, 8 // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
src a15, a14, a15 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15 (value in a15 is already prepared from previous steps)
s32i.n a15, a3, 0 // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
addi.n a7, a7, 8 // Increment src_buff pointer a7 by 8
src a14, a13, a14 // Concatenate a13 and a14 and shift by SAR_BYTE amount to a14
s32i.n a14, a3, 4 // Save 32 bits from shift-corrected a14 to dest_buff a3, offset 4
addi.n a3, a3, 8 // Increment dest_buff pointer a3 by 8
mov a15, a13 // Prepare a15 for the next steps (copy a13 to a15)
_mod_8_check:
// Check modulo 4 of the dest_w_bytes_local (a10), if - then copy 4 bytes
bbci a10, 2, _mod_4_check // Branch if 2-nd bit of dest_w_bytes_local is clear
l32i.n a14, a7, 4 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
addi.n a7, a7, 4 // Increment src_buff pointer a7 by 4
src a15, a14, a15 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15 (value in a15 is already prepared from previous steps)
s32i.n a15, a3, 0 // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
addi.n a3, a3, 4 // Increment dest_buff pointer a3 by 4
mov a15, a14 // Prepare a15 for the next steps (copy a14 to a15)
_mod_4_check:
// Handle the last 1-3 bytes with a read-modify-write of the destination word,
// so bytes past the row end are preserved
extui a13, a10, 0, 2 // Get the last 2 bits of the dest_w_bytes_local (a10), a13 = a10[1:0], to find out how many bytes need to be copied and to increase src and dest pointers accordingly
beqz a13, _mod_1_2_check // Branch if a13 equal to zero, E.G. if there are no bytes to be copied
l32i.n a14, a7, 4 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
l32i.n a12, a3, 0 // Get dest_buff value: Load 32 bits from 4-byte aligned dest_buff a3 to a12, offset 0
src a15, a14, a15 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15 (value in a15 is already prepared from previous steps)
ssa8l a10 // Set SAR_BYTE from dest_w_bytes_local a10 length
sll a15, a15 // Shift the dest word a15 by SAR_BYTE amount
srl a12, a12 // Shift the src word a12 by SAR_BYTE amount
ssa8b a10 // Set SAR_BYTE from dest_w_bytes_local a10 length
src a12, a12, a15 // Concatenate a12 and a15 and shift by SAR_BYTE amount to a12
s32i.n a12, a3, 0 // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 0
add a7, a7, a13 // Increment src_buff pointer a7, by amount of copied bytes (a13)
add a3, a3, a13 // Increment dest_buff pointer a3, by amount of copied bytes (a13)
_mod_1_2_check:
add a7, a7, a4 // Correct the src_buff back by src_buff_unalignment (a4), after we have force-aligned it to 4-byte boundary before the main loop
add a3, a3, a6 // dest_buff + dest_stride
add a7, a7, a8 // src_buff + src_stride
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_unalign
movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return
//**********************************************************************************************************************
// Small matrix width, keep it simple for lengths less than 8 pixels
_matrix_width_check: // Matrix width is less than 8 pixels
// Convert strides to matrix paddings (bytes to skip after each finished row)
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
.outer_loop_short_matrix_length:
// Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
loopnez a4, ._main_loop_short_matrix_length
l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0
l8ui a14, a7, 1 // Load 8 bits from src_buff a7 to a14, offset 1
s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0
s8i a14, a3, 1 // Save 8 bits from a14 to dest_buff a3, offset 1
addi.n a7, a7, 2 // Increment src_buff pointer a7 by 2
addi.n a3, a3, 2 // Increment dest_buff pointer a3 by 2
._main_loop_short_matrix_length:
// Finish remaining byte out of the main loop
// Check modulo 1 of the dest_w_bytes (a11), if - then copy 1 byte
// NOTE(review): dest_w_bytes = 2 * dest_w is always even, so this check can never fire — confirm it is kept only for symmetry
// src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_short_matrix_length
movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return

View File

@ -0,0 +1,372 @@
/*
* SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "lv_macro_memcpy.S" // Memcpy macros
// This is LVGL RGB565 image blend to RGB565 for ESP32S3 processor
.section .text
.align 4
.global lv_rgb565_blend_normal_to_rgb565_esp
.type lv_rgb565_blend_normal_to_rgb565_esp,@function
// The function implements the following C code:
// void rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t * dsc);
// Input params
//
// dsc - a2
// typedef struct {
// uint32_t opa; l32i 0
// void * dst_buf; l32i 4
// uint32_t dst_w; l32i 8
// uint32_t dst_h; l32i 12
// uint32_t dst_stride; l32i 16
// const void * src_buf; l32i 20
// uint32_t src_stride; l32i 24
// const lv_opa_t * mask_buf; l32i 28
// uint32_t mask_stride; l32i 32
// } asm_dsc_t;
lv_rgb565_blend_normal_to_rgb565_esp:
entry a1, 32
l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint16_t
l32i.n a5, a2, 12 // a5 - dest_h in uint16_t
l32i.n a6, a2, 16 // a6 - dest_stride in bytes
l32i.n a7, a2, 20 // a7 - src_buff
l32i.n a8, a2, 24 // a8 - src_stride in bytes
movi.n a10, 0xf // 0xf alignment mask (16-byte alignment)
slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
// No need to convert any colors here, we are copying from rgb565 to rgb565
// Check dest_w length
bltui a4, 8, _matrix_width_check // Branch if dest_w (a4) is lower than 8
// Check dest_buff alignment fist
and a15, a10, a3 // 16-byte alignment mask AND dest_buff pointer a3
bnez a15, _src_unalign_dest_unalign // Branch if a15 not equals to zero
// Jump straight to the last implementation, since this is the only one which deals with unaligned destination arrays
// Check src_buff alignment
and a15, a10, a7 // 16-byte alignment mask AND src_buff pointer a7
bnez a15, _src_align_dest_unalign // Branch if a15 not equals to zero
// Jump to check, if the second or third implementation can be used (depends on both strides and dest_w)
// Check dest_stride alignment
and a15, a10, a6 // 16-byte alignment mask AND dest_stride a6
bnez a15, _src_unalign_dest_unalign // Branch if a15 not equals to zero
// Jump straight to the last implementation, since this is the only one which deals with destination stride not aligned
// Check src_stride alignment
and a15, a10, a8 // 16-byte alignment mask AND src_stride a8
bnez a15, _src_align_dest_unalign // Branch if a15 not equals to zero
// Jump to check, if the second or third implementation can be used (depends on dest_w_bytes)
// Check dest_w_bytes alignment
and a15, a10, a11 // 16-byte alignment mask AND dest_w_bytes
bnez a15, _src_unalign_dest_unalign // Branch if a15 not equals to zero
// Jump straight to the last implementation, since this is the only one which deals with dest_w_bytes not aligned
//**********************************************************************************************************************
// The most ideal case - both arrays aligned, both strides and dest_w are multiples of 16
// dest_buff (a3) - 16-byte aligned
// src_buff (a7) - 16-byte aligned
// dest_stride (a6) - 16-byte multiple
// src_stride (a8) - 16-byte multiple
// dest_w (a4) - 16-byte multiple
srli a9, a4, 4 // a9 - loop_len = dest_w / 16
// Convert strides to matrix paddings
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
.outer_loop_align:
// Run main loop which copies 32 bytes (16 RGB565 pixels) in one loop run
loopnez a9, ._main_loop_align // 32 bytes (16 RGB565 pixels) in one loop run
ee.vld.128.ip q0, a7, 16 // Load 16 bytes from src_buff a7 to q0, increase src_buf pointer a7 by 16
ee.vld.128.ip q1, a7, 16 // Load 16 bytes from src_buff a7 to q1, increase src_buf pointer a7 by 16
ee.vst.128.ip q0, a3, 16 // Store 16 bytes from q0 to dest_buff a3, increase dest_buff pointer a3 by 16
ee.vst.128.ip q1, a3, 16 // Store 16 bytes from q1 to dest_buff a3, increase dest_buff pointer a3 by 16
._main_loop_align:
// Finish remaining bytes out of the main loop
// Check modulo 16 of the dest_w, if - then copy 16 bytes (8 RGB565 pixels)
bbci a11, 4, _align_mod_16_check // Branch if 4-th bit of dest_w_bytes a11 is clear
ee.vld.128.ip q0, a7, 16 // Load 16 bytes from src_buff a7 to q0, increase src_buf pointer a7 by 16
ee.vst.128.ip q0, a3, 16 // Store 16 bytes from q0 to dest_buff a3, increase dest_buff pointer a3 by 16
_align_mod_16_check:
add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_align
movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return
_src_align_dest_unalign:
// Check dest_stride alignment
and a15, a10, a6 // 16-byte alignment mask AND dest_stride a6
bnez a15, _src_unalign_dest_unalign // Branch if a15 not equals to zero
// Check dest_w_bytes alignment
and a15, a10, a11 // 16-byte alignment mask AND dest_w_bytes a11
bnez a15, _src_unalign_dest_unalign // Branch if a15 not equals to zero
// We don't check src_stride alignment for this implementation, as it can be either align, or unalign
//**********************************************************************************************************************
// Less ideal case - Only destination array is aligned, src array is unaligned
// Source stride is either aligned or unaligned, destination stride must be aligned, dest_w_bytes must be aligned
// dest_buff (a3) - 16-byte aligned
// src_buff (a7) - unaligned
// dest_stride (a6) - 16-byte multiple
// src_stride (a8) - does not matter if 16-byte multiple
// dest_w (a4) - 16-byte multiple
// Convert strides to matrix paddings
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
// Calculate modulo for non-aligned data
movi a15, 48 // a15 = 48 (main loop copies 48 bytes)
quou a9, a11, a15 // a9 = dest_w_bytes (a11) DIV 48 (15)
remu a12, a11, a15 // a12 = dest_w_bytes (a11) remainder after DIV 48 (15)
.outer_loop_src_unalign_dest_align:
ee.ld.128.usar.ip q2, a7, 16 // Preload 16 bytes from src_buff a7 to q2, get value of the SAR_BYTE, increase src_buf pointer a7 by 16
ee.ld.128.usar.ip q3, a7, 16 // Preload 16 bytes from src_buff a7 to q3, get value of the SAR_BYTE, increase src_buf pointer a7 by 16
// Run main loop which copies 48 bytes (24 RGB565 pixels) in one loop run
loopnez a9, ._main_loop_src_unalign_dest_align // 48 bytes (24 RGB565 pixels) in one loop
ee.src.q.ld.ip q4, a7, 16, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q.ld.ip q2, a7, 16, q3, q4 // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q.ld.ip q3, a7, 16, q4, q2 // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q4, a3, 16 // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
._main_loop_src_unalign_dest_align:
// Finish the main loop outside of the loop from Q registers preloads
// Check modulo 32 of the loop_len_remainder, if - then copy 32 bytes (16 RGB565 pixels)
bbci a12, 5, _unalign_mod_32_check // Branch if 5-th bit of loop_len_remainder a12 is clear
ee.src.q.ld.ip q4, a7, 0, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buf pointer a7
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q q3, q3, q4 // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
j _end_of_row_src_unalign_dest_align
_unalign_mod_32_check:
// Check modulo 16 of the loop_len_remainder, if - then copy 16 bytes (8 RGB565 pixels)
bbci a12, 4, _unalign_mod_16_check // Branch if 4-th bit of loop_len_remainder a12 is clear
ee.src.q q2, q2, q3 // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
addi a7, a7, -16 // Correct the src_buff pointer a7, caused by q reg preload
j _end_of_row_src_unalign_dest_align
_unalign_mod_16_check:
// Nothing to copy outside of the main loop
addi a7, a7, -32 // Correct the src_buff pointer a7, caused by q reg preload
_end_of_row_src_unalign_dest_align:
add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_src_unalign_dest_align
movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return
_src_unalign_dest_unalign:
//**********************************************************************************************************************
// The most general case, can handle all the possible combinations
// dest_buff (a3) - unaligned
// src_buff (a7) - unaligned
// dest_stride (a6) - not 16-byte multiple
// src_stride (a8) - not 16-byte multiple
// dest_w (a4) - not 16-byte multiple
// Convert strides to matrix paddings
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
.outer_loop_all_unalign:
// dest_buff alignment check
and a13, a10, a3 // Alignment mask 0xf (a10) AND dest_buff pointer
beqz a13, _dest_buff_aligned // Branch if a13 = 0 (if dest_buff is aligned)
movi.n a14, 16 // a14 = 16
sub a13, a14, a13 // a13 = 16 - unalignment
// Check modulo 8 of the unalignment a13, if - then copy 8 bytes (4 RGB565 pixels)
// src_buff a7, dest_buff a3, unalignment a13, copy registers a14, a15
macro_memcpy_mod_8 a7, a3, a13, a15, a14, __LINE__
// Check modulo 4 of the unalignment, if - then copy 4 bytes (2 RGB565 pixels)
// src_buff a7, dest_buff a3, unalignment a13, copy register a15
macro_memcpy_mod_4 a7, a3, a13, a15, __LINE__
// Check modulo 2 of the unalignment, if - then copy 2 bytes (1 RGB565 pixel)
// src_buff a7, dest_buff a3, unalignment a13, copy register a15
macro_memcpy_mod_2 a7, a3, a13, a15, __LINE__
// Check modulo 1 of the unalignment, if - then copy 1 byte (1/2 of RGB565 pixel)
// src_buff a7, dest_buff a3, unalignment a13, copy register a15
macro_memcpy_mod_1 a7, a3, a13, a15, __LINE__
_dest_buff_aligned:
// Calculate modulo for non-aligned data
sub a11, a11, a13 // a11 = local_dest_w_bytes (a11) = dest_w_bytes (a11) - (16 - unalignment)
movi a15, 48 // a15 = 48
quou a9, a11, a15 // a9 = local_dest_w_bytes (a11) DIV 48 (a15)
remu a12, a11, a15 // a12 = local_dest_w_bytes (a11) remainder after div 48 (a15)
ee.ld.128.usar.ip q2, a7, 16 // Preload 16 bytes from src_buff a7 to q2, get value of the SAR_BYTE, increase src_buf pointer a7 by 16
ee.ld.128.usar.ip q3, a7, 16 // Preload 16 bytes from src_buff a7 to q3, get value of the SAR_BYTE, increase src_buf pointer a7 by 16
// Run main loop which copies 48 bytes (24 RGB565 pixels) in one loop run
loopnez a9, ._main_loop_all_unalign // 48 bytes (24 RGB565 pixels) in one loop
ee.src.q.ld.ip q4, a7, 16, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q.ld.ip q2, a7, 16, q3, q4 // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
// NOTE(review): tail of the all-unaligned RGB565 copy path; the loop entry and the
// Q-register preloads (q2/q3 hold look-ahead source data) are above this excerpt.
// SAR_BYTE was set earlier from the source misalignment; ee.src.q funnel-shifts
// two Q registers by that amount to realign the stream before each store.
ee.src.q.ld.ip q3, a7, 16, q4, q2 // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q4, a3, 16 // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
._main_loop_all_unalign:

// Finish the main loop outside of the loop from Q registers preloads
// Check modulo 32 and modulo 8 of the loop_len_remainder a12
bbci a12, 5, _all_unalign_mod_32_check // Branch if 5-th bit of loop_len_remainder a12 is clear
bbsi a12, 3, _all_unalign_mod_32_mod_8_check // Branch if 3-rd bit of loop_len_remainder a12 is set

// Copy 32 bytes (16 RGB565 pixels) (47 - 40)
ee.src.q.ld.ip q4, a7, 0, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buf pointer a7
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q q3, q3, q4 // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
j _skip_mod16

_all_unalign_mod_32_mod_8_check:
// Copy 40 bytes (20 RGB565 pixels)
ee.src.q.ld.ip q4, a7, 16, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buf pointer a7 by 16
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q.ld.ip q2, a7, 0, q3, q4 // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, don't increase src_buf pointer a7
ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q q4, q4, q2 // Concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount
ee.vst.l.64.ip q4, a3, 8 // Store lower 8 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
addi a7, a7, -8 // Correct the src_buff pointer a7, caused by q reg preload
j _skip_mod16

_all_unalign_mod_32_check:
// Check modulo 16 and modulo 8 of the loop_len_remainder a12
bbci a12, 4, _all_unalign_mod_16_check // branch if 4-th bit of loop_len_remainder a12 is clear
bbsi a12, 3, _all_unalign_mod_16_mod_8_check // branch if 3-rd bit of loop_len_remainder a12 is set

// Copy 16 bytes (8 RGB565 pixels)
ee.src.q q2, q2, q3 // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
addi a7, a7, -16 // Correct the src_buff pointer a7, caused by q reg preload
j _skip_mod16

_all_unalign_mod_16_mod_8_check:
// Copy 24 bytes (12 RGB565 pixels)
ee.src.q.ld.ip q4, a7, 0, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buf pointer a7
ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
ee.src.q q3, q3, q4 // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
ee.vst.l.64.ip q3, a3, 8 // Store lower 8 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
addi a7, a7, -8 // Correct the src_buff pointer a7, caused by q reg preload
j _skip_mod16

_all_unalign_mod_16_check:
bbci a12, 3, _all_unalign_mod_8_check // Branch if 3-rd bit of loop_len_remainder a12 is clear
// Copy 8 bytes (4 RGB565 pixels)
ee.src.q q2, q2, q3 // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
ee.vst.l.64.ip q2, a3, 8 // Store lower 8 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
addi a7, a7, -24 // Correct the src_buff pointer a7, caused by q reg preload
j _skip_mod16

_all_unalign_mod_8_check:
addi a7, a7, -32 // Correct the src_buff pointer a7, caused by q reg preload

_skip_mod16:
// Check modulo 4 of the loop_len_remainder, if - then copy 4 bytes (2 RGB565 pixels)
// src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
macro_memcpy_mod_4 a7, a3, a12, a15, __LINE__
// Check modulo 2 of the loop_len_remainder, if - then copy 2 bytes (1 RGB565 pixel)
// src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
macro_memcpy_mod_2 a7, a3, a12, a15, __LINE__
// Check modulo 1 of the loop_len_remainder, if - then copy 1 byte (1/2 RGB565 pixel)
// src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
// FIX: last macro argument was `__LINE_` (single trailing underscore). The C
// preprocessor leaves that token unexpanded, so the macro's internal labels built
// from it are not unique per invocation, unlike the other macro_memcpy_mod_* call
// sites (which correctly pass __LINE__).
macro_memcpy_mod_1 a7, a3, a12, a15, __LINE__
slli a11, a4, 1 // Refresh dest_w_bytes
add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_all_unalign

movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return
//**********************************************************************************************************************

// Small matrix width, keep it simple for lengths less than 8 pixels
// NOTE(review): the label comment below says "greater or equal 8 pixels", which
// contradicts the banner above; the byte-wise loop suggests this is the short-width
// path — confirm against the branch that targets _matrix_width_check.
_matrix_width_check: // Matrix width is greater or equal 8 pixels

// Convert strides to matrix paddings
sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)

.outer_loop_short_matrix_length:

// Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
loopnez a4, ._main_loop_short_matrix_length
l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0
l8ui a14, a7, 1 // Load 8 bits from src_buff a7 to a14, offset 1
s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0
s8i a14, a3, 1 // Save 8 bits from a14 to dest_buff a3, offset 1
addi.n a7, a7, 2 // Increment src_buff pointer a7 by 2
addi.n a3, a3, 2 // Increment dest_buff pointer a3 by 2
._main_loop_short_matrix_length:

// Finish remaining byte out of the main loop
// Check modulo 1 of the dest_w_bytes (a11), if - then copy 1 byte (1/2 RGB565 pixel)
// src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__

add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
addi.n a5, a5, -1 // Decrease the outer loop
bnez a5, .outer_loop_short_matrix_length

movi.n a2, 1 // Return LV_RESULT_OK = 1
retw.n // Return

View File

@ -4,7 +4,7 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-
Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component. Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h) Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component. Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h)
## Benchmark results ## Benchmark results for LV Fill functions (memset)
| Color format | Matrix size | Memory alignment | ASM version | ANSI C version | | Color format | Matrix size | Memory alignment | ASM version | ANSI C version |
| :----------- | :---------- | :--------------- | :------------- | :------------- | | :----------- | :---------- | :--------------- | :------------- | :------------- |
@ -12,9 +12,20 @@ Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/)
| | 127x127 | 1 byte | 0.488 | 1.597 | | | 127x127 | 1 byte | 0.488 | 1.597 |
| RGB565 | 128x128 | 16 byte | 0.196 | 1.146 | | RGB565 | 128x128 | 16 byte | 0.196 | 1.146 |
| | 127x127 | 1 byte | 0.497 | 1.124 | | | 127x127 | 1 byte | 0.497 | 1.124 |
| RGB888 | 128x128 | 16 byte | 0.608 | 4.062 |
| | 127x127 | 1 byte | 0.818 | 3.969 |
* this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case) * this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case)
* the values represent cycles per sample to perform simple fill of the matrix on esp32s3 * the values represent cycles per sample to perform simple fill of the matrix on esp32s3
## Benchmark results for LV Image functions (memcpy)
| Color format | Matrix size | Memory alignment | ASM version | ANSI C version |
| :----------- | :---------- | :--------------- | :------------- | :------------- |
| RGB565 | 128x128 | 16 byte | 0.352 | 3.437 |
| | 127x128 | 1 byte | 0.866 | 5.978 |
* this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x128 1 byte aligned matrix (worst case)
* the values represent cycles per sample to perform memory copy between two matrices on esp32s3
## Functionality test ## Functionality test
* Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version * Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
* A top-level flow of the functionality test: * A top-level flow of the functionality test:
@ -62,6 +73,8 @@ Here's the test menu, pick your combo:
(2) "Test fill functionality RGB565" [fill][functionality][RGB565] (2) "Test fill functionality RGB565" [fill][functionality][RGB565]
(3) "LV Fill benchmark ARGB8888" [fill][benchmark][ARGB8888] (3) "LV Fill benchmark ARGB8888" [fill][benchmark][ARGB8888]
(4) "LV Fill benchmark RGB565" [fill][benchmark][RGB565] (4) "LV Fill benchmark RGB565" [fill][benchmark][RGB565]
(5) "LV Image functionality RGB565 blend to RGB565" [image][functionality][RGB565]
(6) "LV Image benchmark RGB565 blend to RGB565" [image][benchmark][RGB565]
Enter test for running. Enter test for running.
``` ```

View File

@ -8,6 +8,9 @@ if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
else() else()
file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S) # Select only esp32 related files file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S) # Select only esp32 related files
endif() endif()
file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S) # Explicitly add all assembler macro files
else() else()
message(WARNING "This test app is intended only for esp32 and esp32s3") message(WARNING "This test app is intended only for esp32 and esp32s3")
endif() endif()
@ -15,7 +18,14 @@ endif()
# Hard copy of LV files # Hard copy of LV files
file(GLOB_RECURSE BLEND_SRCS lv_blend/src/*.c) file(GLOB_RECURSE BLEND_SRCS lv_blend/src/*.c)
idf_component_register(SRCS "test_app_main.c" "test_lv_fill_functionality.c" "test_lv_fill_benchmark.c" ${BLEND_SRCS} ${ASM_SOURCES} idf_component_register(SRCS "test_app_main.c"
"test_lv_fill_functionality.c" # memset tests
"test_lv_fill_benchmark.c"
"test_lv_image_functionality.c" # memcpy tests
"test_lv_image_benchmark.c"
${BLEND_SRCS} # Hard copy of LVGL's blend API, to simplify testing
${ASM_SOURCES} # Assembly src files
${ASM_MACROS} # Assembly macro files
INCLUDE_DIRS "lv_blend/include" "../../../include" INCLUDE_DIRS "lv_blend/include" "../../../include"
REQUIRES unity REQUIRES unity
WHOLE_ARCHIVE) WHOLE_ARCHIVE)

View File

@ -57,6 +57,7 @@ typedef struct {
lv_color_format_t src_color_format; lv_color_format_t src_color_format;
lv_opa_t opa; lv_opa_t opa;
lv_blend_mode_t blend_mode; lv_blend_mode_t blend_mode;
bool use_asm;
} _lv_draw_sw_blend_image_dsc_t; } _lv_draw_sw_blend_image_dsc_t;
/********************** /**********************

View File

@ -0,0 +1,53 @@
/*
* SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*
* This file is derived from the LVGL project.
* See https://github.com/lvgl/lvgl for details.
*/
/**
* @file lv_draw_sw_blend_rgb888.h
*
*/
#ifndef LV_DRAW_SW_BLEND_RGB888_H
#define LV_DRAW_SW_BLEND_RGB888_H
#ifdef __cplusplus
extern "C" {
#endif
/*********************
* INCLUDES
*********************/
#include "lv_draw_sw_blend.h"
/*********************
* DEFINES
*********************/
/**********************
* TYPEDEFS
**********************/
/**********************
* GLOBAL PROTOTYPES
**********************/
void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_color_to_rgb888(_lv_draw_sw_blend_fill_dsc_t *dsc,
uint32_t dest_px_size);
void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_image_to_rgb888(_lv_draw_sw_blend_image_dsc_t *dsc,
uint32_t dest_px_size);
/**********************
* MACROS
**********************/
#ifdef __cplusplus
} /*extern "C"*/
#endif
#endif /*LV_DRAW_SW_BLEND_RGB888_H*/

View File

@ -0,0 +1,79 @@
/*
* SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*
* This file is derived from the LVGL project.
* See https://github.com/lvgl/lvgl for details.
*/
/**
* @file lv_string.h
*
*/
#ifndef LV_STRING_H
#define LV_STRING_H
#ifdef __cplusplus
extern "C" {
#endif
/*********************
* INCLUDES
*********************/
//#include "../lv_conf_internal.h"
#include <stdint.h>
#include <stddef.h>
#include "lv_types.h"
/*********************
* DEFINES
*********************/
/**********************
* TYPEDEFS
**********************/
/**********************
* GLOBAL PROTOTYPES
**********************/
/**
* @brief Copies a block of memory from a source address to a destination address.
* @param dst Pointer to the destination array where the content is to be copied.
* @param src Pointer to the source of data to be copied.
* @param len Number of bytes to copy.
* @return Pointer to the destination array.
* @note The function does not check for any overlapping of the source and destination memory blocks.
*/
void *lv_memcpy(void *dst, const void *src, size_t len);
/**
* @brief Fills a block of memory with a specified value.
* @param dst Pointer to the destination array to fill with the specified value.
* @param v Value to be set. The value is passed as an int, but the function fills
* the block of memory using the unsigned char conversion of this value.
* @param len Number of bytes to be set to the value.
*/
void lv_memset(void *dst, uint8_t v, size_t len);
/**
* @brief Move a block of memory from source to destination
* @param dst Pointer to the destination array where the content is to be copied.
* @param src Pointer to the source of data to be copied.
* @param len Number of bytes to copy
* @return Pointer to the destination array.
*/
void *lv_memmove(void *dst, const void *src, size_t len);
/**********************
* MACROS
**********************/
#ifdef __cplusplus
} /*extern "C"*/
#endif
#endif /*LV_STRING_H*/

View File

@ -19,6 +19,8 @@
extern "C" { extern "C" {
#endif #endif
#include <stdint.h>
/********************** /**********************
* TYPEDEFS * TYPEDEFS
**********************/ **********************/

View File

@ -23,7 +23,7 @@
#include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend.h"
#include "lv_math.h" #include "lv_math.h"
#include "lv_color.h" #include "lv_color.h"
#include "string.h" #include "lv_string.h"
#include "esp_lvgl_port_lv_blend.h" #include "esp_lvgl_port_lv_blend.h"
@ -628,7 +628,7 @@ static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc
if (src_px_size == 4) { if (src_px_size == 4) {
uint32_t line_in_bytes = w * 4; uint32_t line_in_bytes = w * 4;
for (y = 0; y < h; y++) { for (y = 0; y < h; y++) {
memcpy(dest_buf_c32, src_buf, line_in_bytes); // lv_memcpy lv_memcpy(dest_buf_c32, src_buf, line_in_bytes);
dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride); dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
src_buf = drawbuf_next_row(src_buf, src_stride); src_buf = drawbuf_next_row(src_buf, src_stride);
} }
@ -870,9 +870,9 @@ static inline lv_color32_t LV_ATTRIBUTE_FAST_MEM lv_color_32_32_mix(lv_color32_t
void lv_color_mix_with_alpha_cache_init(lv_color_mix_alpha_cache_t *cache) void lv_color_mix_with_alpha_cache_init(lv_color_mix_alpha_cache_t *cache)
{ {
memset(&cache->fg_saved, 0x00, sizeof(lv_color32_t)); //lv_memzero lv_memset(&cache->fg_saved, 0x00, sizeof(lv_color32_t)); //lv_memzero
memset(&cache->bg_saved, 0x00, sizeof(lv_color32_t)); //lv_memzero lv_memset(&cache->bg_saved, 0x00, sizeof(lv_color32_t)); //lv_memzero
memset(&cache->res_saved, 0x00, sizeof(lv_color32_t)); //lv_memzero lv_memset(&cache->res_saved, 0x00, sizeof(lv_color32_t)); //lv_memzero
cache->res_alpha_saved = 255; cache->res_alpha_saved = 255;
cache->ratio_saved = 255; cache->ratio_saved = 255;
} }

View File

@ -23,7 +23,7 @@
#include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend.h"
#include "lv_math.h" #include "lv_math.h"
#include "lv_color.h" #include "lv_color.h"
#include "string.h" #include "lv_string.h"
#include "esp_lvgl_port_lv_blend.h" #include "esp_lvgl_port_lv_blend.h"
@ -601,10 +601,12 @@ static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc
if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
if (mask_buf == NULL && opa >= LV_OPA_MAX) { if (mask_buf == NULL && opa >= LV_OPA_MAX) {
if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc)) { if (dsc->use_asm) {
LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc);
} else {
uint32_t line_in_bytes = w * 2; uint32_t line_in_bytes = w * 2;
for (y = 0; y < h; y++) { for (y = 0; y < h; y++) {
memcpy(dest_buf_u16, src_buf_u16, line_in_bytes); // lv_memcpy lv_memcpy(dest_buf_u16, src_buf_u16, line_in_bytes);
dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride); dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride); src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
} }

View File

@ -0,0 +1,952 @@
/*
* SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*
* This file is derived from the LVGL project.
* See https://github.com/lvgl/lvgl for details.
*/
/**
* @file lv_draw_sw_blend_to_rgb888.c
*
*/
/*********************
* INCLUDES
*********************/
#include "lv_draw_sw_blend_to_rgb888.h"
#include "lv_assert.h"
#include "lv_types.h"
#include "lv_log.h"
#include "lv_draw_sw_blend.h"
#include "lv_math.h"
#include "lv_color.h"
#include "lv_string.h"
#include "esp_lvgl_port_lv_blend.h"
/*********************
* DEFINES
*********************/
#define LV_ATTRIBUTE_FAST_MEM
/**********************
* TYPEDEFS
**********************/
/**********************
* STATIC PROTOTYPES
**********************/
static void /* LV_ATTRIBUTE_FAST_MEM */ al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size);
static void /* LV_ATTRIBUTE_FAST_MEM */ i1_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size);
static inline uint8_t /* LV_ATTRIBUTE_FAST_MEM */ get_bit(const uint8_t *buf, int32_t bit_idx);
static void /* LV_ATTRIBUTE_FAST_MEM */ l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size);
static void /* LV_ATTRIBUTE_FAST_MEM */ rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size);
static void /* LV_ATTRIBUTE_FAST_MEM */ rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc,
const uint8_t dest_px_size,
uint32_t src_px_size);
static void /* LV_ATTRIBUTE_FAST_MEM */ argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc,
uint32_t dest_px_size);
static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_8_24_mix(const uint8_t src, uint8_t *dest, uint8_t mix);
static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_24_24_mix(const uint8_t *src, uint8_t *dest, uint8_t mix);
static inline void /* LV_ATTRIBUTE_FAST_MEM */ blend_non_normal_pixel(uint8_t *dest, lv_color32_t src,
lv_blend_mode_t mode);
static inline void * /* LV_ATTRIBUTE_FAST_MEM */ drawbuf_next_row(const void *buf, uint32_t stride);
/**********************
* STATIC VARIABLES
**********************/
/**********************
* MACROS
**********************/
#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888
#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA
#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK
#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA
#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888
#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA
#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK
#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA
#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888
#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA
#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK
#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA
#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888
#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA
#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK
#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA
#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888
#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA
#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK
#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA
#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888
#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA
#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK
#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK(...) LV_RESULT_INVALID
#endif
#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA
#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA(...) LV_RESULT_INVALID
#endif
/**********************
* GLOBAL FUNCTIONS
**********************/
/**
 * Fill a rectangular area of an RGB888 (3 bytes/px) or XRGB8888 (4 bytes/px)
 * destination buffer with a solid color, optionally modulated by opacity and/or
 * a per-pixel mask.
 *
 * @param dsc          fill descriptor: destination buffer/stride/size, color,
 *                     opacity, optional mask buffer + stride, and `use_asm`
 *                     (test-app switch to force the assembly path)
 * @param dest_px_size destination pixel size in bytes (3 or 4)
 *
 * Dispatch: simple fill / opacity only / mask only / mask + opacity. Each of the
 * last three tries the LV_DRAW_SW_* macro first (assembly hook; falls back to
 * LV_RESULT_INVALID when not provided) and runs the C loop only on failure.
 */
void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_color_to_rgb888(_lv_draw_sw_blend_fill_dsc_t *dsc, uint32_t dest_px_size)
{
int32_t w = dsc->dest_w;
int32_t h = dsc->dest_h;
lv_opa_t opa = dsc->opa;
const lv_opa_t *mask = dsc->mask_buf;
int32_t mask_stride = dsc->mask_stride;
int32_t dest_stride = dsc->dest_stride;

int32_t x;
int32_t y;

/* Silence unused warnings: some combinations are only used in the C fallbacks. */
LV_UNUSED(w);
LV_UNUSED(h);
LV_UNUSED(x);
LV_UNUSED(y);
LV_UNUSED(opa);
LV_UNUSED(mask);
LV_UNUSED(mask_stride);
LV_UNUSED(dest_stride);

/*Simple fill*/
if (mask == NULL && opa >= LV_OPA_MAX) {
// NOTE(review): unlike the other branches, the asm hook is gated by the explicit
// use_asm test flag here, and its return value is ignored — if the macro expands
// to the LV_RESULT_INVALID fallback, nothing is drawn. Confirm use_asm is only
// set when the assembly implementation is compiled in.
if (dsc->use_asm && dest_px_size == 3) {
LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc, dest_px_size);
} else {
if (dest_px_size == 3) {
uint8_t *dest_buf_u8 = dsc->dest_buf;
uint8_t *dest_buf_ori = dsc->dest_buf;
w *= dest_px_size;  // from here on, w is the row length in bytes

// Write the first row byte-by-byte (B, G, R order), then replicate it
// into the remaining rows with lv_memcpy.
for (x = 0; x < w; x += 3) {
dest_buf_u8[x + 0] = dsc->color.blue;
dest_buf_u8[x + 1] = dsc->color.green;
dest_buf_u8[x + 2] = dsc->color.red;
}

dest_buf_u8 += dest_stride;

for (y = 1; y < h; y++) {
lv_memcpy(dest_buf_u8, dest_buf_ori, w);
dest_buf_u8 += dest_stride;
}
}
if (dest_px_size == 4) {
uint32_t color32 = lv_color_to_u32(dsc->color);
uint32_t *dest_buf_u32 = dsc->dest_buf;
for (y = 0; y < h; y++) {
// 16x manually unrolled store loop; the scalar loop below handles the tail.
for (x = 0; x <= w - 16; x += 16) {
dest_buf_u32[x + 0] = color32;
dest_buf_u32[x + 1] = color32;
dest_buf_u32[x + 2] = color32;
dest_buf_u32[x + 3] = color32;

dest_buf_u32[x + 4] = color32;
dest_buf_u32[x + 5] = color32;
dest_buf_u32[x + 6] = color32;
dest_buf_u32[x + 7] = color32;

dest_buf_u32[x + 8] = color32;
dest_buf_u32[x + 9] = color32;
dest_buf_u32[x + 10] = color32;
dest_buf_u32[x + 11] = color32;

dest_buf_u32[x + 12] = color32;
dest_buf_u32[x + 13] = color32;
dest_buf_u32[x + 14] = color32;
dest_buf_u32[x + 15] = color32;
}
for (; x < w; x ++) {
dest_buf_u32[x] = color32;
}

dest_buf_u32 = drawbuf_next_row(dest_buf_u32, dest_stride);
}
}
}
}
/*Opacity only*/
else if (mask == NULL && opa < LV_OPA_MAX) {
if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(dsc, dest_px_size)) {
uint32_t color32 = lv_color_to_u32(dsc->color);
uint8_t *dest_buf = dsc->dest_buf;
w *= dest_px_size;  // iterate in bytes, stepping one pixel at a time

for (y = 0; y < h; y++) {
for (x = 0; x < w; x += dest_px_size) {
lv_color_24_24_mix((const uint8_t *)&color32, &dest_buf[x], opa);
}

dest_buf = drawbuf_next_row(dest_buf, dest_stride);
}
}
}
/*Masked with full opacity*/
else if (mask && opa >= LV_OPA_MAX) {
if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(dsc, dest_px_size)) {
uint32_t color32 = lv_color_to_u32(dsc->color);
uint8_t *dest_buf = dsc->dest_buf;
w *= dest_px_size;

for (y = 0; y < h; y++) {
uint32_t mask_x;
// x walks destination bytes, mask_x walks mask pixels (one entry per pixel)
for (x = 0, mask_x = 0; x < w; x += dest_px_size, mask_x++) {
lv_color_24_24_mix((const uint8_t *)&color32, &dest_buf[x], mask[mask_x]);
}

dest_buf += dest_stride;
mask += mask_stride;
}
}
}
/*Masked with opacity*/
else {
if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) {
uint32_t color32 = lv_color_to_u32(dsc->color);
uint8_t *dest_buf = dsc->dest_buf;
w *= dest_px_size;

for (y = 0; y < h; y++) {
uint32_t mask_x;
for (x = 0, mask_x = 0; x < w; x += dest_px_size, mask_x++) {
lv_color_24_24_mix((const uint8_t *) &color32, &dest_buf[x], LV_OPA_MIX2(opa, mask[mask_x]));
}

dest_buf += dest_stride;
mask += mask_stride;
}
}
}
}
/**
 * Blend a source image onto an RGB888/XRGB8888 destination, dispatching to the
 * blender that matches the source color format.
 *
 * @param dsc          image blend descriptor (source buffer, format, opacity,
 *                     mask, blend mode, destination buffer/stride)
 * @param dest_px_size destination pixel size in bytes (3 or 4)
 *
 * Unsupported source formats are reported with a warning and skipped.
 */
void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_image_to_rgb888(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size)
{
    const lv_color_format_t src_format = dsc->src_color_format;

    if (src_format == LV_COLOR_FORMAT_RGB565) {
        rgb565_image_blend(dsc, dest_px_size);
    } else if (src_format == LV_COLOR_FORMAT_RGB888) {
        /* RGB888 source: 3 bytes per source pixel */
        rgb888_image_blend(dsc, dest_px_size, 3);
    } else if (src_format == LV_COLOR_FORMAT_XRGB8888) {
        /* XRGB8888 source: same channel layout as RGB888 but padded to 4 bytes */
        rgb888_image_blend(dsc, dest_px_size, 4);
    } else if (src_format == LV_COLOR_FORMAT_ARGB8888) {
        argb8888_image_blend(dsc, dest_px_size);
    } else if (src_format == LV_COLOR_FORMAT_L8) {
        l8_image_blend(dsc, dest_px_size);
    } else if (src_format == LV_COLOR_FORMAT_AL88) {
        al88_image_blend(dsc, dest_px_size);
    } else if (src_format == LV_COLOR_FORMAT_I1) {
        i1_image_blend(dsc, dest_px_size);
    } else {
        LV_LOG_WARN("Not supported source color format");
    }
}
/**********************
* STATIC FUNCTIONS
**********************/
/**
 * Blend an I1 (1 bit per pixel, indexed) source image onto an RGB888/XRGB8888
 * destination. Each source bit is expanded to full black (0) or white (255)
 * on all three channels, then combined per the opacity/mask/blend-mode case.
 *
 * @param dsc          image blend descriptor (buffers, strides, opa, mask, mode)
 * @param dest_px_size destination pixel size in bytes (3 or 4)
 *
 * Normal blend mode tries the LV_DRAW_SW_I1_* assembly hooks first and falls
 * back to the C loops when they return LV_RESULT_INVALID; other blend modes
 * always use blend_non_normal_pixel().
 */
static void LV_ATTRIBUTE_FAST_MEM i1_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size)
{
int32_t w = dsc->dest_w;
int32_t h = dsc->dest_h;
lv_opa_t opa = dsc->opa;
uint8_t *dest_buf_u8 = dsc->dest_buf;
int32_t dest_stride = dsc->dest_stride;
const uint8_t *src_buf_i1 = dsc->src_buf;
int32_t src_stride = dsc->src_stride;
const lv_opa_t *mask_buf = dsc->mask_buf;
int32_t mask_stride = dsc->mask_stride;

// dest_x advances in bytes (by dest_px_size); src_x advances in pixels/bits
int32_t dest_x;
int32_t src_x;
int32_t y;

if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
if (mask_buf == NULL && opa >= LV_OPA_MAX) {
/* Full opacity, no mask */
if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888(dsc)) {
for (y = 0; y < h; y++) {
for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
// Expand the bit to 0x00 or 0xFF and write it to B, G and R
uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255;
dest_buf_u8[dest_x + 2] = chan_val;
dest_buf_u8[dest_x + 1] = chan_val;
dest_buf_u8[dest_x + 0] = chan_val;
}
dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride);
src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride);
}
}
} else if (mask_buf == NULL && opa < LV_OPA_MAX) {
/* Opacity only */
if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA(dsc)) {
for (y = 0; y < h; y++) {
for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255;
lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], opa);
}
dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride);
src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride);
}
}
} else if (mask_buf && opa >= LV_OPA_MAX) {
/* Mask only */
if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK(dsc)) {
for (y = 0; y < h; y++) {
for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255;
lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], mask_buf[src_x]);
}
dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride);
src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride);
mask_buf += mask_stride;
}
}
} else if (mask_buf && opa < LV_OPA_MAX) {
/* Mask combined with opacity */
if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA(dsc)) {
for (y = 0; y < h; y++) {
for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255;
lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x]));
}
dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride);
src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride);
mask_buf += mask_stride;
}
}
}
} else {
/* Non-normal blend modes: build an ARGB pixel and delegate per pixel */
for (y = 0; y < h; y++) {
for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
lv_color32_t src_argb;
src_argb.red = get_bit(src_buf_i1, src_x) * 255;
src_argb.green = src_argb.red;
src_argb.blue = src_argb.red;
if (mask_buf == NULL) {
src_argb.alpha = opa;
} else {
src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa);
}
blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode);
}
if (mask_buf) {
mask_buf += mask_stride;
}
dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride);
src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride);
}
}
}
/**
 * Blend an AL88 (8-bit luminance + 8-bit alpha) source image onto an
 * RGB888/XRGB8888 destination. The luminance is replicated to R, G and B;
 * the source alpha is combined with the global opacity and optional mask.
 *
 * @param dsc          image blend descriptor (buffers, strides, opa, mask, mode)
 * @param dest_px_size destination pixel size in bytes (3 or 4)
 *
 * Normal blend mode routes through the LV_DRAW_SW_L8_* hooks (the dsc carries
 * the AL88 format; the macros fall back to LV_RESULT_INVALID when no assembly
 * implementation is provided), then runs the C loops. Other blend modes use
 * blend_non_normal_pixel() per pixel.
 */
static void LV_ATTRIBUTE_FAST_MEM al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size)
{
    int32_t w = dsc->dest_w;
    int32_t h = dsc->dest_h;
    lv_opa_t opa = dsc->opa;
    uint8_t *dest_buf_u8 = dsc->dest_buf;
    int32_t dest_stride = dsc->dest_stride;
    const lv_color16a_t *src_buf_al88 = dsc->src_buf;
    int32_t src_stride = dsc->src_stride;
    const lv_opa_t *mask_buf = dsc->mask_buf;
    int32_t mask_stride = dsc->mask_stride;

    /* dest_x advances in bytes (by dest_px_size); src_x advances in pixels */
    int32_t dest_x;
    int32_t src_x;
    int32_t y;

    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
            /* Source alpha only */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], src_buf_al88[src_x].alpha);
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
                }
            }
        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
            /* Source alpha combined with global opacity */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa));
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
                }
            }
        } else if (mask_buf && opa >= LV_OPA_MAX) {
            /* Source alpha combined with mask */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha,
                                                                                                      mask_buf[src_x]));
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
                    mask_buf += mask_stride;
                }
            }
        } else if (mask_buf && opa < LV_OPA_MAX) {
            /* Source alpha combined with mask and global opacity */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX3(src_buf_al88[src_x].alpha,
                                                                                                      mask_buf[src_x], opa));
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
                    mask_buf += mask_stride;
                }
            }
        }
    } else {
        /* Non-normal blend modes: build an ARGB pixel and delegate per pixel */
        for (y = 0; y < h; y++) {
            for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                lv_color32_t src_argb;
                src_argb.red = src_argb.green = src_argb.blue = src_buf_al88[src_x].lumi;
                if (mask_buf == NULL) {
                    src_argb.alpha = LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa);
                } else {
                    /* FIX: was mask_buf[dest_x] — dest_x is a destination BYTE offset
                     * (steps by dest_px_size), while the mask holds one lv_opa_t per
                     * source pixel. Indexing by dest_x read wrong (and, for wide rows,
                     * out-of-range) mask entries; every other path here and
                     * i1_image_blend() index the mask by src_x. */
                    src_argb.alpha = LV_OPA_MIX3(src_buf_al88[src_x].alpha, mask_buf[src_x], opa);
                }
                blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode);
            }
            if (mask_buf) {
                mask_buf += mask_stride;
            }
            dest_buf_u8 += dest_stride;
            src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
        }
    }
}
/**
 * Blend an L8 (8-bit grayscale) source image onto an RGB888/XRGB8888 destination.
 *
 * @param dsc           blend descriptor: buffers, strides, area size, opacity, mask, blend mode
 * @param dest_px_size  destination pixel size in bytes (3 for RGB888, 4 for XRGB8888)
 *
 * For LV_BLEND_MODE_NORMAL each opacity/mask combination first tries the
 * LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888* hook and falls back to the C loop when the
 * hook returns LV_RESULT_INVALID. The mask holds one lv_opa_t per source pixel
 * (it is indexed by src_x in the NORMAL branches and advanced by mask_stride per row).
 *
 * Fix vs. previous revision: the non-normal branch indexed the mask with dest_x,
 * which advances by dest_px_size bytes per pixel, reading the wrong mask entry
 * (and past the row for any pixel after the first). It now uses the per-pixel
 * index src_x, consistent with the NORMAL branches and rgb565_image_blend().
 */
static void LV_ATTRIBUTE_FAST_MEM l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size)
{
    int32_t w = dsc->dest_w;
    int32_t h = dsc->dest_h;
    lv_opa_t opa = dsc->opa;
    uint8_t *dest_buf_u8 = dsc->dest_buf;
    int32_t dest_stride = dsc->dest_stride;
    const uint8_t *src_buf_l8 = dsc->src_buf;
    int32_t src_stride = dsc->src_stride;
    const lv_opa_t *mask_buf = dsc->mask_buf;
    int32_t mask_stride = dsc->mask_stride;
    int32_t dest_x;
    int32_t src_x;
    int32_t y;
    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
            /* Fully opaque, no mask: replicate the gray level into B, G, R */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        dest_buf_u8[dest_x + 2] = src_buf_l8[src_x];
                        dest_buf_u8[dest_x + 1] = src_buf_l8[src_x];
                        dest_buf_u8[dest_x + 0] = src_buf_l8[src_x];
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
                }
            }
        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
            /* Constant opacity, no mask */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], opa);
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
                }
            }
        } else if (mask_buf && opa >= LV_OPA_MAX) {
            /* Per-pixel mask only */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], mask_buf[src_x]);
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
                    mask_buf += mask_stride;
                }
            }
        } else if (mask_buf && opa < LV_OPA_MAX) {
            /* Per-pixel mask combined with constant opacity */
            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x]));
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
                    mask_buf += mask_stride;
                }
            }
        }
    } else {
        /* Additive / subtractive / multiply: build an ARGB pixel and defer to the generic helper */
        lv_color32_t src_argb;
        for (y = 0; y < h; y++) {
            for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                src_argb.red = src_buf_l8[src_x];
                src_argb.green = src_buf_l8[src_x];
                src_argb.blue = src_buf_l8[src_x];
                if (mask_buf == NULL) {
                    src_argb.alpha = opa;
                } else {
                    /* Bug fix: index the mask per pixel (src_x); dest_x is a byte offset */
                    src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa);
                }
                blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode);
            }
            if (mask_buf) {
                mask_buf += mask_stride;
            }
            dest_buf_u8 += dest_stride;
            src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
        }
    }
}
/**
 * Blend an RGB565 source image onto an RGB888/XRGB8888 destination.
 *
 * @param dsc           blend descriptor: buffers, strides, area size, opacity, mask, blend mode
 * @param dest_px_size  destination pixel size in bytes (3 for RGB888, 4 for XRGB8888)
 *
 * Channel expansion uses fixed-point scaling: 5-bit channels are multiplied by
 * 2106 (~(255/31) << 8 with rounding) and 6-bit channels by 1037 (~(255/63) << 8),
 * then shifted right by 8. Each NORMAL-mode branch first tries the
 * LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888* hook and runs the C loop only when
 * the hook returns LV_RESULT_INVALID.
 */
static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size)
{
    int32_t w = dsc->dest_w;
    int32_t h = dsc->dest_h;
    lv_opa_t opa = dsc->opa;
    uint8_t *dest_buf_u8 = dsc->dest_buf;
    int32_t dest_stride = dsc->dest_stride;
    const lv_color16_t *src_buf_c16 = (const lv_color16_t *) dsc->src_buf;
    int32_t src_stride = dsc->src_stride;
    const lv_opa_t *mask_buf = dsc->mask_buf;
    int32_t mask_stride = dsc->mask_stride;
    int32_t src_x;
    int32_t dest_x;
    int32_t y;
    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
            /* Fully opaque, no mask: direct expansion, written B, G, R (LSB first) */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        dest_buf_u8[dest_x + 2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/
                        dest_buf_u8[dest_x + 1] = (src_buf_c16[src_x].green * 1037) >> 8;
                        dest_buf_u8[dest_x + 0] = (src_buf_c16[src_x].blue * 2106) >> 8;
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
                }
            }
        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
            /* Constant opacity: expand into a temporary BGR triple, then mix */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) {
                uint8_t res[3];
                for (y = 0; y < h; y++) {
                    for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/
                        res[1] = (src_buf_c16[src_x].green * 1037) >> 8;
                        res[0] = (src_buf_c16[src_x].blue * 2106) >> 8;
                        lv_color_24_24_mix(res, &dest_buf_u8[dest_x], opa);
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
                }
            }
        } else if (mask_buf && opa >= LV_OPA_MAX) {
            /* Per-pixel mask only: mask holds one lv_opa_t per pixel, advanced by mask_stride per row */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) {
                uint8_t res[3];
                for (y = 0; y < h; y++) {
                    for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/
                        res[1] = (src_buf_c16[src_x].green * 1037) >> 8;
                        res[0] = (src_buf_c16[src_x].blue * 2106) >> 8;
                        lv_color_24_24_mix(res, &dest_buf_u8[dest_x], mask_buf[src_x]);
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
                    mask_buf += mask_stride;
                }
            }
        } else {
            /* Per-pixel mask combined with constant opacity */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) {
                uint8_t res[3];
                for (y = 0; y < h; y++) {
                    for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/
                        res[1] = (src_buf_c16[src_x].green * 1037) >> 8;
                        res[0] = (src_buf_c16[src_x].blue * 2106) >> 8;
                        lv_color_24_24_mix(res, &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x]));
                    }
                    dest_buf_u8 += dest_stride;
                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
                    mask_buf += mask_stride;
                }
            }
        }
    } else {
        /* Additive / subtractive / multiply: expand to ARGB and defer to the generic helper */
        lv_color32_t src_argb;
        for (y = 0; y < h; y++) {
            for (src_x = 0, dest_x = 0; src_x < w; src_x++, dest_x += dest_px_size) {
                src_argb.red = (src_buf_c16[src_x].red * 2106) >> 8;
                src_argb.green = (src_buf_c16[src_x].green * 1037) >> 8;
                src_argb.blue = (src_buf_c16[src_x].blue * 2106) >> 8;
                if (mask_buf == NULL) {
                    src_argb.alpha = opa;
                } else {
                    src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa);
                }
                blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode);
            }
            if (mask_buf) {
                mask_buf += mask_stride;
            }
            dest_buf_u8 += dest_stride;
            src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
        }
    }
}
/**
 * Blend an RGB888/XRGB8888 source image onto an RGB888/XRGB8888 destination.
 *
 * @param dsc           blend descriptor: buffers, strides, area size, opacity, mask, blend mode
 * @param dest_px_size  destination pixel size in bytes (3 or 4)
 * @param src_px_size   source pixel size in bytes (3 or 4)
 *
 * Note: unlike the other *_image_blend() helpers, 'w' here is a BYTE width
 * (dest_w * dest_px_size) and both dest_x and src_x advance by their pixel
 * sizes, so neither is a per-pixel index. The mask holds one lv_opa_t per
 * pixel, so mask access needs its own counter (mask_x) — the WITH_MASK and
 * MIX_MASK_OPA branches already did this.
 *
 * Fix vs. previous revision: the non-normal branch read mask_buf[dest_x]
 * (a byte offset), picking the wrong mask entry and reading past the mask row;
 * it now uses a mask_x pixel counter like the NORMAL mask branches.
 */
static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, const uint8_t dest_px_size,
                                                     uint32_t src_px_size)
{
    int32_t w = dsc->dest_w * dest_px_size;
    int32_t h = dsc->dest_h;
    lv_opa_t opa = dsc->opa;
    uint8_t *dest_buf = dsc->dest_buf;
    int32_t dest_stride = dsc->dest_stride;
    const uint8_t *src_buf = dsc->src_buf;
    int32_t src_stride = dsc->src_stride;
    const lv_opa_t *mask_buf = dsc->mask_buf;
    int32_t mask_stride = dsc->mask_stride;
    int32_t dest_x;
    int32_t src_x;
    int32_t y;
    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
        /*Special case*/
        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size)) {
                if (src_px_size == dest_px_size) {
                    /* Same layout: whole rows can be copied at once */
                    for (y = 0; y < h; y++) {
                        lv_memcpy(dest_buf, src_buf, w);
                        dest_buf += dest_stride;
                        src_buf += src_stride;
                    }
                } else {
                    /* Different pixel sizes: copy B, G, R per pixel, skip padding bytes */
                    for (y = 0; y < h; y++) {
                        for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) {
                            dest_buf[dest_x + 0] = src_buf[src_x + 0];
                            dest_buf[dest_x + 1] = src_buf[src_x + 1];
                            dest_buf[dest_x + 2] = src_buf[src_x + 2];
                        }
                        dest_buf += dest_stride;
                        src_buf += src_stride;
                    }
                }
            }
        }
        if (mask_buf == NULL && opa < LV_OPA_MAX) {
            /* Constant opacity, no mask */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size, src_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) {
                        lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], opa);
                    }
                    dest_buf += dest_stride;
                    src_buf += src_stride;
                }
            }
        }
        if (mask_buf && opa >= LV_OPA_MAX) {
            /* Per-pixel mask only */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size, src_px_size)) {
                uint32_t mask_x;
                for (y = 0; y < h; y++) {
                    for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) {
                        lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], mask_buf[mask_x]);
                    }
                    dest_buf += dest_stride;
                    src_buf += src_stride;
                    mask_buf += mask_stride;
                }
            }
        }
        if (mask_buf && opa < LV_OPA_MAX) {
            /* Per-pixel mask combined with constant opacity */
            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size, src_px_size)) {
                uint32_t mask_x;
                for (y = 0; y < h; y++) {
                    for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) {
                        lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], LV_OPA_MIX2(opa, mask_buf[mask_x]));
                    }
                    dest_buf += dest_stride;
                    src_buf += src_stride;
                    mask_buf += mask_stride;
                }
            }
        }
    } else {
        /* Additive / subtractive / multiply */
        lv_color32_t src_argb;
        uint32_t mask_x;
        for (y = 0; y < h; y++) {
            for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) {
                src_argb.red = src_buf[src_x + 2];
                src_argb.green = src_buf[src_x + 1];
                src_argb.blue = src_buf[src_x + 0];
                if (mask_buf == NULL) {
                    src_argb.alpha = opa;
                } else {
                    /* Bug fix: use the per-pixel mask index, not the byte offset dest_x */
                    src_argb.alpha = LV_OPA_MIX2(mask_buf[mask_x], opa);
                }
                blend_non_normal_pixel(&dest_buf[dest_x], src_argb, dsc->blend_mode);
            }
            if (mask_buf) {
                mask_buf += mask_stride;
            }
            dest_buf += dest_stride;
            src_buf += src_stride;
        }
    }
}
/**
 * Blend an ARGB8888 source image onto an RGB888/XRGB8888 destination.
 *
 * @param dsc           blend descriptor: buffers, strides, area size, opacity, mask, blend mode
 * @param dest_px_size  destination pixel size in bytes (3 for RGB888, 4 for XRGB8888)
 *
 * The source alpha channel is always folded into the effective opacity
 * (LV_OPA_MIX2/LV_OPA_MIX3). The mask holds one lv_opa_t per pixel, indexed
 * by src_x and advanced by mask_stride per row.
 *
 * Fix vs. previous revision: the non-normal branch indexed the mask with
 * dest_x, which advances by dest_px_size bytes per pixel; it now uses the
 * per-pixel index src_x, consistent with the NORMAL branches.
 */
static void LV_ATTRIBUTE_FAST_MEM argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size)
{
    int32_t w = dsc->dest_w;
    int32_t h = dsc->dest_h;
    lv_opa_t opa = dsc->opa;
    uint8_t *dest_buf = dsc->dest_buf;
    int32_t dest_stride = dsc->dest_stride;
    const lv_color32_t *src_buf_c32 = dsc->src_buf;
    int32_t src_stride = dsc->src_stride;
    const lv_opa_t *mask_buf = dsc->mask_buf;
    int32_t mask_stride = dsc->mask_stride;
    int32_t dest_x;
    int32_t src_x;
    int32_t y;
    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
            /* No extra opacity: mix by the source pixel's own alpha */
            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], src_buf_c32[src_x].alpha);
                    }
                    dest_buf += dest_stride;
                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
                }
            }
        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
            /* Source alpha scaled by constant opacity */
            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], LV_OPA_MIX2(src_buf_c32[src_x].alpha, opa));
                    }
                    dest_buf += dest_stride;
                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
                }
            }
        } else if (mask_buf && opa >= LV_OPA_MAX) {
            /* Source alpha scaled by the per-pixel mask */
            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x],
                                           LV_OPA_MIX2(src_buf_c32[src_x].alpha, mask_buf[src_x]));
                    }
                    dest_buf += dest_stride;
                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
                    mask_buf += mask_stride;
                }
            }
        } else if (mask_buf && opa < LV_OPA_MAX) {
            /* Source alpha, mask and constant opacity all combined */
            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) {
                for (y = 0; y < h; y++) {
                    for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                        lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x],
                                           LV_OPA_MIX3(src_buf_c32[src_x].alpha, mask_buf[src_x], opa));
                    }
                    dest_buf += dest_stride;
                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
                    mask_buf += mask_stride;
                }
            }
        }
    } else {
        /* Additive / subtractive / multiply */
        lv_color32_t src_argb;
        for (y = 0; y < h; y++) {
            for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) {
                src_argb = src_buf_c32[src_x];
                if (mask_buf == NULL) {
                    src_argb.alpha = LV_OPA_MIX2(src_argb.alpha, opa);
                } else {
                    /* Bug fix: index the mask per pixel (src_x); dest_x is a byte offset */
                    src_argb.alpha = LV_OPA_MIX3(src_argb.alpha, mask_buf[src_x], opa);
                }
                blend_non_normal_pixel(&dest_buf[dest_x], src_argb, dsc->blend_mode);
            }
            if (mask_buf) {
                mask_buf += mask_stride;
            }
            dest_buf += dest_stride;
            src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
        }
    }
}
/**
 * Apply a non-normal blend mode to a single destination pixel.
 *
 * @param dest  destination pixel bytes, stored B, G, R
 * @param src   source color; its alpha is used for the final mix into dest
 * @param mode  one of LV_BLEND_MODE_ADDITIVE / SUBTRACTIVE / MULTIPLY
 *
 * Unsupported modes log a warning and leave dest unchanged.
 */
static inline void LV_ATTRIBUTE_FAST_MEM blend_non_normal_pixel(uint8_t *dest, lv_color32_t src, lv_blend_mode_t mode)
{
    uint8_t res[3] = {0, 0, 0};
    if (mode == LV_BLEND_MODE_ADDITIVE) {
        /* Saturating per-channel sum */
        res[0] = LV_MIN(dest[0] + src.blue, 255);
        res[1] = LV_MIN(dest[1] + src.green, 255);
        res[2] = LV_MIN(dest[2] + src.red, 255);
    } else if (mode == LV_BLEND_MODE_SUBTRACTIVE) {
        /* Per-channel difference clamped at zero */
        res[0] = LV_MAX(dest[0] - src.blue, 0);
        res[1] = LV_MAX(dest[1] - src.green, 0);
        res[2] = LV_MAX(dest[2] - src.red, 0);
    } else if (mode == LV_BLEND_MODE_MULTIPLY) {
        /* Per-channel product, scaled back by >> 8 */
        res[0] = (dest[0] * src.blue) >> 8;
        res[1] = (dest[1] * src.green) >> 8;
        res[2] = (dest[2] * src.red) >> 8;
    } else {
        LV_LOG_WARN("Not supported blend mode: %d", mode);
        return;
    }
    /* Blend the computed color into dest weighted by the source alpha */
    lv_color_24_24_mix(res, dest, src.alpha);
}
/**
 * Mix an 8-bit gray value into a 3-byte (B, G, R) destination pixel.
 *
 * @param src   gray level applied to all three channels
 * @param dest  destination pixel bytes (at least 3 writable bytes)
 * @param mix   weight of src: 0 keeps dest, >= LV_OPA_MAX overwrites it
 */
static inline void LV_ATTRIBUTE_FAST_MEM lv_color_8_24_mix(const uint8_t src, uint8_t *dest, uint8_t mix)
{
    if (mix == 0) {
        return;                       /* fully transparent: nothing to do */
    }
    if (mix >= LV_OPA_MAX) {
        /* (Nearly) opaque: replicate the gray level */
        dest[0] = src;
        dest[1] = src;
        dest[2] = src;
        return;
    }
    /* Weighted average, approximated with >> 8 instead of / 255 */
    lv_opa_t inv = 255 - mix;
    int32_t ch;
    for (ch = 0; ch < 3; ch++) {
        dest[ch] = (uint32_t)((uint32_t)src * mix + dest[ch] * inv) >> 8;
    }
}
/**
 * Mix a 3-byte source pixel into a 3-byte destination pixel (both B, G, R).
 *
 * @param src   source pixel bytes (at least 3 readable bytes)
 * @param dest  destination pixel bytes (at least 3 writable bytes)
 * @param mix   weight of src: 0 keeps dest, >= LV_OPA_MAX copies src verbatim
 */
static inline void LV_ATTRIBUTE_FAST_MEM lv_color_24_24_mix(const uint8_t *src, uint8_t *dest, uint8_t mix)
{
    if (mix == 0) {
        return;                       /* fully transparent: nothing to do */
    }
    if (mix >= LV_OPA_MAX) {
        /* (Nearly) opaque: straight copy of the three channels */
        dest[0] = src[0];
        dest[1] = src[1];
        dest[2] = src[2];
        return;
    }
    /* Weighted average, approximated with >> 8 instead of / 255 */
    lv_opa_t inv = 255 - mix;
    int32_t ch;
    for (ch = 0; ch < 3; ch++) {
        dest[ch] = (uint32_t)((uint32_t)src[ch] * mix + dest[ch] * inv) >> 8;
    }
}
/**
 * Return bit 'bit_idx' of a packed bit buffer, MSB-first within each byte.
 *
 * @param buf      packed bit buffer
 * @param bit_idx  zero-based bit index (expected non-negative)
 * @return 0 or 1
 */
static inline uint8_t LV_ATTRIBUTE_FAST_MEM get_bit(const uint8_t *buf, int32_t bit_idx)
{
    const uint8_t byte = buf[bit_idx / 8];    /* byte holding the bit */
    const int32_t shift = 7 - (bit_idx % 8);  /* MSB-first ordering */
    return (byte >> shift) & 1;
}
/**
 * Advance a draw-buffer pointer by one row.
 *
 * @param buf     pointer to the current row
 * @param stride  row stride in bytes
 * @return pointer to the next row
 */
static inline void *LV_ATTRIBUTE_FAST_MEM drawbuf_next_row(const void *buf, uint32_t stride)
{
    const uint8_t *row = (const uint8_t *)buf;
    return (void *)(row + stride);
}

View File

@ -0,0 +1,187 @@
/*
* SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*
* This file is derived from the LVGL project.
* See https://github.com/lvgl/lvgl for details.
*/
/**
* @file lv_string.c
*/
/*********************
* INCLUDES
*********************/
//#include "../../lv_conf_internal.h"
#if LV_USE_STDLIB_STRING == LV_STDLIB_BUILTIN
#include "lv_assert.h"
#include "lv_log.h"
#include "lv_math.h"
#include "lv_string.h"
/*********************
* DEFINES
*********************/
#ifdef LV_ARCH_64
#define MEM_UNIT uint64_t
#define ALIGN_MASK 0x7
#else
#define MEM_UNIT uint32_t
#define ALIGN_MASK 0x3
#endif
#define LV_ATTRIBUTE_FAST_MEM
/**********************
* TYPEDEFS
**********************/
/**********************
* STATIC PROTOTYPES
**********************/
/**********************
* STATIC VARIABLES
**********************/
/**********************
* MACROS
**********************/
#if LV_USE_LOG && LV_LOG_TRACE_MEM
#define LV_TRACE_MEM(...) LV_LOG_TRACE(__VA_ARGS__)
#else
#define LV_TRACE_MEM(...)
#endif
#define _COPY(d, s) *d = *s; d++; s++;
#define _SET(d, v) *d = v; d++;
#define _REPEAT8(expr) expr expr expr expr expr expr expr expr
/**********************
* GLOBAL FUNCTIONS
**********************/
/**
 * Copy 'len' bytes from 'src' to 'dst' and return 'dst'.
 *
 * Strategy:
 *  - < 16 bytes: plain byte loop.
 *  - src/dst misaligned relative to each other: unrolled byte copy (32 bytes per turn).
 *  - otherwise: align dst to the word boundary, copy 32-bit words in unrolled
 *    chunks of 32 bytes, then finish byte-wise.
 *
 * NOTE(review): like memcpy(), this assumes the ranges do not overlap —
 * lv_memmove() handles the overlapping case.
 */
void *LV_ATTRIBUTE_FAST_MEM lv_memcpy(void *dst, const void *src, size_t len)
{
    uint8_t *d8 = dst;
    const uint8_t *s8 = src;
    /*Simplify for small memories*/
    if (len < 16) {
        while (len) {
            *d8 = *s8;
            d8++;
            s8++;
            len--;
        }
        return dst;
    }
    lv_uintptr_t d_align = (lv_uintptr_t)d8 & ALIGN_MASK;
    lv_uintptr_t s_align = (lv_uintptr_t)s8 & ALIGN_MASK;
    /*Byte copy for unaligned memories*/
    /* Word access is impossible when the two pointers have different offsets
     * within a word, so stay byte-wise but unroll 4 x 8 copies per 32-byte chunk */
    if (s_align != d_align) {
        while (len > 32) {
            _REPEAT8(_COPY(d8, s8));
            _REPEAT8(_COPY(d8, s8));
            _REPEAT8(_COPY(d8, s8));
            _REPEAT8(_COPY(d8, s8));
            len -= 32;
        }
        while (len) {
            _COPY(d8, s8)
            len--;
        }
        return dst;
    }
    /*Make the memories aligned*/
    if (d_align) {
        /* Copy the leading bytes up to the next word boundary */
        d_align = ALIGN_MASK + 1 - d_align;
        while (d_align && len) {
            _COPY(d8, s8);
            d_align--;
            len--;
        }
    }
    /* Both pointers are now word aligned: copy 8 words (32 bytes) per turn */
    uint32_t *d32 = (uint32_t *)d8;
    const uint32_t *s32 = (uint32_t *)s8;
    while (len > 32) {
        _REPEAT8(_COPY(d32, s32))
        len -= 32;
    }
    /* Copy the remaining tail byte-wise */
    d8 = (uint8_t *)d32;
    s8 = (const uint8_t *)s32;
    while (len) {
        _COPY(d8, s8)
        len--;
    }
    return dst;
}
/**
 * Fill 'len' bytes at 'dst' with the byte value 'v'.
 *
 * Aligns the destination to a word boundary first, then stores the value
 * word-wide in 32-byte chunks, and finishes the tail byte by byte.
 */
void LV_ATTRIBUTE_FAST_MEM lv_memset(void *dst, uint8_t v, size_t len)
{
    uint8_t *d8 = (uint8_t *)dst;
    uintptr_t head = (lv_uintptr_t) d8 & ALIGN_MASK;
    /* Byte stores until the pointer is word aligned */
    if (head) {
        head = ALIGN_MASK + 1 - head;
        while (head && len) {
            *d8++ = v;
            len--;
            head--;
        }
    }
    /* Replicate the byte into all four lanes of a 32-bit pattern */
    uint32_t v32 = (uint32_t)v * 0x01010101U;
    uint32_t *d32 = (uint32_t *)d8;
    /* Store 8 words (32 bytes) per iteration */
    while (len > 32) {
        d32[0] = v32;
        d32[1] = v32;
        d32[2] = v32;
        d32[3] = v32;
        d32[4] = v32;
        d32[5] = v32;
        d32[6] = v32;
        d32[7] = v32;
        d32 += 8;
        len -= 32;
    }
    /* Remaining tail bytes */
    d8 = (uint8_t *)d32;
    while (len) {
        *d8++ = v;
        len--;
    }
}
/**
 * Copy 'len' bytes from 'src' to 'dst', handling overlapping ranges, and
 * return 'dst'.
 *
 * Non-overlapping ranges are delegated to lv_memcpy(). When dst overlaps the
 * tail of src the copy runs backwards so source bytes are read before being
 * overwritten; otherwise a forward byte copy is safe.
 */
void *LV_ATTRIBUTE_FAST_MEM lv_memmove(void *dst, const void *src, size_t len)
{
    /* No overlap (dst entirely below src, or entirely above src + len) */
    if (dst < src || (char *)dst > ((char *)src + len)) {
        return lv_memcpy(dst, src, len);
    }
    if (dst > src) {
        /* dst overlaps the end of src: copy from the last byte backwards */
        char *d = (char *)dst + len - 1;
        char *s = (char *)src + len - 1;
        while (len--) {
            *d-- = *s--;
        }
    } else {
        /* dst at or below src: forward copy cannot clobber unread bytes */
        char *d = (char *)dst;
        char *s = (char *)src;
        while (len--) {
            *d++ = *s++;
        }
    }
    return dst;
}
/**********************
* STATIC FUNCTIONS
**********************/
#endif /*LV_STDLIB_BUILTIN*/

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -42,7 +42,8 @@ typedef struct {
void *p_asm_alloc; // pointer to the beginning of the memory allocated for ASM test buf, used in free() void *p_asm_alloc; // pointer to the beginning of the memory allocated for ASM test buf, used in free()
void *p_ansi_alloc; // pointer to the beginning of the memory allocated for ANSI test buf, used in free() void *p_ansi_alloc; // pointer to the beginning of the memory allocated for ANSI test buf, used in free()
} buf; } buf;
void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function
void (*blend_api_px_func)(_lv_draw_sw_blend_fill_dsc_t *, uint32_t); // pointer to LVGL API function with dest_px_size argument
lv_color_format_t color_format; // LV color format lv_color_format_t color_format; // LV color format
size_t data_type_size; // Used data type size, eg sizeof() size_t data_type_size; // Used data type size, eg sizeof()
size_t active_buf_len; // Length of buffer, where the actual data are stored (not including Canary bytes) size_t active_buf_len; // Length of buffer, where the actual data are stored (not including Canary bytes)
@ -64,8 +65,9 @@ typedef struct {
unsigned int cc_width; // Corner case test array width unsigned int cc_width; // Corner case test array width
unsigned int benchmark_cycles; // Count of benchmark cycles unsigned int benchmark_cycles; // Count of benchmark cycles
void *array_align16; // test array with 16 byte alignment - testing most ideal case void *array_align16; // test array with 16 byte alignment - testing most ideal case
void *array_align1; // test array with 1 byte alignment - testing wort case void *array_align1; // test array with 1 byte alignment - testing worst case
void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function
void (*blend_api_px_func)(_lv_draw_sw_blend_fill_dsc_t *, uint32_t); // pointer to LVGL API function with dest_px_size argument
} bench_test_case_params_t; } bench_test_case_params_t;
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -0,0 +1,111 @@
/*
* SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
#include "esp_err.h"
#include <stdint.h>
#include "lv_color.h"
#include "lv_draw_sw_blend.h"
#ifdef __cplusplus
extern "C" {
#endif
// ------------------------------------------------- Macros and Types --------------------------------------------------
/**
* @brief Type of blend DUT function
*/
typedef enum {
    OPERATION_FILL,           /*!< Plain fill blend operation (full opacity) */
    OPERATION_FILL_WITH_OPA,  /*!< Fill blend operation with an opacity parameter */
} blend_operation_t;
/**
* @brief Canary pixels amount depending on data type
* @note
* - We should use at least 16 bytes of memory for canary pixels because of esp32s3 TIE 16-bytes wide Q registers
* - Canary pixels are multiplied by sizeof(used_data_type) to get the memory length occupied by the canary pixels
* - The memory occupied by canary pixels should be in 16-byte multiples, to achieve 16-byte memory alignment in functionality test
* - For example, ideally, for RGB565 we would need 8 canary pixels -> 8 * sizeof(uint16_t) = 16
*/
typedef enum {
CANARY_PIXELS_ARGB8888 = 4, /*!< Canary pixels: 4 * sizeof(uint32_t) = 16 */
CANARY_PIXELS_RGB565 = 8, /*!< Canary pixels: 8 * sizeof(uint16_t) = 16 */
} canary_pixels_t;
/**
* @brief Functionality test combinations for LV Image
*/
/* Test-matrix description: all width/height/alignment/stride combinations
 * generated for the LV Image functionality test. */
typedef struct {
    unsigned int min_w;                         /*!< Minimum width of the test array */
    unsigned int min_h;                         /*!< Minimum height of the test array */
    unsigned int max_w;                         /*!< Maximum width of the test array */
    unsigned int max_h;                         /*!< Maximum height of the test array */
    unsigned int src_min_unalign_byte;          /*!< Minimum amount of unaligned bytes of the source test array */
    unsigned int dest_min_unalign_byte;         /*!< Minimum amount of unaligned bytes of the destination test array */
    unsigned int src_max_unalign_byte;          /*!< Maximum amount of unaligned bytes of the source test array */
    unsigned int dest_max_unalign_byte;         /*!< Maximum amount of unaligned bytes of the destination test array */
    unsigned int src_unalign_step;              /*!< Increment step in bytes unalignment of the source test array */
    unsigned int dest_unalign_step;             /*!< Increment step in bytes unalignment of the destination test array */
    unsigned int src_stride_step;               /*!< Increment step in destination stride of the source test array */
    unsigned int dest_stride_step;              /*!< Increment step in destination stride of the destination test array */
    unsigned int test_combinations_count;       /*!< Count of test combinations */
} test_matrix_lv_image_params_t;
/**
* @brief Functionality test case parameters for LV Image
*/
typedef struct {
struct {
void *p_src; /*!< pointer to the source test buff (common src buffer for both the ANSI and ASM) */
void *p_src_alloc; /*!< pointer to the beginning of the memory allocated for the source ASM test buf, used in free() */
void *p_dest_asm; /*!< pointer to the destination ASM test buf */
void *p_dest_ansi; /*!< pointer to the destination ANSI test buf */
void *p_dest_asm_alloc; /*!< pointer to the beginning of the memory allocated for the destination ASM test buf, used in free() */
void *p_dest_ansi_alloc; /*!< pointer to the beginning of the memory allocated for the destination ANSI test buf, used in free() */
} buf;
void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *); /*!< pointer to LVGL API function */
lv_color_format_t color_format; /*!< LV color format */
size_t src_data_type_size; /*!< Used data type size in the source buffer, eg sizeof(src_buff[0]) */
size_t dest_data_type_size; /*!< Used data type size in the destination buffer, eg sizeof(dest_buff[0]) */
size_t src_buf_len; /*!< Length of the source buffer, including matrix padding (no Canary pixels are used for source buffer) */
size_t active_dest_buf_len; /*!< Length of the destination buffer, where the actual data are stored, including matrix padding, not including Canary pixels */
size_t total_dest_buf_len; /*!< Total length of the destination buffer (including Canary pixels and matrix padding) */
size_t canary_pixels; /*!< Canary pixels must be adjusted according to the used color type, to achieve aligned memory effect */
unsigned int dest_w; /*!< Destination buffer width */
unsigned int dest_h; /*!< Destination buffer height */
unsigned int src_stride; /*!< Source buffer stride */
unsigned int dest_stride; /*!< Destination buffer stride */
unsigned int src_unalign_byte; /*!< Source buffer memory unalignment */
unsigned int dest_unalign_byte; /*!< Destination buffer memory unalignment */
blend_operation_t operation_type; /*!< Type of fundamental blend operation */
} func_test_case_lv_image_params_t;
/**
* @brief Benchmark test case parameters for LV Image
*/
typedef struct {
unsigned int height; /*!< Test array height */
unsigned int width; /*!< Test array width */
unsigned int dest_stride; /*!< Destination test array stride */
unsigned int src_stride; /*!< Source test array stride */
unsigned int cc_height; /*!< Corner case test array height */
unsigned int cc_width; /*!< Corner case test array width */
unsigned int benchmark_cycles; /*!< Count of benchmark cycles */
void *src_array_align16; /*!< Source test array with 16 byte alignment - testing most ideal case */
void *src_array_align1; /*!< Source test array with 1 byte alignment - testing worst case */
void *dest_array_align16; /*!< Destination test array with 16 byte alignment - testing most ideal case */
void *dest_array_align1; /*!< Destination test array with 1 byte alignment - testing worst case */
void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *); /*!< pointer to LVGL API function */
} bench_test_case_lv_image_params_t;
#ifdef __cplusplus
} /*extern "C"*/
#endif

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -15,6 +15,7 @@
#include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend.h"
#include "lv_draw_sw_blend_to_argb8888.h" #include "lv_draw_sw_blend_to_argb8888.h"
#include "lv_draw_sw_blend_to_rgb565.h" #include "lv_draw_sw_blend_to_rgb565.h"
#include "lv_draw_sw_blend_to_rgb888.h"
#define WIDTH 128 #define WIDTH 128
#define HEIGHT 128 #define HEIGHT 128
@ -115,6 +116,31 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]")
lv_fill_benchmark_init(&test_params); lv_fill_benchmark_init(&test_params);
free(dest_array_align16); free(dest_array_align16);
} }
TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]")
{
uint8_t *dest_array_align16 = (uint8_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint8_t) * 3 + UNALIGN_BYTES);
TEST_ASSERT_NOT_EQUAL(NULL, dest_array_align16);
// Apply byte unalignment for the worst-case test scenario
uint8_t *dest_array_align1 = dest_array_align16 + UNALIGN_BYTES;
bench_test_case_params_t test_params = {
.height = HEIGHT,
.width = WIDTH,
.stride = STRIDE * 3,
.cc_height = HEIGHT - 1,
.cc_width = WIDTH - 1,
.benchmark_cycles = BENCHMARK_CYCLES,
.array_align16 = (void *)dest_array_align16,
.array_align1 = (void *)dest_array_align1,
.blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888,
};
ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB888 color format");
lv_fill_benchmark_init(&test_params);
free(dest_array_align16);
}
// ------------------------------------------------ Static test functions ---------------------------------------------- // ------------------------------------------------ Static test functions ----------------------------------------------
static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) static void lv_fill_benchmark_init(bench_test_case_params_t *test_params)
@ -162,11 +188,21 @@ static void lv_fill_benchmark_init(bench_test_case_params_t *test_params)
static float lv_fill_benchmark_run(bench_test_case_params_t *test_params, _lv_draw_sw_blend_fill_dsc_t *dsc) static float lv_fill_benchmark_run(bench_test_case_params_t *test_params, _lv_draw_sw_blend_fill_dsc_t *dsc)
{ {
// Call the DUT function for the first time to init the benchmark test // Call the DUT function for the first time to init the benchmark test
test_params->blend_api_func(dsc); if (test_params->blend_api_func != NULL) {
test_params->blend_api_func(dsc);
} else if (test_params->blend_api_px_func != NULL) {
test_params->blend_api_px_func(dsc, 3);
}
const unsigned int start_b = xthal_get_ccount(); const unsigned int start_b = xthal_get_ccount();
for (int i = 0; i < test_params->benchmark_cycles; i++) { if (test_params->blend_api_func != NULL) {
test_params->blend_api_func(dsc); for (int i = 0; i < test_params->benchmark_cycles; i++) {
test_params->blend_api_func(dsc);
}
} else if (test_params->blend_api_px_func != NULL) {
for (int i = 0; i < test_params->benchmark_cycles; i++) {
test_params->blend_api_px_func(dsc, 3);
}
} }
const unsigned int end_b = xthal_get_ccount(); const unsigned int end_b = xthal_get_ccount();

View File

@ -1,5 +1,5 @@
/* /*
* SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
*/ */
@ -13,6 +13,7 @@
#include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend.h"
#include "lv_draw_sw_blend_to_argb8888.h" #include "lv_draw_sw_blend_to_argb8888.h"
#include "lv_draw_sw_blend_to_rgb565.h" #include "lv_draw_sw_blend_to_rgb565.h"
#include "lv_draw_sw_blend_to_rgb888.h"
// ------------------------------------------------- Defines ----------------------------------------------------------- // ------------------------------------------------- Defines -----------------------------------------------------------
@ -47,14 +48,14 @@ static lv_color_t test_color = {
* - generate functionality test combinations, based on the provided test_matrix struct * - generate functionality test combinations, based on the provided test_matrix struct
* *
* @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations * @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations
* @param[in] test_case Pointer ot structure defining functionality test case * @param[in] test_case Pointer to structure defining functionality test case
*/ */
static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case); static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case);
/** /**
* @brief Fill test buffers for functionality test * @brief Fill test buffers for functionality test
* *
* @param[in] test_case Pointer ot structure defining functionality test case * @param[in] test_case Pointer to structure defining functionality test case
*/ */
static void fill_test_bufs(func_test_case_params_t *test_case); static void fill_test_bufs(func_test_case_params_t *test_case);
@ -63,24 +64,31 @@ static void fill_test_bufs(func_test_case_params_t *test_case);
* *
* - function prepares structures for functionality testing and runs the LVGL API * - function prepares structures for functionality testing and runs the LVGL API
* *
* @param[in] test_case Pointer ot structure defining functionality test case * @param[in] test_case Pointer to structure defining functionality test case
*/ */
static void lv_fill_functionality(func_test_case_params_t *test_case); static void lv_fill_functionality(func_test_case_params_t *test_case);
/** /**
* @brief Evaluate results for 32bit data length * @brief Evaluate results for 32bit data length
* *
* @param[in] test_case Pointer ot structure defining functionality test case * @param[in] test_case Pointer to structure defining functionality test case
*/ */
static void test_eval_32bit_data(func_test_case_params_t *test_case); static void test_eval_32bit_data(func_test_case_params_t *test_case);
/** /**
* @brief Evaluate results for 16bit data length * @brief Evaluate results for 16bit data length
* *
* @param[in] test_case Pointer ot structure defining functionality test case * @param[in] test_case Pointer to structure defining functionality test case
*/ */
static void test_eval_16bit_data(func_test_case_params_t *test_case); static void test_eval_16bit_data(func_test_case_params_t *test_case);
/**
* @brief Evaluate results for 24bit data length
*
* @param[in] test_case Pointer to structure defining functionality test case
*/
static void test_eval_24bit_data(func_test_case_params_t *test_case);
// ------------------------------------------------ Test cases --------------------------------------------------------- // ------------------------------------------------ Test cases ---------------------------------------------------------
/* /*
@ -147,6 +155,29 @@ TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
functionality_test_matrix(&test_matrix, &test_case); functionality_test_matrix(&test_matrix, &test_case);
} }
TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]")
{
    // Per-pixel blend API for the RGB888 color format, 3 bytes per pixel
    func_test_case_params_t test_case = {
        .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888,
        .color_format = LV_COLOR_FORMAT_RGB888,
        .data_type_size = sizeof(uint8_t) * 3, // 24-bit data length
    };

    // Test-matrix bounds: width 12 is the lower limit for the esp32s3 asm
    // implementation, below that the esp32 (ANSI) path is executed
    test_matrix_params_t test_matrix = {
        .min_w = 12,
        .min_h = 1,
        .max_w = 32,
        .max_h = 3,
        .min_unalign_byte = 0,
        .max_unalign_byte = 16,
        .unalign_step = 1,
        .dest_stride_step = 1,
        .test_combinations_count = 0,
    };

    ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB888 color format");
    functionality_test_matrix(&test_matrix, &test_case);
}
// ------------------------------------------------ Static test functions ---------------------------------------------- // ------------------------------------------------ Static test functions ----------------------------------------------
static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case) static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case)
@ -195,8 +226,13 @@ static void lv_fill_functionality(func_test_case_params_t *test_case)
dsc_ansi.dest_buf = test_case->buf.p_ansi; dsc_ansi.dest_buf = test_case->buf.p_ansi;
dsc_ansi.use_asm = false; dsc_ansi.use_asm = false;
test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code if (test_case->blend_api_func != NULL) {
test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code
test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code
} else if (test_case->blend_api_px_func != NULL) {
test_case->blend_api_px_func(&dsc_asm, 3); // Call the LVGL API with Assembly code with set pixel size
test_case->blend_api_px_func(&dsc_ansi, 3); // Call the LVGL API with ANSI code with set pixel size
}
// Shift array pointers by Canary Bytes amount back // Shift array pointers by Canary Bytes amount back
test_case->buf.p_asm -= CANARY_BYTES * test_case->data_type_size; test_case->buf.p_asm -= CANARY_BYTES * test_case->data_type_size;
@ -216,6 +252,11 @@ static void lv_fill_functionality(func_test_case_params_t *test_case)
break; break;
} }
case LV_COLOR_FORMAT_RGB888: {
test_eval_24bit_data(test_case);
break;
}
default: default:
TEST_ASSERT_MESSAGE(false, "LV Color format not found"); TEST_ASSERT_MESSAGE(false, "LV Color format not found");
} }
@ -309,3 +350,34 @@ static void test_eval_16bit_data(func_test_case_params_t *test_case)
TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf);
TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf);
} }
/* Evaluate functionality-test results for 24-bit (RGB888) data:
 * both Canary byte areas must remain zero and the assembly output
 * must match the ANSI reference byte for byte. */
static void test_eval_24bit_data(func_test_case_params_t *test_case)
{
    uint8_t *p_ansi = (uint8_t *)test_case->buf.p_ansi;
    uint8_t *p_asm = (uint8_t *)test_case->buf.p_asm;
    const size_t px_size = test_case->data_type_size;   // 3 bytes per RGB888 pixel
    const int canary_area = CANARY_BYTES * px_size;     // Canary region size in bytes

    // Optionally dump both buffers, one reconstructed 24-bit value per pixel
#if DBG_PRINT_OUTPUT
    size_t data_type_size = px_size;
    for (uint32_t i = 0; i < test_case->total_buf_len; i++) {
        const uint8_t *a = p_ansi + i * data_type_size;
        const uint8_t *b = p_asm + i * data_type_size;
        uint32_t ansi_value = a[0] | (a[1] << 8) | (a[2] << 16);
        uint32_t asm_value = b[0] | (b[1] << 8) | (b[2] << 16);
        printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx32" \t asm = %8"PRIx32" \n", i, ((i < 10) ? (" ") : ("")), ansi_value, asm_value);
    }
    printf("\n");
#endif

    // Leading Canary bytes area must stay 0
    TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, p_ansi, canary_area, test_msg_buf);
    TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, p_asm, canary_area, test_msg_buf);

    // dest_buf_asm and dest_buf_ansi must be equal over the active data area
    TEST_ASSERT_EQUAL_UINT8_ARRAY_MESSAGE(p_asm + canary_area, p_ansi + canary_area, test_case->active_buf_len * px_size, test_msg_buf);

    // Trailing Canary bytes area must stay 0
    TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, p_ansi + (test_case->total_buf_len - CANARY_BYTES) * px_size, canary_area, test_msg_buf);
    TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, p_asm + (test_case->total_buf_len - CANARY_BYTES) * px_size, canary_area, test_msg_buf);
}

View File

@ -0,0 +1,171 @@
/*
* SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <string.h>
#include <malloc.h>
#include <sdkconfig.h>
#include "unity.h"
#include "esp_log.h"
#include "freertos/FreeRTOS.h" // for xthal_get_ccount()
#include "lv_image_common.h"
#include "lv_draw_sw_blend.h"
#include "lv_draw_sw_blend_to_rgb565.h"
#define COMMON_DIM 128 // Common matrix dimension 128x128 pixels
#define WIDTH COMMON_DIM
#define HEIGHT COMMON_DIM
#define STRIDE WIDTH
#define UNALIGN_BYTES 3
#define BENCHMARK_CYCLES 1000
// ------------------------------------------------ Static variables ---------------------------------------------------
static const char *TAG_LV_IMAGE_BENCH = "LV Image Benchmark";
static const char *asm_ansi_func[] = {"ASM", "ANSI"};
// ------------------------------------------------ Static function headers --------------------------------------------
/**
* @brief Initialize the benchmark test
*/
static void lv_image_benchmark_init(bench_test_case_lv_image_params_t *test_params);
/**
* @brief Run the benchmark test
*/
static float lv_image_benchmark_run(bench_test_case_lv_image_params_t *test_params, _lv_draw_sw_blend_image_dsc_t *dsc);
// ------------------------------------------------ Test cases ---------------------------------------------------------
/*
Benchmark tests
Requires:
- To pass functionality tests first
Purpose:
- Test that an acceleration is achieved by an assembly implementation of LVGL blending API
Procedure:
- Initialize input parameters (test array length, width, allocate array...) of the benchmark test
- Run assembly version of LVGL blending API multiple times (1000-times or so)
- Firstly use an input test parameters for the most ideal case (16-byte aligned arrays, arrays widths divisible by 2 for RGB565 color format)
- Then use worst-case input test parameters (1-byte aligned arrays, arrays width NOT divisible by 2 for RGB565 color format)
- Count how many CPU cycles does it take to run a function from the LVGL blending API for each case (ideal and worst case)
- Run ansi version of LVGL blending API multiple times (1000-times or so) and repeat the 2 above steps for the ansi version
- Compare the results
- Free test arrays and structures needed for LVGL blending API
Inducing Most ideal and worst case scenarios:
- Most ideal:
- Both, the source and the destination buffers should be aligned by 16-byte (Xtensa PIE), or 4-byte (Xtensa base) boundaries
- Matrix width (in pixels) should be equal to the main loop length in the assembly src code
typically multiples of 16 bytes (for RGB565 it's either 32 bytes - 16 pixels or 48 bytes - 24 pixels)
- Matrix height does not have any effect on benchmark unit tests, until the matrix is so large that cache limitations start to affect the performance
- Matrix strides, should be equal to the matrix widths (0 matrix padding), or their multiples (matrix width = matrix padding)
- Worst case:
- Both, the source and the destination buffers should NOT be aligned by 16-byte (Xtensa PIE), or 4-byte (Xtensa base) boundaries,
Source buffer unalignment should be different from the destination unalignment, with one unalignment being even, the other being odd
The unalignments shall be small numbers (preferably 1 or 2 bytes)
- Matrix width should be one pixels smaller, than the matrix width for the most ideal case
- Matrix height does not have any effect on benchmark unit tests, until the matrix is so large that cache limitations start to affect the performance
- Matrix strides, should NOT be equal to the matrix widths (non 0 matrix padding)
*/
// ------------------------------------------------ Test cases stages --------------------------------------------------
TEST_CASE("LV Image benchmark RGB565 blend to RGB565", "[image][benchmark][RGB565]")
{
    const size_t buf_size = STRIDE * HEIGHT * sizeof(uint16_t) + UNALIGN_BYTES;

    // 16-byte aligned buffers for the ideal-case benchmark
    uint16_t *dest_aligned = (uint16_t *)memalign(16, buf_size);
    uint16_t *src_aligned = (uint16_t *)memalign(16, buf_size);
    TEST_ASSERT_NOT_EQUAL(NULL, dest_aligned);
    TEST_ASSERT_NOT_EQUAL(NULL, src_aligned);

    // Derive byte-unaligned views (a different offset for each buffer) for the worst-case scenario
    uint16_t *dest_unaligned = (uint16_t *)((uint8_t *)dest_aligned + UNALIGN_BYTES - 1);
    uint16_t *src_unaligned = (uint16_t *)((uint8_t *)src_aligned + UNALIGN_BYTES);

    bench_test_case_lv_image_params_t test_params = {
        .height = HEIGHT,
        .width = WIDTH,
        .dest_stride = STRIDE * sizeof(uint16_t),
        .src_stride = STRIDE * sizeof(uint16_t),
        .cc_height = HEIGHT,
        .cc_width = WIDTH - 1,  // corner case: one pixel narrower
        .benchmark_cycles = BENCHMARK_CYCLES,
        .src_array_align16 = (void *)src_aligned,
        .src_array_align1 = (void *)src_unaligned,
        .dest_array_align16 = (void *)dest_aligned,
        .dest_array_align1 = (void *)dest_unaligned,
        .blend_api_func = &lv_draw_sw_blend_image_to_rgb565,
    };

    ESP_LOGI(TAG_LV_IMAGE_BENCH, "running test for RGB565 color format");
    lv_image_benchmark_init(&test_params);

    free(dest_aligned);
    free(src_aligned);
}
// ------------------------------------------------ Static test functions ----------------------------------------------
/**
 * @brief Run the image-blend benchmark for both the ideal and the corner case
 *
 * Builds two blend descriptors (ideal: 16-byte aligned buffers, full width;
 * corner: unaligned buffers, reduced width) and runs each of them twice:
 * first with the Assembly implementation, then with the ANSI C one.
 *
 * @param[in] test_params Pointer to structure with the benchmark parameters
 */
static void lv_image_benchmark_init(bench_test_case_lv_image_params_t *test_params)
{
    // Init structure for LVGL blend API — ideal case: 16-byte aligned buffers,
    // full width; Assembly first (use_asm is flipped to false for the 2nd pass)
    _lv_draw_sw_blend_image_dsc_t dsc = {
        .dest_buf = test_params->dest_array_align16,
        .dest_w = test_params->width,
        .dest_h = test_params->height,
        .dest_stride = test_params->dest_stride, // stride * sizeof()
        .mask_buf = NULL,
        .src_buf = test_params->src_array_align16,
        .src_stride = test_params->src_stride,
        .src_color_format = LV_COLOR_FORMAT_RGB565,
        .opa = LV_OPA_MAX,
        .blend_mode = LV_BLEND_MODE_NORMAL,
        .use_asm = true,
    };

    // Init structure for LVGL blend API — corner (worst) case: unaligned
    // buffers and a one-pixel-narrower destination. NOTE: this is NOT the
    // ANSI descriptor; both descriptors start with use_asm = true and are
    // switched to ANSI together at the end of the first loop pass below.
    _lv_draw_sw_blend_image_dsc_t dsc_cc = dsc;
    dsc_cc.dest_buf = test_params->dest_array_align1;
    dsc_cc.dest_w = test_params->cc_width;
    dsc_cc.dest_h = test_params->cc_height;
    dsc_cc.src_buf = test_params->src_array_align1;

    // Run benchmark 2 times:
    // First run using assembly, second run using ANSI
    for (int i = 0; i < 2; i++) {
        // Run benchmark with the most ideal input parameters
        float cycles = lv_image_benchmark_run(test_params, &dsc); // Call Benchmark cycle
        float per_sample = cycles / ((float)(dsc.dest_w * dsc.dest_h));
        ESP_LOGI(TAG_LV_IMAGE_BENCH, " %s ideal case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample", asm_ansi_func[i], cycles, dsc.dest_w, dsc.dest_h, per_sample);

        // Run benchmark with the corner case input parameters
        cycles = lv_image_benchmark_run(test_params, &dsc_cc); // Call Benchmark cycle
        per_sample = cycles / ((float)(dsc_cc.dest_w * dsc_cc.dest_h));
        ESP_LOGI(TAG_LV_IMAGE_BENCH, " %s corner case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample\n", asm_ansi_func[i], cycles, dsc_cc.dest_w, dsc_cc.dest_h, per_sample);

        // change to ANSI
        dsc.use_asm = false;
        dsc_cc.use_asm = false;
    }
}
/* Time benchmark_cycles repetitions of the blend API call with the CPU
 * cycle counter and return the average cycle count per single call. */
static float lv_image_benchmark_run(bench_test_case_lv_image_params_t *test_params, _lv_draw_sw_blend_image_dsc_t *dsc)
{
    // Warm-up call to init the benchmark test before timing starts
    test_params->blend_api_func(dsc);

    const unsigned int t_start = xthal_get_ccount();
    for (int rep = 0; rep < test_params->benchmark_cycles; rep++) {
        test_params->blend_api_func(dsc);
    }
    const unsigned int t_end = xthal_get_ccount();

    // Average cycles per API call
    return ((float)(t_end - t_start)) / (test_params->benchmark_cycles);
}

View File

@ -0,0 +1,351 @@
/*
* SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <string.h>
#include <malloc.h>
#include <inttypes.h>
#include "sdkconfig.h"
#include "unity.h"
#include "esp_log.h"
#include "lv_image_common.h"
#include "lv_draw_sw_blend.h"
#include "lv_draw_sw_blend_to_rgb565.h"
// ------------------------------------------------- Defines -----------------------------------------------------------
#define DBG_PRINT_OUTPUT false
// ------------------------------------------------- Macros and Types --------------------------------------------------
#define UPDATE_TEST_CASE(test_case_ptr, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte) ({ \
(test_case_ptr)->src_buf_len = (size_t)(dest_h * src_stride); \
(test_case_ptr)->active_dest_buf_len = (size_t)(dest_h * dest_stride); \
(test_case_ptr)->total_dest_buf_len = (size_t)((dest_h * dest_stride) + (test_case_ptr->canary_pixels * 2)); \
(test_case_ptr)->dest_w = (dest_w); \
(test_case_ptr)->dest_h = (dest_h); \
(test_case_ptr)->src_stride = (src_stride); \
(test_case_ptr)->dest_stride = (dest_stride); \
(test_case_ptr)->src_unalign_byte = (src_unalign_byte); \
(test_case_ptr)->dest_unalign_byte = (dest_unalign_byte); \
})
// ------------------------------------------------ Static variables ---------------------------------------------------
static const char *TAG_LV_IMAGE_FUNC = "LV Image Functionality";
static char test_msg_buf[200];
static const test_matrix_lv_image_params_t default_test_matrix_image_rgb565_blend_rgb565 = {
#if CONFIG_IDF_TARGET_ESP32S3
.min_w = 8, // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
.min_h = 1,
.max_w = 24,
.max_h = 2,
.src_max_unalign_byte = 16, // Use 16-byte boundary check for Xtensa PIE
.dest_max_unalign_byte = 16,
.dest_unalign_step = 1, // Step 1 as the destination array is being aligned in the assembly code all the time
.src_unalign_step = 3, // Step 3 (more relaxed) as source array is used unaligned in the assembly code
.src_stride_step = 3,
.dest_stride_step = 3,
#else
.min_w = 1,
.min_h = 1,
.max_w = 16,
.max_h = 2,
.src_max_unalign_byte = 4, // Use 4-byte boundary check for Xtensa base
.dest_max_unalign_byte = 4,
.dest_unalign_step = 1,
.src_unalign_step = 1,
.src_stride_step = 1,
.dest_stride_step = 1,
#endif
.src_min_unalign_byte = 0,
.dest_min_unalign_byte = 0,
.test_combinations_count = 0,
};
// ------------------------------------------------ Static function headers --------------------------------------------
/**
* @brief Generate all the functionality test combinations
*
* - generate functionality test combinations, based on the provided test_matrix struct
*
* @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations
* @param[in] test_case Pointer to structure defining functionality test case
*/
static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix, func_test_case_lv_image_params_t *test_case);
/**
* @brief Fill test buffers for image functionality test
*
* @param[in] test_case Pointer to structure defining functionality test case
*/
static void fill_test_bufs(func_test_case_lv_image_params_t *test_case);
/**
* @brief The actual functionality test
*
* - function prepares structures for functionality testing and runs the LVGL API
*
* @param[in] test_case Pointer to structure defining functionality test case
*/
static void lv_image_functionality(func_test_case_lv_image_params_t *test_case);
/**
* @brief Evaluate results of LV Image functionality for 16bit data length
*
* @param[in] test_case Pointer to structure defining functionality test case
*/
static void test_eval_image_16bit_data(func_test_case_lv_image_params_t *test_case);
// ------------------------------------------------ Test cases ---------------------------------------------------------
/*
Functionality tests
Purpose:
- Test that an assembly version of LVGL blending API achieves the same results as the ANSI version
Procedure:
- Prepare testing matrix, to cover all the possible combinations of destination and source arrays widths,
lengths, strides and memory alignments
- Run assembly version of the LVGL blending API
- Run ANSI C version of the LVGL blending API
- Compare the results
- Repeat above 3 steps for each test matrix setup
*/
// ------------------------------------------------ Test cases stages --------------------------------------------------
TEST_CASE("LV Image functionality RGB565 blend to RGB565", "[image][functionality][RGB565]")
{
    // RGB565 source blended into an RGB565 destination, 2 bytes per pixel on both sides
    func_test_case_lv_image_params_t test_case = {
        .blend_api_func = &lv_draw_sw_blend_image_to_rgb565,
        .color_format = LV_COLOR_FORMAT_RGB565,
        .canary_pixels = CANARY_PIXELS_RGB565,
        .src_data_type_size = sizeof(uint16_t),
        .dest_data_type_size = sizeof(uint16_t),
        .operation_type = OPERATION_FILL,
    };

    // Start from the default RGB565-to-RGB565 matrix of test combinations
    test_matrix_lv_image_params_t test_matrix = default_test_matrix_image_rgb565_blend_rgb565;

    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB565 color format");
    functionality_test_matrix(&test_matrix, &test_case);
}
// ------------------------------------------------ Static test functions ----------------------------------------------
/* Generate and run every functionality-test combination described by the
 * test matrix: width x height x src/dest strides x src/dest unalignments. */
static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix, func_test_case_lv_image_params_t *test_case)
{
    for (int w = test_matrix->min_w; w <= test_matrix->max_w; w++) {
        for (int h = test_matrix->min_h; h <= test_matrix->max_h; h++) {
            // Strides run from exactly the width up to twice the width (matrix padding)
            for (int s_stride = w; s_stride <= w * 2; s_stride += test_matrix->src_stride_step) {
                for (int d_stride = w; d_stride <= w * 2; d_stride += test_matrix->dest_stride_step) {
                    // Step through all requested byte unalignments of both buffers
                    for (int s_unalign = test_matrix->src_min_unalign_byte; s_unalign <= test_matrix->src_max_unalign_byte; s_unalign += test_matrix->src_unalign_step) {
                        for (int d_unalign = test_matrix->dest_min_unalign_byte; d_unalign <= test_matrix->dest_max_unalign_byte; d_unalign += test_matrix->dest_unalign_step) {
                            // Fill the test-case struct and run one functionality test
                            UPDATE_TEST_CASE(test_case, w, h, s_stride, d_stride, s_unalign, d_unalign);
                            lv_image_functionality(test_case);
                            test_matrix->test_combinations_count++;
                        }
                    }
                }
            }
        }
    }
    ESP_LOGI(TAG_LV_IMAGE_FUNC, "test combinations: %d\n", test_matrix->test_combinations_count);
}
/**
 * @brief Run one image-blend functionality test case
 *
 * Prepares the buffers, runs the LVGL blend API once with the Assembly
 * implementation and once with the ANSI implementation, then evaluates
 * the two outputs against each other and frees the buffers.
 *
 * @param[in] test_case Pointer to structure defining functionality test case
 */
static void lv_image_functionality(func_test_case_lv_image_params_t *test_case)
{
    // Allocate and pre-fill the source and both destination buffers
    fill_test_bufs(test_case);

    // Init structure for LVGL blend API, to call the Assembly API
    _lv_draw_sw_blend_image_dsc_t dsc_asm = {
        .dest_buf = test_case->buf.p_dest_asm,
        .dest_w = test_case->dest_w,
        .dest_h = test_case->dest_h,
        .dest_stride = test_case->dest_stride * test_case->dest_data_type_size, // dest_stride * sizeof(data_type)
        .mask_buf = NULL,
        .mask_stride = 0,
        .src_buf = test_case->buf.p_src,
        .src_stride = test_case->src_stride * test_case->src_data_type_size, // src_stride * sizeof(data_type)
        .src_color_format = test_case->color_format,
        .opa = LV_OPA_MAX,
        .blend_mode = LV_BLEND_MODE_NORMAL,
        .use_asm = true,
    };

    // Init structure for LVGL blend API, to call the ANSI API
    _lv_draw_sw_blend_image_dsc_t dsc_ansi = dsc_asm;
    dsc_ansi.dest_buf = test_case->buf.p_dest_ansi;
    dsc_ansi.use_asm = false;

    test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code
    test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code

    // Shift array pointers by (Canary pixels amount * data type length) back
    // NOTE(review): the subtraction is in bytes, so buf.p_dest_* is presumably
    // a byte-sized pointer type — confirm against the declaration in lv_image_common.h
    test_case->buf.p_dest_asm -= test_case->canary_pixels * test_case->dest_data_type_size;
    test_case->buf.p_dest_ansi -= test_case->canary_pixels * test_case->dest_data_type_size;

    // Evaluate the results; test_msg_buf is attached to every assert message
    sprintf(test_msg_buf, "Test case: dest_w = %d, dest_h = %d, dest_stride = %d, src_stride = %d, dest_unalign_byte = %d, src_unalign_byte = %d\n",
            test_case->dest_w, test_case->dest_h, test_case->dest_stride, test_case->src_stride, test_case->dest_unalign_byte, test_case->src_unalign_byte);

#if DBG_PRINT_OUTPUT
    printf("%s\n", test_msg_buf);
#endif

    // Dispatch evaluation by destination color format
    switch (test_case->color_format) {
    case LV_COLOR_FORMAT_RGB565:
        test_eval_image_16bit_data(test_case);
        break;
    default:
        TEST_ASSERT_MESSAGE(false, "LV Color format not found");
        break;
    }

    // Free memory allocated for test buffers
    free(test_case->buf.p_dest_asm_alloc);
    free(test_case->buf.p_dest_ansi_alloc);
    free(test_case->buf.p_src_alloc);
}
/**
 * @brief Allocate and pre-fill the source and destination test buffers
 *
 * Allocates 16-byte-aligned buffers for the source and for both (ASM and
 * ANSI) destinations, applies the requested byte unalignment, zeroes the
 * whole buffers (including the Canary pixel areas), fills the active data
 * area with known patterns and stores the working pointers in test_case->buf.
 *
 * @param[in] test_case Pointer to structure defining functionality test case
 */
static void fill_test_bufs(func_test_case_lv_image_params_t *test_case)
{
    const size_t src_data_type_size = test_case->src_data_type_size;     // sizeof() of used data type in the source buffer
    const size_t dest_data_type_size = test_case->dest_data_type_size;   // sizeof() of used data type in the destination buffer
    const size_t src_buf_len = test_case->src_buf_len;                   // Total source buffer length, incl. matrix padding (no Canary pixels for the source)
    const size_t total_dest_buf_len = test_case->total_dest_buf_len;     // Total destination buffer length, incl. Canary pixels and matrix padding
    const size_t active_dest_buf_len = test_case->active_dest_buf_len;   // Length of the data part of the destination buffer incl. matrix padding
    const size_t canary_pixels = test_case->canary_pixels;               // Canary pixels, according to the data type
    const unsigned int src_unalign_byte = test_case->src_unalign_byte;   // Unalignment bytes for source buffer
    const unsigned int dest_unalign_byte = test_case->dest_unalign_byte; // Unalignment bytes for destination buffer

    // Allocate destination arrays and source array for Assembly and ANSI LVGL Blend API
    void *src_mem_common = memalign(16, (src_buf_len * src_data_type_size) + src_unalign_byte);
    void *dest_mem_asm = memalign(16, (total_dest_buf_len * dest_data_type_size) + dest_unalign_byte);
    void *dest_mem_ansi = memalign(16, (total_dest_buf_len * dest_data_type_size) + dest_unalign_byte);
    TEST_ASSERT_NOT_NULL_MESSAGE(src_mem_common, "Lack of memory");
    TEST_ASSERT_NOT_NULL_MESSAGE(dest_mem_asm, "Lack of memory");
    TEST_ASSERT_NOT_NULL_MESSAGE(dest_mem_ansi, "Lack of memory");

    // Save a pointer to the beginning of the allocated memory which will be used to free()
    test_case->buf.p_src_alloc = src_mem_common;
    test_case->buf.p_dest_asm_alloc = dest_mem_asm;
    test_case->buf.p_dest_ansi_alloc = dest_mem_ansi;

    // Apply destination and source array unalignment
    uint8_t *src_buf_common = (uint8_t *)src_mem_common + src_unalign_byte;
    uint8_t *dest_buf_asm = (uint8_t *)dest_mem_asm + dest_unalign_byte;
    uint8_t *dest_buf_ansi = (uint8_t *)dest_mem_ansi + dest_unalign_byte;

    // Set the whole buffer to 0, including the Canary pixels part
    memset(src_buf_common, 0, src_buf_len * src_data_type_size);
    // BUGFIX: destination buffers are sized with dest_data_type_size; clearing
    // them with src_data_type_size would under-clear (or overflow) whenever the
    // source and destination pixel sizes differ
    memset(dest_buf_asm, 0, total_dest_buf_len * dest_data_type_size);
    memset(dest_buf_ansi, 0, total_dest_buf_len * dest_data_type_size);

    switch (test_case->operation_type) {
    case OPERATION_FILL:
        // Fill the actual part of the destination buffers with known values,
        // Values must be same, because of the stride
        if (test_case->color_format == LV_COLOR_FORMAT_RGB565) {
            uint16_t *dest_buf_asm_uint16 = (uint16_t *)dest_buf_asm;
            uint16_t *dest_buf_ansi_uint16 = (uint16_t *)dest_buf_ansi;
            uint16_t *src_buf_uint16 = (uint16_t *)src_buf_common;

            // Fill destination buffers (alternating pattern keeps neighboring pixels distinct)
            for (int i = 0; i < active_dest_buf_len; i++) {
                dest_buf_asm_uint16[canary_pixels + i] = i + ((i & 1) ? 0x6699 : 0x9966);
                dest_buf_ansi_uint16[canary_pixels + i] = dest_buf_asm_uint16[canary_pixels + i];
            }

            // Fill source buffer
            for (int i = 0; i < src_buf_len; i++) {
                src_buf_uint16[i] = i + ((i & 1) ? 0x55AA : 0xAA55);
            }
        }
        break;
    default:
        TEST_ASSERT_MESSAGE(false, "LV Operation not found");
        break;
    }

    // Shift array pointers by (Canary pixels amount * data type length) forward
    dest_buf_asm += canary_pixels * dest_data_type_size;
    dest_buf_ansi += canary_pixels * dest_data_type_size;

    // Save a pointer to the working part of the memory, where the test data are stored
    test_case->buf.p_src = (void *)src_buf_common;
    test_case->buf.p_dest_asm = (void *)dest_buf_asm;
    test_case->buf.p_dest_ansi = (void *)dest_buf_ansi;

#if DBG_PRINT_OUTPUT
    printf("Destination buffers fill:\n");
    for (uint32_t i = 0; i < test_case->active_dest_buf_len; i++) {
        printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx16" \t asm = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_dest_ansi)[i], ((uint16_t *)test_case->buf.p_dest_asm)[i]);
    }
    printf("\n");
    printf("Source buffer fill:\n");
    for (uint32_t i = 0; i < test_case->src_buf_len; i++) {
        printf("src_buf[%"PRIi32"] %s = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_src)[i]);
    }
    printf("\n");
#endif
}
/* Evaluate LV Image functionality results for 16-bit (RGB565) data:
 * Canary pixel areas must remain zero, the ASM and ANSI destinations must
 * match, and every destination row (ignoring stride padding) must equal
 * the corresponding source row. */
static void test_eval_image_16bit_data(func_test_case_lv_image_params_t *test_case)
{
    uint16_t *dest_ansi = (uint16_t *)test_case->buf.p_dest_ansi;
    uint16_t *dest_asm = (uint16_t *)test_case->buf.p_dest_asm;
    uint16_t *src = (uint16_t *)test_case->buf.p_src;
    const size_t canary_pixels = test_case->canary_pixels;

    // Optionally dump both destination buffers and the source buffer
#if DBG_PRINT_OUTPUT
    printf("\nEval\nDestination buffers fill:\n");
    for (uint32_t i = 0; i < test_case->total_dest_buf_len; i++) {
        printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx16" \t asm = %8"PRIx16" %s \n", i, ((i < 10) ? (" ") : ("")), dest_ansi[i], dest_asm[i], (dest_ansi[i] == dest_asm[i]) ? ("OK") : ("FAIL"));
    }
    printf("\n");
    printf("Source buffer fill:\n");
    for (uint32_t i = 0; i < test_case->src_buf_len; i++) {
        printf("src_buf[%"PRIi32"] %s = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), src[i]);
    }
    printf("\n");
#endif

    // Leading Canary pixels area must stay 0
    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, dest_ansi, canary_pixels, test_msg_buf);
    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, dest_asm, canary_pixels, test_msg_buf);

    // dest_buf_asm and dest_buf_ansi must be equal over the active area
    TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE(dest_ansi + canary_pixels, dest_asm + canary_pixels, test_case->active_dest_buf_len, test_msg_buf);

    // Row by row, destination data (not considering matrix padding) must equal the source
    uint16_t *dest_row = dest_asm + canary_pixels;
    uint16_t *src_row = src;
    for (int row = 0; row < test_case->dest_h; row++) {
        TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE(dest_row, src_row, test_case->dest_w, test_msg_buf);
        dest_row += test_case->dest_stride; // Move to the next destination row
        src_row += test_case->src_stride;   // Move to the next source row
    }

    // Trailing Canary pixels area must stay 0
    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, dest_ansi + (test_case->total_dest_buf_len - canary_pixels), canary_pixels, test_msg_buf);
    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, dest_asm + (test_case->total_dest_buf_len - canary_pixels), canary_pixels, test_msg_buf);
}