// Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
// This source file is part of the Cangjie project, licensed under Apache-2.0
// with Runtime Library Exception.
//
// See https://cangjie-lang.cn/pages/LICENSE for license information.

// The Cangjie API is in Beta. For details on its capabilities and limitations, please refer to the README file.

// Byte offsets into the cjthread object returned by CJ_CJThreadGetHandle.
// NOTE(review): these and the offsets below are hard-coded copies of the runtime's
// structure layout - confirm against the C/C++ definitions whenever those change.
// Thread pointer field (read as cjthread->thread).
#define CJTHREAD_THREAD_OFFSET      0x10
// Start of the register context block inside the cjthread object.
#define CJTHREAD_CONTEXT_OFFSET     0x18

// Byte offsets of the saved-register slots inside a context block.
// Used both for the cjthread context and for the pointer returned by
// ExclusiveGetThreadContext.
#define CONTEXT_RSP                 0x00
#define CONTEXT_RBP                 0x08
#define CONTEXT_RBX                 0x10
#define CONTEXT_RIP                 0x18
#define CONTEXT_R12                 0x20
#define CONTEXT_R13                 0x28
#define CONTEXT_R14                 0x30
#define CONTEXT_R15                 0x38
// Written with stmxcsr / read with ldmxcsr (SSE control and status register).
#define CONTEXT_MXCSR               0x40
// Written with fnstcw / read with fldcw (x87 FPU control word).
#define CONTEXT_FPU_CW              0x44

// Byte offset of the processor field inside the thread object (read as thread->processor).
#define THREAD_PROCESSOR_OFFSET     0x38
// Size in bytes of the local-variable area of the CJ_MCC_ExclusiveScope stack frame
// (22 eight-byte slots, a multiple of 16).
#define ExclusiveScopeFrameSize     (8 * 22)

// ------------------------------------------------------------------------------
// void ExclusiveScopeDebugInfo(void)
//
// Debug helper: loads the current cjthread, its thread pointer, the thread
// context pointer and the saved rsp of that context into registers so they can
// be inspected from a debugger. Nothing is printed.
//
// ABI:   Microsoft x64, AT&T syntax
// In:    none
// Out:   none (rax is left holding the result of the last call made)
// Stack: 64 bytes = 32 bytes of register spills (-8..-32 from rbp) on top of
//        32 bytes of home space for callees.
//
// rbx, r12, r13 and r14 are nonvolatile in the Win64 ABI, so they are spilled
// to the frame before being used as inspection slots and reloaded on every exit.
// ------------------------------------------------------------------------------
    .text
    .global ExclusiveScopeDebugInfo
ExclusiveScopeDebugInfo:
    .seh_proc ExclusiveScopeDebugInfo
    pushq   %rbp
    .seh_pushreg %rbp
    movq    %rsp, %rbp
    .seh_setframe %rbp, 0
    subq    $64, %rsp
    .seh_stackalloc 64
    .seh_endprologue

    // Spill the nonvolatile registers that are overwritten below.
    movq    %rbx, -8(%rbp)
    movq    %r12, -16(%rbp)
    movq    %r13, -24(%rbp)
    movq    %r14, -32(%rbp)

    // rbx = current cjthread (bail out if there is none)
    callq   CJ_CJThreadGetHandle
    testq   %rax, %rax
    jz      .L_debug_done
    movq    %rax, %rbx

    // r12 = cjthread->thread (bail out if null)
    movq    CJTHREAD_THREAD_OFFSET(%rax), %r12
    testq   %r12, %r12
    jz      .L_debug_done

    // r13 = ExclusiveGetThreadContext(thread) (bail out if null)
    movq    %r12, %rcx
    callq   ExclusiveGetThreadContext
    testq   %rax, %rax
    jz      .L_debug_done
    movq    %rax, %r13
    // r14 = context->rsp
    movq    CONTEXT_RSP(%r13), %r14

    // Inspection point: put a breakpoint on the nop below. At that address
    //   rbx = cjthread
    //   r12 = thread pointer
    //   r13 = &thread->context
    //   r14 = thread->context.rsp
    // The failure paths jump past it, so the breakpoint only fires when all
    // four values are valid.
    nop

.L_debug_done:
    // Reload the caller's nonvolatile registers and tear down the frame.
    movq    -32(%rbp), %r14
    movq    -24(%rbp), %r13
    movq    -16(%rbp), %r12
    movq    -8(%rbp), %rbx
    movq    %rbp, %rsp
    popq    %rbp
    ret
    .seh_endproc

// ==============================================================================
// CJ_MCC_ExclusiveScope: Switch from cjthread to OS thread for exclusive execution
// ==============================================================================
//
//   void* CJ_MCC_ExclusiveScope(void* executeClosure, void* closurePtr)
//                                    rcx              rdx
//
// ABI: Microsoft x64, AT&T syntax.
//
// Returns rax / xmm0 / xmm1 exactly as left by ExclusiveExecutor. Returns rax = 0
// when there is no current cjthread, when the cjthread has no thread, or when
// MCC_NewExclusiveCJThread returns null.
//
// Frame layout (offsets from rbp, rbp is 16-byte aligned):
//     -8 .. -40    saved r15, r14, r13, r12, rbx
//     -48          executeClosure
//     -56          closurePtr
//     -64          r15 at entry (ThreadLocalData*)
//     -80          oldCJThread
//     -88          thread
//     -96          result of MRT_EnterSaferegion
//     -104         rsp before the stack switch
//     -112         rax returned by ExclusiveExecutor
//     -128         xmm0 returned by ExclusiveExecutor (16 bytes)
//     -144         xmm1 after ExclusiveExecutor (16 bytes)
//     -160         newCJThread
//     -168         oldProcessor
//     -208 .. -177 home (shadow) space for callees
//
// A Win64 callee may overwrite the 32 bytes at [rsp, rsp+32) of its caller at any
// call. The frame therefore reserves 32 bytes below the locals; without them the
// oldProcessor and newCJThread slots would lie inside that area and could be
// overwritten by any of the calls made while they are live.
#define ExclusiveScopeHomeSize      32
#define ExclusiveScopeAllocSize     (ExclusiveScopeFrameSize + ExclusiveScopeHomeSize)

    .text
    .p2align 4, 0x90
    .def CJ_MCC_ExclusiveScope
    .scl 2
    .type 32
    .endef
    .global CJ_MCC_ExclusiveScope
CJ_MCC_ExclusiveScope:
    .seh_proc CJ_MCC_ExclusiveScope

    pushq   %rbp
    .seh_pushreg %rbp
    movq    %rsp, %rbp
    .seh_setframe %rbp, 0
    subq    $ExclusiveScopeAllocSize, %rsp      // locals + callee home space, keeps rsp 16-aligned
    .seh_stackalloc ExclusiveScopeAllocSize
    .seh_endprologue

    // Save callee-saved registers
    movq    %r15, -8(%rbp)
    movq    %r14, -16(%rbp)
    movq    %r13, -24(%rbp)
    movq    %r12, -32(%rbp)
    movq    %rbx, -40(%rbp)

    // Save arguments
    movq    %rcx, -48(%rbp)     // executeClosure
    movq    %rdx, -56(%rbp)     // closurePtr
    movq    %r15, -64(%rbp)     // original ThreadLocalData*

    // oldCJThread = CJ_CJThreadGetHandle()
    callq   CJ_CJThreadGetHandle
    testq   %rax, %rax
    jz      .L_exc_no_cjthread
    movq    %rax, -80(%rbp)     // oldCJThread

    // thread = oldCJThread->thread
    movq    CJTHREAD_THREAD_OFFSET(%rax), %r12
    testq   %r12, %r12
    jz      .L_exc_no_thread
    movq    %r12, -88(%rbp)     // thread

    // oldProcessor = thread->processor, handed back to ExclusiveRestore later
    movq    THREAD_PROCESSOR_OFFSET(%r12), %r11
    movq    %r11, -168(%rbp)    // oldProcessor

    // MCC_NewExclusiveCJThread(executeClosure, closurePtr, futureTi)
    //                              rcx           rdx        r8
    movq    -48(%rbp), %rcx     // executeClosure
    movq    -56(%rbp), %rdx     // closurePtr
    xorl    %r8d, %r8d          // futureTi = 0
    callq   MCC_NewExclusiveCJThread
    testq   %rax, %rax
    jz      .L_exc_create_failed
    movq    %rax, -160(%rbp)    // newCJThread

    // MRT_SaveC2NContext(pc, frame address, ThreadLocalData)
    //                    rcx      rdx            r8
    // The call/pop pair below reads the current instruction address: the call
    // pushes the address of the pop, which is also the address exported as
    // unwindPCForExclusiveStub. The label must stay directly on the pop.
    movq    %rbp, %rdx                   // frame address
    call    .L_exc_get_pc_win
.L_exc_get_pc_win:
    .global unwindPCForExclusiveStub
unwindPCForExclusiveStub:
    pop     %rcx                         // PC (return address)
    movq    %r15, %r8                    // r15 (ThreadLocalData)
    callq   MRT_SaveC2NContext

    // Enter saferegion; the result decides later whether to leave it again
    xorl    %ecx, %ecx
    callq   MRT_EnterSaferegion
    movq    %rax, -96(%rbp)     // save return value

    // Save current stack pointer before switch
    movq    %rsp, -104(%rbp)

    // Save oldCJThread context; r14 = &oldCJThread->context
    movq    -80(%rbp), %rax     // oldCJThread
    leaq    CJTHREAD_CONTEXT_OFFSET(%rax), %r14

    // NOTE(review): rsp points at the bottom of this frame here, not at a return
    // address, so CONTEXT_RIP receives whatever that slot currently holds and
    // CONTEXT_RSP is frame bottom + 8. Confirm what consumers of this context expect.
    leaq    8(%rsp), %r11
    movq    %r11, CONTEXT_RSP(%r14)
    movq    %rbp, CONTEXT_RBP(%r14)
    movq    %rbx, CONTEXT_RBX(%r14)
    movq    (%rsp), %r11
    movq    %r11, CONTEXT_RIP(%r14)
    movq    %r12, CONTEXT_R12(%r14)     // r12 holds the thread pointer at this point
    movq    %r13, CONTEXT_R13(%r14)
    movq    -16(%rbp), %r11     // r14 is in use as the context pointer: take the entry value
    movq    %r11, CONTEXT_R14(%r14)
    movq    -8(%rbp), %r11      // r15 as saved at entry
    movq    %r11, CONTEXT_R15(%r14)
    stmxcsr CONTEXT_MXCSR(%r14)
    fnstcw  CONTEXT_FPU_CW(%r14)

    // Check if already on OS thread stack (nested exclusive scope).
    // If so, skip stack switch to avoid clobbering current frames.
    movq    -80(%rbp), %rcx     // oldCJThread
    callq   IsExclusiveCJThread
    testl   %eax, %eax
    jnz     .L_exc_no_os_stack

    // Get thread context via ExclusiveGetThreadContext (avoid hardcoded offset)
    movq    -88(%rbp), %rcx     // thread
    callq   ExclusiveGetThreadContext
    testq   %rax, %rax
    jz      .L_exc_no_os_stack  // null context -> skip
    movq    %rax, %r15          // r15 = &thread->context (entry r15 is reloaded at cleanup)

    // Check if thread->context.rsp is valid
    movq    CONTEXT_RSP(%r15), %rax
    testq   %rax, %rax
    jz      .L_exc_no_os_stack  // rsp is 0, stay on the current stack

    // Switch to OS thread stack. rbp still addresses this frame on the cjthread
    // stack, so every local stays reachable through rbp.
    // NOTE(review): the MXCSR / x87 control word loaded here are not reloaded
    // with the cjthread values after switching back.
    movq    %rax, %rsp
    ldmxcsr CONTEXT_MXCSR(%r15)
    fldcw   CONTEXT_FPU_CW(%r15)

    // Step below the saved rsp, align to 16 and open home space for the callee
    subq    $128, %rsp
    andq    $-16, %rsp
    subq    $32, %rsp           // shadow space

    // Execute closure: ExclusiveExecutor(thread, newCJThread)
    // Load parameters from stack frame (not registers, as they may be clobbered)
    movq    -88(%rbp), %rcx     // thread
    movq    -160(%rbp), %rdx    // newCJThread
    callq   ExclusiveExecutor

    // Keep the return values: rax in r13 across the next call, xmm0/xmm1 in the frame
    movq    %rax, %r13
    movapd  %xmm0, -128(%rbp)
    movapd  %xmm1, -144(%rbp)

    // Switch back to cjthread stack
    movq    -104(%rbp), %rsp

    // ExclusiveRestore(oldCJThread, thread, newCJThread, oldProcessor)
    // Load all parameters from stack frame
    movq    -80(%rbp), %rcx     // oldCJThread
    movq    -88(%rbp), %rdx     // thread
    movq    -160(%rbp), %r8     // newCJThread
    movq    -168(%rbp), %r9     // oldProcessor
    callq   ExclusiveRestore

    // Move the integer return value into its frame slot
    movq    %r13, -112(%rbp)
    // xmm0/xmm1 already saved to -128(%rbp)/-144(%rbp)
    jmp     .L_exc_continue

.L_exc_no_os_stack:
    // No stack switch: run on the current stack. Reached for a nested exclusive
    // scope, a null thread context, or a zero context rsp. The home space for the
    // callees is the one reserved at the bottom of this frame.
    movq    -88(%rbp), %rcx     // thread
    movq    -160(%rbp), %rdx    // newCJThread
    callq   ExclusiveExecutor

    movq    %rax, -112(%rbp)
    movapd  %xmm0, -128(%rbp)
    movapd  %xmm1, -144(%rbp)

    // ExclusiveRestore(oldCJThread, thread, newCJThread, oldProcessor)
    movq    -80(%rbp), %rcx     // oldCJThread
    movq    -88(%rbp), %rdx     // thread
    movq    -160(%rbp), %r8     // newCJThread
    movq    -168(%rbp), %r9     // oldProcessor
    callq   ExclusiveRestore

.L_exc_continue:

    // Leave saferegion only if MRT_EnterSaferegion returned non-zero; in that
    // case the ThreadLocalData pointer is fetched again, otherwise the entry
    // value of r15 is used.
    movq    -96(%rbp), %rax
    testq   %rax, %rax
    jz      .L_exc_skip_leave
    callq   MRT_LeaveSaferegion
    callq   MRT_GetThreadLocalData
    movq    %rax, %r13
    jmp     .L_exc_do_delete
.L_exc_skip_leave:
    movq    -64(%rbp), %r13
.L_exc_do_delete:
    // Delete C2N context: MRT_DeleteC2NContext(ThreadLocalData)
    movq    %r13, %rcx
    callq   MRT_DeleteC2NContext

    // Restore return values
    movq    -112(%rbp), %rax
    movapd  -128(%rbp), %xmm0
    movapd  -144(%rbp), %xmm1

    jmp     .L_exc_cleanup

// Failure paths: return null
.L_exc_create_failed:
.L_exc_no_cjthread:
.L_exc_no_thread:
    xorl    %eax, %eax

.L_exc_cleanup:
    // Restore callee-saved registers
    movq    -8(%rbp), %r15
    movq    -16(%rbp), %r14
    movq    -24(%rbp), %r13
    movq    -32(%rbp), %r12
    movq    -40(%rbp), %rbx

    // Restore stack and return
    movq    %rbp, %rsp
    popq    %rbp
    ret
    .seh_endproc

// ==============================================================================
// ExecuteExclusiveCangjieStub
// ==============================================================================
//
// ABI: Microsoft x64, AT&T syntax.
// In:  rcx      = sret
//      rdx      = arg1 (not used by this stub)
//      r8       = closureObj
//      r9       = executeClosure
//      48(%rbp) = threadData (fifth argument, passed on the stack)
//
// Sets r15 to threadData, then calls the function pointer stored at offset 8 of
// executeClosure as func(sret, closureObj). The call is bracketed by
// MRT_SaveTopManagedContextToN2CStub(rbp) and
// MRT_RestoreTopManagedContextFromN2CStub(rbp).
//
// Frame (offsets from rbp): -8..-40 saved r15, r14, r13, r12, rbx; -48 sret;
// -56 closureObj; -64 executeClosure; -80 threadData; the lowest 32 bytes are
// home space for callees.
    .global ExecuteExclusiveCangjieStub
ExecuteExclusiveCangjieStub:
    .seh_proc ExecuteExclusiveCangjieStub

    pushq   %rbp
    .seh_pushreg %rbp
    movq    %rsp, %rbp
    .seh_setframe %rbp, 0
    subq    $128, %rsp          // locals down to -80, 16 bytes padding, 32 bytes home space
    .seh_stackalloc 128
    .seh_endprologue

    // Stash the register arguments that are needed after the first call
    movq    %r9, -64(%rbp)      // executeClosure
    movq    %r8, -56(%rbp)      // closureObj
    movq    %rcx, -48(%rbp)     // sret

    // Spill nonvolatile registers (r15 is overwritten just below)
    movq    %rbx, -40(%rbp)
    movq    %r12, -32(%rbp)
    movq    %r13, -24(%rbp)
    movq    %r14, -16(%rbp)
    movq    %r15, -8(%rbp)

    // Fetch the stack argument: return address, saved rbp and the 32-byte home
    // area sit between rbp and it, hence offset 48. It becomes the new r15.
    movq    48(%rbp), %rax      // threadData
    movq    %rax, %r15
    movq    %rax, -80(%rbp)

    // MRT_SaveTopManagedContextToN2CStub(frame pointer of this stub)
    movq    %rbp, %rcx
    callq   MRT_SaveTopManagedContextToN2CStub

    // func(sret, closureObj), where func is the quadword at executeClosure + 8
    movq    -48(%rbp), %rcx     // sret
    movq    -56(%rbp), %rdx     // closureObj
    movq    -64(%rbp), %r11     // executeClosure
    movq    8(%r11), %r11       // func
    callq   *%r11

    // This label is the return address of the indirect call above and must stay
    // immediately after it.
    .global unwindPCForExclusiveStubFull
unwindPCForExclusiveStubFull:
    // MRT_RestoreTopManagedContextFromN2CStub(frame pointer of this stub)
    movq    %rbp, %rcx
    callq   MRT_RestoreTopManagedContextFromN2CStub

    // Reload the nonvolatile registers of the caller
    movq    -40(%rbp), %rbx
    movq    -32(%rbp), %r12
    movq    -24(%rbp), %r13
    movq    -16(%rbp), %r14
    movq    -8(%rbp), %r15

    // Tear down the frame
    movq    %rbp, %rsp
    popq    %rbp
    ret
    .seh_endproc