// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#include "jitpch.h"
#include <minipal/utf8.h>
#ifdef _MSC_VER
#pragma hdrstop
#endif

//------------------------------------------------------------------------------
// SpillExpression : Store an expression into a newly grabbed temp and append
//    that store as the last statement of the given block.
//
// Arguments:
//    comp      - Compiler instance
//    expr      - Expression to spill
//    exprBlock - Block that receives the store statement (appended at its end)
//    debugInfo - Debug info attached to the new statement
//
// Return Value:
//    A new local var node referring to the temp that holds the spilled value
//
static GenTree* SpillExpression(Compiler* comp, GenTree* expr, BasicBlock* exprBlock, DebugInfo& debugInfo)
{
    const unsigned   spillLclNum = comp->lvaGrabTemp(true DEBUGARG("spilling expr"));
    GenTree* const   spillStore  = comp->gtNewTempStore(spillLclNum, expr);
    Statement* const spillStmt   = comp->fgNewStmtAtEnd(exprBlock, spillStore, debugInfo);

    // Finish setting up the freshly created statement (statement info + node sequence).
    comp->gtSetStmtInfo(spillStmt);
    comp->fgSetStmtSeq(spillStmt);

    return comp->gtNewLclVarNode(spillLclNum);
}

//------------------------------------------------------------------------------
// InheritFlags : Make a newly created block agree with flagSrc with respect to
//    the BBF_INTERNAL / BBF_IMPORTED flags.
//
// Arguments:
//    dst     - Block whose flags are updated
//    flagSrc - Block the decision is based on
//
static void InheritFlags(BasicBlock* dst, const BasicBlock* flagSrc)
{
    // Currently, all of our "new block" helpers set BBF_INTERNAL flag by default,
    // so there is nothing to adjust when the source block is internal as well.
    if (flagSrc->HasFlag(BBF_INTERNAL))
    {
        return;
    }

    dst->RemoveFlags(BBF_INTERNAL);
    dst->SetFlags(BBF_IMPORTED);
}

//------------------------------------------------------------------------------
// SplitAtTreeAndReplaceItWithLocal : Split block at the given tree and replace it with a local
//    See comments in gtSplitTree and fgSplitBlockBeforeTree
//    TODO: use this function in more places in this file.
//
// Arguments:
//    comp        - Compiler instance
//    block       - Block to split
//    stmt        - Statement containing the tree to split at
//    tree        - Tree to split at
//    topBlock    - [out] Top block after the split (the block that was passed in)
//    bottomBlock - [out] Bottom block after the split (the block returned by the split)
//
// Return Value:
//    Number of the local that replaces the tree
//
// Notes:
//    Only the use of the tree is replaced by a read of the new local; this function
//    does not create a store to that local.
//
static unsigned SplitAtTreeAndReplaceItWithLocal(
    Compiler* comp, BasicBlock* block, Statement* stmt, GenTree* tree, BasicBlock** topBlock, BasicBlock** bottomBlock)
{
    BasicBlock* prevBb    = block;
    GenTree**   treeUse   = nullptr;
    Statement*  splitStmt = nullptr;

    block = comp->fgSplitBlockBeforeTree(block, stmt, tree, &splitStmt, &treeUse);
    assert(prevBb != nullptr && block != nullptr);

    // We are after morph, so block ops introduced by the split have to be morphed here.
    // 'stmt' itself is deliberately left for later: it is still going to be modified
    // below and morphing it now could invalidate treeUse.
    for (Statement* cur = splitStmt; (cur != nullptr) && (cur != stmt); cur = cur->GetNextStmt())
    {
        comp->fgMorphStmtBlockOps(block, cur);
    }

    // Grab a temp of the tree's type to stand in for the tree.
    const unsigned  replacementLclNum         = comp->lvaGrabTemp(true DEBUGARG("replacement local"));
    const var_types treeType                  = tree->TypeGet();
    comp->lvaTable[replacementLclNum].lvType = treeType;

    // Replace the use of the original tree with a read of that temp
    *treeUse = comp->gtNewLclvNode(replacementLclNum, treeType);

    // Now that stmt has its final shape, it is safe to morph it and refresh its side effects.
    comp->fgMorphStmtBlockOps(block, stmt);
    comp->gtUpdateStmtSideEffects(stmt);

    *topBlock    = prevBb;
    *bottomBlock = block;
    return replacementLclNum;
}

//------------------------------------------------------------------------------
// gtNewRuntimeLookupHelperCallNode : Helper to create a runtime lookup call helper node.
//
// Arguments:
//    pRuntimeLookup    - Runtime lookup descriptor; provides the helper and the signature
//    ctxTree           - Tree passed as the first helper argument (the generic context)
//    compileTimeHandle - Compile-time handle attached to the signature constant node
//
// Return Value:
//    New CT_HELPER node (typed TYP_I_IMPL)
//
// Notes:
//    As a side effect, the inline root is marked as having expandable runtime lookups
//    and *pRuntimeLookup is recorded in the root's SignatureToLookupInfoMap, keyed by
//    the signature, unless an entry for that signature already exists.
//
GenTreeCall* Compiler::gtNewRuntimeLookupHelperCallNode(CORINFO_RUNTIME_LOOKUP* pRuntimeLookup,
                                                        GenTree*                ctxTree,
                                                        void*                   compileTimeHandle)
{
    void* const signature = pRuntimeLookup->signature;

    // The helper takes the context and a pointer to the signature returned by the lookup.
    GenTree* const     sigNode    = gtNewIconEmbHndNode(signature, nullptr, GTF_ICON_GLOBAL_PTR, compileTimeHandle);
    GenTreeCall* const lookupCall = gtNewHelperCallNode(pRuntimeLookup->helper, TYP_I_IMPL, ctxTree, sigNode);

    // No need to perform CSE/hoisting for signature node - it is expected to end up in a rarely-taken block after
    // "Expand runtime lookups" phase.
    sigNode->gtFlags |= GTF_DONT_CSE;

    // Leave a note that this method has runtime lookups we might want to expand (nullchecks, size checks) later.
    // We can also consider marking current block as a runtime lookup holder to improve TP for Tier0
    Compiler* const root = impInlineRoot();
    root->setMethodHasExpRuntimeLookup();

    // Remember the lookup info so that it can be found again from the signature constant alone.
    if (!root->GetSignatureToLookupInfoMap()->Lookup(signature))
    {
        JITDUMP("Registering %p in SignatureToLookupInfoMap\n", signature)
        root->GetSignatureToLookupInfoMap()->Set(signature, *pRuntimeLookup);
    }

    return lookupCall;
}

//------------------------------------------------------------------------------
// fgExpandRuntimeLookups : partially expand runtime lookups helper calls
//                          to add a nullcheck [+ size check] and a fast path
// Returns:
//    PhaseStatus indicating what, if anything, was changed.
//
// Notes:
//    The runtime lookup itself is needed to access a handle in code shared between
//    generic instantiations. The lookup depends on the typeContext which is only available at
//    runtime, and not at compile time. The per-call work is done by
//    fgExpandRuntimeLookupsForCall; see the ASCII block diagrams there for a better
//    understanding of how this phase expands runtime lookups.
//
PhaseStatus Compiler::fgExpandRuntimeLookups()
{
    if (doesMethodHaveExpRuntimeLookup())
    {
        return fgExpandHelper<&Compiler::fgExpandRuntimeLookupsForCall>(false);
    }

    // The method being compiled doesn't have expandable runtime lookups. If it does
    // and doesMethodHaveExpRuntimeLookup() still returns false we'll assert in LowerCall
    return PhaseStatus::MODIFIED_NOTHING;
}

//------------------------------------------------------------------------------
// fgExpandRuntimeLookupsForCall : partially expand runtime lookups helper calls
//    to add a nullcheck [+ size check] and a fast path
//
// Arguments:
//    pBlock - Block containing the helper call to expand. If expansion is performed,
//             this is updated to the new block that was an outcome of block splitting.
//    stmt   - Statement containing the helper call
//    call   - The helper call
//
// Returns:
//    true if a runtime lookup was found and expanded. false if the call is not a
//    runtime lookup helper call, is a tail call, or its signature argument is not a
//    constant; no block is split in those cases.
//
// Notes:
//    The runtime lookup itself is needed to access a handle in code shared between
//    generic instantiations. The lookup depends on the typeContext which is only available at
//    runtime, and not at compile time. See ASCII block diagrams in comments below for
//    better understanding how this phase expands runtime lookups.
//
bool Compiler::fgExpandRuntimeLookupsForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    BasicBlock* block = *pBlock;

    if (!call->IsRuntimeLookupHelperCall(this))
    {
        return false;
    }

    // Debug-only bookkeeping: mark the call as having been visited by this expansion.
    // Note that this happens before the bail-outs below, so calls we decide not to
    // expand are marked as well.
    INDEBUG(call->gtCallDebugFlags |= GTF_CALL_MD_RUNTIME_LOOKUP_EXPANDED);

    if (call->IsTailCall())
    {
        // It is very unlikely to happen and is impossible to represent in C#
        return false;
    }

    assert(call->gtArgs.CountArgs() == 2);
    // The call has the following signature:
    //
    //   type = call(genericCtx, signatureCns);
    //
    const GenTree* signatureNode = call->gtArgs.GetArgByIndex(1)->GetNode();
    if (!signatureNode->IsCnsIntOrI())
    {
        // We expect the signature to be a constant node here (it's marked as DONT_CSE)
        // It's still correct if JIT decides to violate this assumption, but we really
        // don't want it to happen, hence, the assert.
        assert(!"can't restore signature argument value");
        return false;
    }
    void* signature = reinterpret_cast<void*>(signatureNode->AsIntCon()->IconValue());

    JITDUMP("Expanding runtime lookup for [%06d] in " FMT_BB ":\n", dspTreeID(call), block->bbNum)
    DISPTREE(call)
    JITDUMP("\n")

    // Restore runtimeLookup using signature argument via a global dictionary
    // (the entry is registered by gtNewRuntimeLookupHelperCallNode when the call is created)
    CORINFO_RUNTIME_LOOKUP runtimeLookup = {};
    const bool             lookupFound   = GetSignatureToLookupInfoMap()->Lookup(signature, &runtimeLookup);
    assert(lookupFound);

    const bool needsSizeCheck = runtimeLookup.sizeOffset != CORINFO_NO_SIZE_CHECK;
    if (needsSizeCheck)
    {
        JITDUMP("dynamic expansion, needs size check.\n")
    }

    // Captured up front: stmt may be removed below, and all new statements/blocks reuse this debug info.
    DebugInfo debugInfo = stmt->GetDebugInfo();

    // The expansion below relies on at least one indirection and on a null test being requested.
    assert(runtimeLookup.indirections != 0);
    assert(runtimeLookup.testForNull);

    // Split block right before the call tree
    BasicBlock* prevBb       = block;
    GenTree**   callUse      = nullptr;
    Statement*  newFirstStmt = nullptr;
    block                    = fgSplitBlockBeforeTree(block, stmt, call, &newFirstStmt, &callUse);
    *pBlock                  = block;
    assert(prevBb != nullptr && block != nullptr);

    // Block ops inserted by the split need to be morphed here since we are after morph.
    // We cannot morph stmt yet as we may modify it further below, and the morphing
    // could invalidate callUse.
    while ((newFirstStmt != nullptr) && (newFirstStmt != stmt))
    {
        fgMorphStmtBlockOps(block, newFirstStmt);
        newFirstStmt = newFirstStmt->GetNextStmt();
    }

    // Local that receives the lookup result on both the fast path and the fallback path.
    GenTreeLclVar* rtLookupLcl = nullptr;

    // Mostly for Tier0: if the current statement is STORE_LCL_VAR(RuntimeLookup)
    // we can drop it and use that LCL as the destination
    if (stmt->GetRootNode()->OperIs(GT_STORE_LCL_VAR) && (stmt->GetRootNode()->AsLclVar()->Data() == *callUse))
    {
        rtLookupLcl = gtNewLclVarNode(stmt->GetRootNode()->AsLclVar()->GetLclNum());
        fgRemoveStmt(block, stmt);
    }

    // Grab a temp to store result (it's assigned from either fastPathBb or fallbackBb)
    if (rtLookupLcl == nullptr)
    {
        // Define a local for the result
        // NOTE(review): the local is typed TYP_I_IMPL while the node uses call->TypeGet();
        // presumably these always agree for runtime lookup helper calls - confirm.
        unsigned rtLookupLclNum         = lvaGrabTemp(true DEBUGARG("runtime lookup"));
        lvaTable[rtLookupLclNum].lvType = TYP_I_IMPL;
        rtLookupLcl                     = gtNewLclvNode(rtLookupLclNum, call->TypeGet());

        // The original use of the call now reads the result local instead.
        *callUse = gtClone(rtLookupLcl);

        // stmt has its final shape now, so it can be morphed and have its side effects refreshed.
        fgMorphStmtBlockOps(block, stmt);
        gtUpdateStmtSideEffects(stmt);
    }

    // First helper argument: the generic context the lookup starts from.
    GenTree* ctxTree = call->gtArgs.GetArgByIndex(0)->GetNode();

    // Prepare slotPtr tree (TODO: consider sharing this part with impRuntimeLookup)
    //   indOffTree    - spilled pointer used as the base when an offset is itself loaded from memory
    //   lastIndOfTree - spilled pointer from before the last offset is added; the size check reads relative to it
    GenTree* slotPtrTree   = gtCloneExpr(ctxTree);
    GenTree* indOffTree    = nullptr;
    GenTree* lastIndOfTree = nullptr;
    for (WORD i = 0; i < runtimeLookup.indirections; i++)
    {
        // For an indirect first/second offset, save the current pointer to a temp (in prevBb)
        // so it can be added to the value loaded through it further below.
        if ((i == 1 && runtimeLookup.indirectFirstOffset) || (i == 2 && runtimeLookup.indirectSecondOffset))
        {
            indOffTree  = SpillExpression(this, slotPtrTree, prevBb, debugInfo);
            slotPtrTree = gtCloneExpr(indOffTree);
        }

        // The last indirection could be subject to a size check (dynamic dictionary expansion)
        const bool isLastIndirectionWithSizeCheck = (i == runtimeLookup.indirections - 1) && needsSizeCheck;
        // No load at level 0: slotPtrTree is still the context value itself.
        if (i != 0)
        {
            GenTreeFlags indirFlags = GTF_IND_NONFAULTING;
            // The load is only marked invariant when it is not the size-checked last indirection.
            if (!isLastIndirectionWithSizeCheck)
            {
                indirFlags |= GTF_IND_INVARIANT;
            }
            slotPtrTree = gtNewIndir(TYP_I_IMPL, slotPtrTree, indirFlags);
        }

        // Indirect offset: the value just loaded is an offset relative to the spilled pointer.
        if ((i == 1 && runtimeLookup.indirectFirstOffset) || (i == 2 && runtimeLookup.indirectSecondOffset))
        {
            slotPtrTree = gtNewOperNode(GT_ADD, TYP_I_IMPL, indOffTree, slotPtrTree);
        }
        if (runtimeLookup.offsets[i] != 0)
        {
            if (isLastIndirectionWithSizeCheck)
            {
                // Keep the pointer from before the final offset is applied; the size check
                // needs it (see the use of lastIndOfTree below).
                lastIndOfTree = SpillExpression(this, slotPtrTree, prevBb, debugInfo);
                slotPtrTree   = gtCloneExpr(lastIndOfTree);
            }
            slotPtrTree =
                gtNewOperNode(GT_ADD, TYP_I_IMPL, slotPtrTree, gtNewIconNode(runtimeLookup.offsets[i], TYP_I_IMPL));
        }
    }

    // Non-dynamic expansion case (no size check):
    //
    // prevBb(BBJ_ALWAYS):                  [weight: 1.0]
    //     ...
    //
    // nullcheckBb(BBJ_COND):               [weight: 1.0]
    //     if (*fastPathValue == null)
    //         goto fallbackBb;
    //
    // fastPathBb(BBJ_ALWAYS):              [weight: 0.8]
    //     rtLookupLcl = *fastPathValue;
    //     goto block;
    //
    // fallbackBb(BBJ_ALWAYS):              [weight: 0.2]
    //     rtLookupLcl = HelperCall();
    //
    // block(...):                          [weight: 1.0]
    //     use(rtLookupLcl);
    //

    // null-check basic block
    GenTree* fastPathValue = gtNewIndir(TYP_I_IMPL, gtCloneExpr(slotPtrTree), GTF_IND_NONFAULTING);
    // Save dictionary slot to a local (to be used by fast path)
    GenTree* fastPathValueClone = fgMakeMultiUse(&fastPathValue);
    GenTree* nullcheckOp        = gtNewOperNode(GT_EQ, TYP_INT, fastPathValue, gtNewIconNode(0, TYP_I_IMPL));
    nullcheckOp->gtFlags |= GTF_RELOP_JMP_USED;

    // nullcheckBb conditionally jumps to fallbackBb, but we need to initialize fallbackBb last
    // so we can place it after nullcheckBb. So set the jump target later.
    BasicBlock* nullcheckBb =
        fgNewBBFromTreeAfter(BBJ_COND, prevBb, gtNewOperNode(GT_JTRUE, TYP_VOID, nullcheckOp), debugInfo);

    // Fallback basic block
    // The original helper call node itself is reused as the value stored on the slow path.
    GenTree*    fallbackValueDef = gtNewStoreLclVarNode(rtLookupLcl->GetLclNum(), call);
    BasicBlock* fallbackBb       = fgNewBBFromTreeAfter(BBJ_ALWAYS, nullcheckBb, fallbackValueDef, debugInfo, true);

    // Fast-path basic block
    // Also created "after nullcheckBb", i.e. presumably it ends up between nullcheckBb and
    // fallbackBb, which is the layout shown in the diagrams.
    GenTree*    fastpathValueDef = gtNewStoreLclVarNode(rtLookupLcl->GetLclNum(), fastPathValueClone);
    BasicBlock* fastPathBb       = fgNewBBFromTreeAfter(BBJ_ALWAYS, nullcheckBb, fastpathValueDef, debugInfo);

    BasicBlock* sizeCheckBb = nullptr;
    if (needsSizeCheck)
    {
        // Dynamic expansion case (sizeCheckBb is added and some preds are changed):
        //
        // prevBb(BBJ_ALWAYS):                  [weight: 1.0]
        //
        // sizeCheckBb(BBJ_COND):               [weight: 1.0]
        //     if (sizeValue <= offsetValue)
        //         goto fallbackBb;
        //     ...
        //
        // nullcheckBb(BBJ_COND):               [weight: 0.8]
        //     if (*fastPathValue == null)
        //         goto fallbackBb;
        //
        // fastPathBb(BBJ_ALWAYS):              [weight: 0.64]
        //     rtLookupLcl = *fastPathValue;
        //     goto block;
        //
        // fallbackBb(BBJ_ALWAYS):              [weight: 0.36]
        //     rtLookupLcl = HelperCall();
        //
        // block(...):                          [weight: 1.0]
        //     use(rtLookupLcl);
        //

        // sizeValue = dictionary[pRuntimeLookup->sizeOffset]
        GenTreeIntCon* sizeOffset = gtNewIconNode(runtimeLookup.sizeOffset, TYP_I_IMPL);
        // lastIndOfTree is only set in the loop above when the last offset is non-zero.
        assert(lastIndOfTree != nullptr);
        GenTree* sizeValueOffset = gtNewOperNode(GT_ADD, TYP_I_IMPL, lastIndOfTree, sizeOffset);
        GenTree* sizeValue       = gtNewIndir(TYP_I_IMPL, sizeValueOffset, GTF_IND_NONFAULTING);

        // sizeCheck fails if sizeValue <= pRuntimeLookup->offsets[i]
        GenTree* offsetValue = gtNewIconNode(runtimeLookup.offsets[runtimeLookup.indirections - 1], TYP_I_IMPL);
        GenTree* sizeCheck   = gtNewOperNode(GT_LE, TYP_INT, sizeValue, offsetValue);
        sizeCheck->gtFlags |= GTF_RELOP_JMP_USED;

        GenTree* jtrue = gtNewOperNode(GT_JTRUE, TYP_VOID, sizeCheck);
        // sizeCheckBb fails - jump to fallbackBb
        // (created after prevBb, so it precedes nullcheckBb as in the diagram above)
        sizeCheckBb = fgNewBBFromTreeAfter(BBJ_COND, prevBb, jtrue, debugInfo);
    }

    //
    // Update preds in all new blocks
    //
    assert(prevBb->KindIs(BBJ_ALWAYS));

    // Both result-producing blocks continue into the bottom half of the split block.
    {
        FlowEdge* const newEdge = fgAddRefPred(block, fastPathBb);
        fastPathBb->SetTargetEdge(newEdge);
    }

    {
        FlowEdge* const newEdge = fgAddRefPred(block, fallbackBb);
        fallbackBb->SetTargetEdge(newEdge);
        // fallbackBb is expected to be laid out immediately before 'block'.
        assert(fallbackBb->JumpsToNext());
    }

    if (needsSizeCheck)
    {
        // sizeCheckBb is the first block after prevBb
        fgRedirectEdge(prevBb->TargetEdgeRef(), sizeCheckBb);

        // sizeCheckBb flows into nullcheckBb in case if the size check passes
        // (true edge = size check failed -> fallbackBb, false edge = passed -> nullcheckBb)
        {
            FlowEdge* const trueEdge  = fgAddRefPred(fallbackBb, sizeCheckBb);
            FlowEdge* const falseEdge = fgAddRefPred(nullcheckBb, sizeCheckBb);
            sizeCheckBb->SetTrueEdge(trueEdge);
            sizeCheckBb->SetFalseEdge(falseEdge);
            trueEdge->setLikelihood(0.2);
            falseEdge->setLikelihood(0.8);
        }

        // fallbackBb is reachable from both nullcheckBb and sizeCheckBb
        // fastPathBb is only reachable from successful nullcheckBb
    }
    else
    {
        // nullcheckBb is the first block after prevBb
        fgRedirectEdge(prevBb->TargetEdgeRef(), nullcheckBb);

        // No size check, nullcheckBb jumps to fast path
        // fallbackBb is only reachable from nullcheckBb (jump destination)
    }

    // nullcheckBb: true edge = slot is null -> fallbackBb, false edge = slot is populated -> fastPathBb
    FlowEdge* const trueEdge  = fgAddRefPred(fallbackBb, nullcheckBb);
    FlowEdge* const falseEdge = fgAddRefPred(fastPathBb, nullcheckBb);
    nullcheckBb->SetTrueEdge(trueEdge);
    nullcheckBb->SetFalseEdge(falseEdge);
    trueEdge->setLikelihood(0.2);
    falseEdge->setLikelihood(0.8);

    //
    // Re-distribute weights (see '[weight: X]' on the diagrams above)
    // TODO: consider marking fallbackBb as rarely-taken
    // TODO: derive block weights from edge likelihoods.
    //
    block->inheritWeight(prevBb);
    if (needsSizeCheck)
    {
        sizeCheckBb->inheritWeight(prevBb);
        // 80% chance we pass sizecheck and reach nullcheckBb
        nullcheckBb->inheritWeightPercentage(sizeCheckBb, 80);
        // 64% (0.8 * 0.8) chance we pass both nullcheck and sizecheck
        fastPathBb->inheritWeightPercentage(nullcheckBb, 80);
        // 100-64=36% chance we fail either nullcheck or sizecheck
        fallbackBb->inheritWeightPercentage(sizeCheckBb, 36);
    }
    else
    {
        nullcheckBb->inheritWeight(prevBb);
        // 80% chance we pass nullcheck
        fastPathBb->inheritWeightPercentage(nullcheckBb, 80);
        // 20% chance we fail nullcheck (TODO: Consider making it cold (0%))
        fallbackBb->inheritWeightPercentage(nullcheckBb, 20);
    }

    //
    // Update flags in all new blocks
    //

    InheritFlags(nullcheckBb, prevBb);
    InheritFlags(fastPathBb, prevBb);
    InheritFlags(fallbackBb, prevBb);
    InheritFlags(block, prevBb);
    if (needsSizeCheck)
    {
        InheritFlags(sizeCheckBb, prevBb);
    }

    // All blocks are expected to be in the same EH region
    // NOTE(review): fallbackBb is not checked here - presumably an omission; confirm.
    assert(BasicBlock::sameEHRegion(prevBb, block));
    assert(BasicBlock::sameEHRegion(prevBb, nullcheckBb));
    assert(BasicBlock::sameEHRegion(prevBb, fastPathBb));
    if (needsSizeCheck)
    {
        assert(BasicBlock::sameEHRegion(prevBb, sizeCheckBb));
    }
    return true;
}

//------------------------------------------------------------------------------
// fgExpandThreadLocalAccess: Inline the CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED,
//      CORINFO_HELP_GETDYNAMIC_GCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED, or
//      CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED2 helper. See
//      fgExpandThreadLocalAccessForCall for details.
//
// Returns:
//    PhaseStatus indicating what, if anything, was changed.
//
// Notes:
//    For the NativeAOT ABI the expansion is delegated to
//    fgExpandThreadLocalAccessForCallNativeAOT and is attempted even when
//    optimizations are disabled or the method is compiled for size.
//
PhaseStatus Compiler::fgExpandThreadLocalAccess()
{
    if (!methodHasTlsFieldAccess())
    {
        // TP: nothing to expand in the current method
        JITDUMP("Nothing to expand.\n")
        return PhaseStatus::MODIFIED_NOTHING;
    }

    // Always expand for NativeAOT because the slow TLS access helper will not be generated by NativeAOT
    if (IsTargetAbi(CORINFO_NATIVEAOT_ABI))
    {
        return fgExpandHelper<&Compiler::fgExpandThreadLocalAccessForCallNativeAOT>(
            false /* expand rarely run blocks for NativeAOT */);
    }

    if (opts.OptimizationDisabled())
    {
        JITDUMP("Optimizations aren't allowed - bail out.\n")
        return PhaseStatus::MODIFIED_NOTHING;
    }

    // The optimization comes with a codegen size increase, so skip it when size is preferred.
    // TODO: Replace with opts.compCodeOpt once it's fixed
    if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_SIZE_OPT))
    {
        JITDUMP("Optimized for size - bail out.\n")
        return PhaseStatus::MODIFIED_NOTHING;
    }

    return fgExpandHelper<&Compiler::fgExpandThreadLocalAccessForCall>(true);
}

//------------------------------------------------------------------------------
// fgExpandThreadLocalAccessForCallNativeAOT : Expand the access of tlsRoot needed
//  to access fields marked with [ThreadLocal].
//
// Arguments:
//    pBlock - Block containing the helper call to expand. If expansion is performed,
//             this is updated to the new block that was an outcome of block splitting.
//    stmt   - Statement containing the helper call
//    call   - The helper call
//
//
// Returns:
//    true if we expanded any field access, false otherwise (i.e. when the call is not
//    the CORINFO_HELP_READYTORUN_THREADSTATIC_BASE_NOCTOR helper).
//
// Notes:
//    Only valid for the NativeAOT ABI (asserted). The expansion is implemented for
//    64-bit targets only; other targets hit unreached().
//
bool Compiler::fgExpandThreadLocalAccessForCallNativeAOT(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    assert(IsTargetAbi(CORINFO_NATIVEAOT_ABI));
    BasicBlock*     block  = *pBlock;
    CorInfoHelpFunc helper = call->GetHelperNum();

    // Only the READYTORUN thread static base helper is expanded here.
    const bool isExpTLSFieldAccess = (helper == CORINFO_HELP_READYTORUN_THREADSTATIC_BASE_NOCTOR);
    if (!call->IsHelperCall() || !isExpTLSFieldAccess)
    {
        return false;
    }

    JITDUMP("Expanding thread static local access for [%06d] in " FMT_BB ":\n", dspTreeID(call), block->bbNum);
    DISPTREE(call);
    JITDUMP("\n");

    // Ask the EE for the TLS access details; the struct is zeroed first.
    CORINFO_THREAD_STATIC_INFO_NATIVEAOT threadStaticInfo;
    memset(&threadStaticInfo, 0, sizeof(CORINFO_THREAD_STATIC_INFO_NATIVEAOT));

    info.compCompHnd->getThreadLocalStaticInfo_NativeAOT(&threadStaticInfo);

    JITDUMP("tlsRootObject= %p\n", dspPtr(threadStaticInfo.tlsRootObject.addr));
    JITDUMP("tlsIndexObject= %p\n", dspPtr(threadStaticInfo.tlsIndexObject.addr));
    JITDUMP("offsetOfThreadLocalStoragePointer= %u\n", dspOffset(threadStaticInfo.offsetOfThreadLocalStoragePointer));
    JITDUMP("threadStaticBaseSlow= %p\n", dspPtr(threadStaticInfo.threadStaticBaseSlow.addr));

    // Split block right before the call tree
    BasicBlock* prevBb       = block;
    GenTree**   callUse      = nullptr;
    Statement*  newFirstStmt = nullptr;
    DebugInfo   debugInfo    = stmt->GetDebugInfo();
    block                    = fgSplitBlockBeforeTree(block, stmt, call, &newFirstStmt, &callUse);
    *pBlock                  = block;
    var_types callType       = call->TypeGet();
    assert(prevBb != nullptr && block != nullptr);

    // Local that holds the final tlsRoot value; it replaces the helper call at its use.
    unsigned finalLclNum = lvaGrabTemp(true DEBUGARG("Final offset"));
    // Note, `tlsRoot` refers to the TLS blob object, which is an unpinned managed object,
    // thus the type of the local is TYP_REF
    lvaTable[finalLclNum].lvType = TYP_REF;
    GenTree* finalLcl            = gtNewLclVarNode(finalLclNum);

    // Block ops inserted by the split need to be morphed here since we are after morph.
    // We cannot morph stmt yet as we may modify it further below, and the morphing
    // could invalidate callUse.
    while ((newFirstStmt != nullptr) && (newFirstStmt != stmt))
    {
        fgMorphStmtBlockOps(block, newFirstStmt);
        newFirstStmt = newFirstStmt->GetNextStmt();
    }

#ifdef TARGET_64BIT
    // prevBb (BBJ_NONE):                                               [weight: 1.0]
    //      ...
    //
    // tlsRootNullCondBB (BBJ_COND):                                    [weight: 1.0]
    //      fastPathValue = [tlsRootAddress]
    //      if (fastPathValue != nullptr)
    //          goto fastPathBb;
    //
    // fallbackBb (BBJ_ALWAYS):                                         [weight: 0]
    //      tlsRoot = HelperCall();
    //      goto block;
    //
    // fastPathBb(BBJ_ALWAYS):                                          [weight: 1.0]
    //      tlsRoot = fastPathValue;
    //
    // block (...):                                                     [weight: 1.0]
    //      use(tlsRoot);
    // ...

    // tlsRootAddr is built differently per OS/architecture below; every supported path
    // must assign it, unsupported combinations hit unreached().
    GenTree*               tlsRootAddr   = nullptr;
    CORINFO_GENERIC_HANDLE tlsRootObject = threadStaticInfo.tlsRootObject.handle;

    if (TargetOS::IsWindows)
    {
        // Mark this ICON as a TLS_HDL, codegen will use FS:[cns] or GS:[cns]
        GenTree* tlsValue = gtNewIconHandleNode(threadStaticInfo.offsetOfThreadLocalStoragePointer, GTF_ICON_TLS_HDL);
        tlsValue          = gtNewIndir(TYP_I_IMPL, tlsValue, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

        CORINFO_CONST_LOOKUP tlsIndexObject = threadStaticInfo.tlsIndexObject;

        // Load the 32-bit TLS index, widen it to native int and scale it by 8 (shift left by 3)
        // to form an offset into the array tlsValue points to.
        GenTree* dllRef = gtNewIconHandleNode((size_t)tlsIndexObject.handle, GTF_ICON_CONST_PTR);
        dllRef          = gtNewIndir(TYP_INT, dllRef, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);
        dllRef          = gtNewCastNode(TYP_I_IMPL, dllRef, true, TYP_I_IMPL);
        dllRef          = gtNewOperNode(GT_LSH, TYP_I_IMPL, dllRef, gtNewIconNode(3, TYP_I_IMPL));

        // Add the dllRef to produce thread local storage reference for coreclr
        tlsValue = gtNewOperNode(GT_ADD, TYP_I_IMPL, tlsValue, dllRef);

        // Base of coreclr's thread local storage
        tlsValue = gtNewIndir(TYP_I_IMPL, tlsValue, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

        // This resolves to an offset which is TYP_INT
        GenTree* tlsRootOffset = gtNewIconNode((size_t)tlsRootObject, TYP_INT);
        tlsRootOffset->gtFlags |= GTF_ICON_SECREL_OFFSET;

        // Add the tlsValue and tlsRootOffset to produce tlsRootAddr.
        tlsRootAddr = gtNewOperNode(GT_ADD, TYP_I_IMPL, tlsValue, tlsRootOffset);
    }
    else if (TargetOS::IsUnix)
    {
        if (TargetArchitecture::IsX64)
        {
            // Code sequence to access thread local variable on linux/x64:
            //      data16
            //      lea      rdi, 0x7FE5C418CD28  ; tlsRootObject
            //      data16 data16
            //      call     _tls_get_addr
            //
            // This sequence along with `data16` prefix is expected by the linker so it
            // will patch these with TLS access.
            GenTree* tls_get_addr_val =
                gtNewIconHandleNode((size_t)threadStaticInfo.tlsGetAddrFtnPtr.handle, GTF_ICON_FTN_ADDR);
            tls_get_addr_val->SetContained();

            GenTreeCall* tlsRefCall = gtNewIndCallNode(tls_get_addr_val, TYP_I_IMPL);
            tlsRefCall->gtFlags |= GTF_TLS_GET_ADDR;

            // This is an indirect call which takes an argument.
            // Populate and set the ABI appropriately.
            assert(tlsRootObject != 0);
            GenTree* tlsArg = gtNewIconNode((size_t)tlsRootObject, TYP_I_IMPL);
            tlsArg->gtFlags |= GTF_ICON_TLSGD_OFFSET;
            tlsRefCall->gtArgs.PushBack(this, NewCallArg::Primitive(tlsArg));

            // We are after morph, so the new call's args are morphed explicitly.
            fgMorphArgs(tlsRefCall);

            tlsRefCall->gtFlags |= GTF_EXCEPT | (tls_get_addr_val->gtFlags & GTF_GLOB_EFFECT);
            tlsRootAddr = tlsRefCall;
        }
        else if (TargetArchitecture::IsArm64)
        {
            /*
            x0 = adrp :tlsdesc:tlsRoot ; 1st parameter
            x0 += tlsdesc_lo12:tlsRoot ; update 1st parameter

            x1 = tpidr_el0             ; 2nd parameter

            x2 = [x0]                  ; call
            blr x2

            */

            GenTree* tlsRootOffset = gtNewIconHandleNode((size_t)tlsRootObject, GTF_ICON_TLS_HDL);
            tlsRootOffset->gtFlags |= GTF_ICON_TLSGD_OFFSET;

            // Unlike the x64 path, no explicit argument is pushed onto this call.
            GenTree*     tlsCallIndir = gtCloneExpr(tlsRootOffset);
            GenTreeCall* tlsRefCall   = gtNewIndCallNode(tlsCallIndir, TYP_I_IMPL);
            tlsRefCall->gtFlags |= GTF_TLS_GET_ADDR;
            fgMorphArgs(tlsRefCall);

            tlsRefCall->gtFlags |= GTF_EXCEPT | (tlsCallIndir->gtFlags & GTF_GLOB_EFFECT);
            tlsRootAddr = tlsRefCall;
        }
        else
        {
            unreached();
        }
    }
    else
    {
        unreached();
    }

    // Cache the TlsRootAddr value
    unsigned tlsRootAddrLclNum         = lvaGrabTemp(true DEBUGARG("TlsRootAddr access"));
    lvaTable[tlsRootAddrLclNum].lvType = TYP_I_IMPL;
    GenTree* tlsRootAddrDef            = gtNewStoreLclVarNode(tlsRootAddrLclNum, tlsRootAddr);
    GenTree* tlsRootAddrUse            = gtNewLclVarNode(tlsRootAddrLclNum);

    // See comments near finalLclNum above regarding TYP_REF
    GenTree* tlsRootVal = gtNewIndir(TYP_REF, tlsRootAddrUse, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

    // finalLcl = [tlsRootAddr]
    GenTree* tlsRootDef = gtNewStoreLclVarNode(finalLclNum, tlsRootVal);

    // if (finalLcl != null) goto fastPathBb
    GenTree* tlsRootNullCond = gtNewOperNode(GT_NE, TYP_INT, gtCloneExpr(finalLcl), gtNewIconNode(0, TYP_I_IMPL));
    tlsRootNullCond          = gtNewOperNode(GT_JTRUE, TYP_VOID, tlsRootNullCond);

    // tlsRootNullCondBB
    // The block starts with tlsRootAddrDef. Both inserts below go right after that first
    // statement, so the final statement order is: tlsRootAddrDef, tlsRootDef, tlsRootNullCond.
    BasicBlock* tlsRootNullCondBB = fgNewBBFromTreeAfter(BBJ_COND, prevBb, tlsRootAddrDef, debugInfo);
    fgInsertStmtAfter(tlsRootNullCondBB, tlsRootNullCondBB->firstStmt(), fgNewStmtFromTree(tlsRootNullCond));
    fgInsertStmtAfter(tlsRootNullCondBB, tlsRootNullCondBB->firstStmt(), fgNewStmtFromTree(tlsRootDef));

    CORINFO_CONST_LOOKUP threadStaticSlowHelper = threadStaticInfo.threadStaticBaseSlow;

    // Slow path: indirect call to the helper, passing the cached tlsRootAddr as its argument.
    // See comments near finalLclNum above regarding TYP_REF
    GenTreeCall* slowHelper =
        gtNewIndCallNode(gtNewIconHandleNode((size_t)threadStaticSlowHelper.addr, GTF_ICON_TLS_HDL), TYP_REF);
    GenTree* helperArg = gtClone(tlsRootAddrUse);
    slowHelper->gtArgs.PushBack(this, NewCallArg::Primitive(helperArg));
    fgMorphArgs(slowHelper);

    // fallbackBb
    GenTree*    fallbackValueDef = gtNewStoreLclVarNode(finalLclNum, slowHelper);
    BasicBlock* fallbackBb = fgNewBBFromTreeAfter(BBJ_ALWAYS, tlsRootNullCondBB, fallbackValueDef, debugInfo, true);

    // fastPathBb (created after fallbackBb)
    // NOTE(review): this stores finalLcl to itself, since the value was already stored
    // by tlsRootDef in tlsRootNullCondBB.
    GenTree*    fastPathValueDef = gtNewStoreLclVarNode(finalLclNum, gtCloneExpr(finalLcl));
    BasicBlock* fastPathBb       = fgNewBBFromTreeAfter(BBJ_ALWAYS, fallbackBb, fastPathValueDef, debugInfo, true);

    // The original use of the helper call now reads the final local.
    *callUse = finalLcl;

    fgMorphStmtBlockOps(block, stmt);
    gtUpdateStmtSideEffects(stmt);

    //
    // Update preds in all new blocks
    //
    // true edge = tlsRoot is non-null -> fastPathBb, false edge = null -> fallbackBb
    FlowEdge* const trueEdge  = fgAddRefPred(fastPathBb, tlsRootNullCondBB);
    FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, tlsRootNullCondBB);
    tlsRootNullCondBB->SetTrueEdge(trueEdge);
    tlsRootNullCondBB->SetFalseEdge(falseEdge);
    trueEdge->setLikelihood(1.0);
    falseEdge->setLikelihood(0.0);

    {
        FlowEdge* const newEdge = fgAddRefPred(block, fallbackBb);
        fallbackBb->SetTargetEdge(newEdge);
    }

    {
        FlowEdge* const newEdge = fgAddRefPred(block, fastPathBb);
        fastPathBb->SetTargetEdge(newEdge);
    }

    // Inherit the weights
    block->inheritWeight(prevBb);
    tlsRootNullCondBB->inheritWeight(prevBb);
    fastPathBb->inheritWeight(prevBb);

    // fallback will just execute first time
    fallbackBb->inheritWeightPercentage(tlsRootNullCondBB, 0);

    // prevBb now flows into the null-check block first.
    fgRedirectEdge(prevBb->TargetEdgeRef(), tlsRootNullCondBB);

    // All blocks are expected to be in the same EH region
    // NOTE(review): fallbackBb is not checked here - presumably an omission; confirm.
    assert(BasicBlock::sameEHRegion(prevBb, block));
    assert(BasicBlock::sameEHRegion(prevBb, tlsRootNullCondBB));
    assert(BasicBlock::sameEHRegion(prevBb, fastPathBb));

    JITDUMP("tlsRootNullCondBB: " FMT_BB "\n", tlsRootNullCondBB->bbNum);
    JITDUMP("fallbackBb: " FMT_BB "\n", fallbackBb->bbNum);
    JITDUMP("fastPathBb: " FMT_BB "\n", fastPathBb->bbNum);
#else
    // The expansion above is only implemented for 64-bit targets.
    unreached();

#endif // TARGET_64BIT

    return true;
}

//------------------------------------------------------------------------------
// fgExpandThreadLocalAccessForCall : Expand the CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED,
//  CORINFO_HELP_GETDYNAMIC_GCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED or
//  CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED2 helper calls, that access fields marked
//  with [ThreadLocal].
//
// Arguments:
//    pBlock - Block containing the helper call to expand. If expansion is performed,
//             this is updated to the new block that was an outcome of block splitting.
//    stmt   - Statement containing the helper call
//    call   - The helper call
//
// Returns:
//    true if we expanded any field access, false otherwise (i.e. the call is not one of the
//    helpers listed above).
//
// Notes:
//    A cache is stored in thread local storage (TLS) of coreclr. It maps the typeIndex (embedded in
//    the code at the JIT time) to the base of static blocks. This method generates code to
//    extract the TLS and get the entry at which the cache is stored. Then it checks if the typeIndex of
//    the enclosing type of the current field is present in the cache and, if yes, extracts the value that
//    can be directly accessed at the uses.
//    If the entry is not present, the helper is called, which would make an entry of current static block
//    in the cache.
//
//    For CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED2 no cache lookup and no fallback
//    helper call are emitted: the result is computed directly from the TLS value, the helper's argument
//    and offsetOfBaseOfThreadLocalData.
//
bool Compiler::fgExpandThreadLocalAccessForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    assert(!IsAot());

    BasicBlock* block = *pBlock;

    CorInfoHelpFunc helper = call->GetHelperNum();

    // Only the three thread-static base helpers below are expanded here.
    if ((helper != CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED) &&
        (helper != CORINFO_HELP_GETDYNAMIC_GCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED) &&
        (helper != CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED2))
    {
        return false;
    }

    if (TargetOS::IsUnix)
    {
#if defined(TARGET_ARM) || !defined(TARGET_64BIT)
        // On Arm, Thread execution blocks are accessed using co-processor registers and instructions such
        // as MRC and MCR are used to access them. We do not support them and so should never optimize the
        // field access using TLS.
        // Note that this condition also covers every other 32-bit Unix target (e.g. x86).
        noway_assert(!"Unsupported scenario of optimizing TLS access on Linux Arm32/x86");
#endif
    }
    else
    {
#ifdef TARGET_ARM
        // On Arm, Thread execution blocks are accessed using co-processor registers and instructions such
        // as MRC and MCR are used to access them. We do not support them and so should never optimize the
        // field access using TLS.
        noway_assert(!"Unsupported scenario of optimizing TLS access on Windows Arm32");
#endif
    }

    JITDUMP("Expanding thread static local access for [%06d] in " FMT_BB ":\n", dspTreeID(call), block->bbNum);
    DISPTREE(call);
    JITDUMP("\n");

    // Ask the EE for the TLS layout information (zero-initialized first so unset fields read as 0).
    CORINFO_THREAD_STATIC_BLOCKS_INFO threadStaticBlocksInfo;
    memset(&threadStaticBlocksInfo, 0, sizeof(CORINFO_THREAD_STATIC_BLOCKS_INFO));

    info.compCompHnd->getThreadLocalStaticBlocksInfo(&threadStaticBlocksInfo);

    // NOTE(review): the ':' below follows the newline, so it is printed at the start of the next dump
    // line - confirm whether ":\n" was intended.
    JITDUMP("getThreadLocalStaticBlocksInfo\n:");
    JITDUMP("tlsIndex= %p\n", dspPtr(threadStaticBlocksInfo.tlsIndex.addr));
    JITDUMP("tlsGetAddrFtnPtr= %p\n", dspPtr(threadStaticBlocksInfo.tlsGetAddrFtnPtr));
    JITDUMP("tlsIndexObject= %p\n", dspPtr(threadStaticBlocksInfo.tlsIndexObject));
    JITDUMP("threadVarsSection= %p\n", dspPtr(threadStaticBlocksInfo.threadVarsSection));
    JITDUMP("offsetOfThreadLocalStoragePointer= %u\n",
            dspOffset(threadStaticBlocksInfo.offsetOfThreadLocalStoragePointer));
    JITDUMP("offsetOfMaxThreadStaticBlocks= %u\n", dspOffset(threadStaticBlocksInfo.offsetOfMaxThreadStaticBlocks));
    JITDUMP("offsetOfThreadStaticBlocks= %u\n", dspOffset(threadStaticBlocksInfo.offsetOfThreadStaticBlocks));
    JITDUMP("offsetOfBaseOfThreadLocalData= %u\n", dspOffset(threadStaticBlocksInfo.offsetOfBaseOfThreadLocalData));

    // The helper takes a single argument (referred to as the type index below).
    assert(call->gtArgs.CountArgs() == 1);

    // Split block right before the call tree
    BasicBlock* prevBb       = block;
    GenTree**   callUse      = nullptr;
    Statement*  newFirstStmt = nullptr;
    DebugInfo   debugInfo    = stmt->GetDebugInfo();
    block                    = fgSplitBlockBeforeTree(block, stmt, call, &newFirstStmt, &callUse);
    *pBlock                  = block;
    var_types callType       = call->TypeGet();
    assert(prevBb != nullptr && block != nullptr);

    // Block ops inserted by the split need to be morphed here since we are after morph.
    // We cannot morph stmt yet as we may modify it further below, and the morphing
    // could invalidate callUse.
    while ((newFirstStmt != nullptr) && (newFirstStmt != stmt))
    {
        fgMorphStmtBlockOps(block, newFirstStmt);
        newFirstStmt = newFirstStmt->GetNextStmt();
    }

    GenTreeLclVar* threadStaticBlockLcl = nullptr;

    // Grab a temp to store result (it's assigned from either fastPathBb or fallbackBb)
    unsigned threadStaticBlockLclNum         = lvaGrabTemp(true DEBUGARG("TLS field access"));
    lvaTable[threadStaticBlockLclNum].lvType = callType;
    threadStaticBlockLcl                     = gtNewLclvNode(threadStaticBlockLclNum, callType);

    // Replace the use of the call in the original statement with a use of the result temp.
    *callUse = gtClone(threadStaticBlockLcl);

    fgMorphStmtBlockOps(block, stmt);
    gtUpdateStmtSideEffects(stmt);

    // Temp that caches the TLS value; the tree computing it (tlsValue) is built per-OS below.
    GenTree* tlsValue                   = nullptr;
    unsigned tlsLclNum                  = lvaGrabTemp(true DEBUGARG("TLS access"));
    lvaTable[tlsLclNum].lvType          = TYP_I_IMPL;
    GenTree* maxThreadStaticBlocksValue = nullptr;
    GenTree* threadStaticBlocksValue    = nullptr;
    GenTree* tlsValueDef                = nullptr;

    if (TargetOS::IsWindows)
    {
        size_t   tlsIndexValue = (size_t)threadStaticBlocksInfo.tlsIndex.addr;
        GenTree* dllRef        = nullptr;
        if (tlsIndexValue != 0)
        {
            // Byte offset of this module's slot: tlsIndex scaled by the pointer size.
            dllRef = gtNewIconHandleNode(tlsIndexValue * TARGET_POINTER_SIZE, GTF_ICON_TLS_HDL);
        }

        // Mark this ICON as a TLS_HDL, codegen will use FS:[cns] or GS:[cns]
        tlsValue = gtNewIconHandleNode(threadStaticBlocksInfo.offsetOfThreadLocalStoragePointer, GTF_ICON_TLS_HDL);
        tlsValue = gtNewIndir(TYP_I_IMPL, tlsValue, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

        if (dllRef != nullptr)
        {
            // Add the dllRef to produce thread local storage reference for coreclr
            tlsValue = gtNewOperNode(GT_ADD, TYP_I_IMPL, tlsValue, dllRef);
        }

        // Base of coreclr's thread local storage
        tlsValue = gtNewIndir(TYP_I_IMPL, tlsValue, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);
    }
    else if (TargetOS::IsApplePlatform)
    {
        // For Apple x64/arm64, we need to get the address of relevant __thread_vars section of
        // the thread local variable `t_ThreadStatics`. Address of `tlv_get_address` is stored
        // in this entry, which we dereference and invoke it, passing the __thread_vars address
        // present in `threadVarsSection`.
        //
        // Code sequence to access thread local variable on Apple/x64:
        //
        //      mov rdi, threadVarsSection
        //      call     [rdi]
        //
        // Code sequence to access thread local variable on Apple/arm64:
        //
        //      mov x0, threadVarsSection
        //      mov x1, [x0]
        //      blr x1
        //
        size_t   threadVarsSectionVal = (size_t)threadStaticBlocksInfo.threadVarsSection;
        GenTree* tls_get_addr_val     = gtNewIconHandleNode(threadVarsSectionVal, GTF_ICON_FTN_ADDR);

        // Load the function pointer stored at threadVarsSection.
        tls_get_addr_val = gtNewIndir(TYP_I_IMPL, tls_get_addr_val, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

        tlsValue                = gtNewIndCallNode(tls_get_addr_val, TYP_I_IMPL);
        GenTreeCall* tlsRefCall = tlsValue->AsCall();

        // This is a call which takes an argument.
        // Populate and set the ABI appropriately.
        assert(opts.altJit || threadVarsSectionVal != 0);
        GenTree* tlsArg = gtNewIconNode(threadVarsSectionVal, TYP_I_IMPL);
        tlsRefCall->gtArgs.PushBack(this, NewCallArg::Primitive(tlsArg));

        // The call is created after morph, so its args have to be morphed explicitly.
        fgMorphArgs(tlsRefCall);

        tlsRefCall->gtFlags |= GTF_EXCEPT | (tls_get_addr_val->gtFlags & GTF_GLOB_EFFECT);
    }
    else if (TargetOS::IsUnix)
    {
#if defined(TARGET_AMD64)
        // Code sequence to access thread local variable on linux/x64:
        //
        //      mov      rdi, 0x7FE5C418CD28  ; tlsIndexObject
        //      mov      rax, 0x7FE5C47AFDB0  ; _tls_get_addr
        //      call     rax
        //
        GenTree* tls_get_addr_val =
            gtNewIconHandleNode((size_t)threadStaticBlocksInfo.tlsGetAddrFtnPtr, GTF_ICON_FTN_ADDR);
        tlsValue                = gtNewIndCallNode(tls_get_addr_val, TYP_I_IMPL);
        GenTreeCall* tlsRefCall = tlsValue->AsCall();

        // This is an indirect call which takes an argument.
        // Populate and set the ABI appropriately.
        assert(opts.altJit || threadStaticBlocksInfo.tlsIndexObject != 0);
        GenTree* tlsArg = gtNewIconNode((size_t)threadStaticBlocksInfo.tlsIndexObject, TYP_I_IMPL);
        tlsRefCall->gtArgs.PushBack(this, NewCallArg::Primitive(tlsArg));

        // The call is created after morph, so its args have to be morphed explicitly.
        fgMorphArgs(tlsRefCall);

        tlsRefCall->gtFlags |= GTF_EXCEPT | (tls_get_addr_val->gtFlags & GTF_GLOB_EFFECT);
        // NOTE(review): this UNIX_X86_ABI section is nested inside the TARGET_AMD64 branch - confirm
        // whether it can ever be compiled in.
#ifdef UNIX_X86_ABI
        tlsRefCall->gtFlags &= ~GTF_CALL_POP_ARGS;
#endif // UNIX_X86_ABI
#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
        // Code sequence to access thread local variable on linux/arm64:
        //
        //      mrs xt, tpidr_elf0
        //      mov xd, [xt+cns]
        //
        // Code sequence to access thread local variable on linux/loongarch64:
        //
        //      ori, targetReg, $tp, 0
        //      load rd, targetReg, cns
        //
        // Code sequence to access thread local variable on linux/riscv64:
        //
        //      mov targetReg, $tp
        //      ld rd, targetReg(cns)
        tlsValue = gtNewIconHandleNode(0, GTF_ICON_TLS_HDL);
#else
        assert(!"Unsupported scenario of optimizing TLS access on Linux Arm32/x86");
#endif
    }

    // Cache the tls value
    tlsValueDef                              = gtNewStoreLclVarNode(tlsLclNum, tlsValue);
    GenTree* tlsLclValueUse                  = gtNewLclVarNode(tlsLclNum);
    GenTree* typeThreadStaticBlockIndexValue = call->gtArgs.GetArgByIndex(0)->GetNode();
    assert(genActualType(typeThreadStaticBlockIndexValue) == TYP_INT);

    if (helper == CORINFO_HELP_GETDYNAMIC_NONGCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED2)
    {
        // In this case we can entirely remove the block which uses the helper call, and just get the TLS pointer
        // directly

        // prevBb (BBJ_ALWAYS):                                             [weight: 1.0]
        //      ...
        //
        // tlsBaseComputeBB (BBJ_ALWAYS):                                   [weight: 1.0]
        //      tlsValue         = tls_access_code
        //      threadStaticBase = tlsValue + (helperArg + offsetOfBaseOfThreadLocalData)
        //
        // block (...):                                                     [weight: 1.0]
        //      use(threadStaticBase);
        // ...

        typeThreadStaticBlockIndexValue = gtCloneExpr(typeThreadStaticBlockIndexValue);
#ifdef TARGET_64BIT
        // Cast the TYP_INT argument to TYP_I_IMPL so it can take part in the pointer-sized ADDs below.
        typeThreadStaticBlockIndexValue = gtNewCastNode(TYP_I_IMPL, typeThreadStaticBlockIndexValue, true, TYP_I_IMPL);
        // Usually a constant, try to fold
        typeThreadStaticBlockIndexValue = gtFoldExpr(typeThreadStaticBlockIndexValue);
#endif

        // The helper argument is added (unscaled) to offsetOfBaseOfThreadLocalData.
        GenTree* offset =
            gtNewOperNode(GT_ADD, TYP_I_IMPL, typeThreadStaticBlockIndexValue,
                          gtNewIconNode(threadStaticBlocksInfo.offsetOfBaseOfThreadLocalData, TYP_I_IMPL));
        offset = gtFoldExpr(offset);

        GenTree* threadStaticBase      = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(tlsLclValueUse), offset);
        GenTree* tlsStaticBaseStoreLcl = gtNewStoreLclVarNode(threadStaticBlockLclNum, threadStaticBase);

        // tlsBaseComputeBB holds two statements: the TLS value store followed by the result store.
        BasicBlock* tlsBaseComputeBB = fgNewBBFromTreeAfter(BBJ_ALWAYS, prevBb, tlsValueDef, debugInfo, true);
        fgInsertStmtAfter(tlsBaseComputeBB, tlsBaseComputeBB->firstStmt(), fgNewStmtFromTree(tlsStaticBaseStoreLcl));

        //
        // Update preds in all new blocks
        //

        FlowEdge* const newEdge = fgAddRefPred(block, tlsBaseComputeBB);
        tlsBaseComputeBB->SetTargetEdge(newEdge);

        assert(prevBb->KindIs(BBJ_ALWAYS));
        fgRedirectEdge(prevBb->TargetEdgeRef(), tlsBaseComputeBB);

        // Inherit the weights
        block->inheritWeight(prevBb);
        tlsBaseComputeBB->inheritWeight(prevBb);

        // All blocks are expected to be in the same EH region
        assert(BasicBlock::sameEHRegion(prevBb, block));
        assert(BasicBlock::sameEHRegion(prevBb, tlsBaseComputeBB));
    }
    else
    {
        size_t offsetOfThreadStaticBlocksVal    = threadStaticBlocksInfo.offsetOfThreadStaticBlocks;
        size_t offsetOfMaxThreadStaticBlocksVal = threadStaticBlocksInfo.offsetOfMaxThreadStaticBlocks;

        // Create tree for "maxThreadStaticBlocks = tls[offsetOfMaxThreadStaticBlocks]"
        GenTree* offsetOfMaxThreadStaticBlocks = gtNewIconNode(offsetOfMaxThreadStaticBlocksVal, TYP_I_IMPL);
        GenTree* maxThreadStaticBlocksRef =
            gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(tlsLclValueUse), offsetOfMaxThreadStaticBlocks);
        maxThreadStaticBlocksValue =
            gtNewIndir(TYP_INT, maxThreadStaticBlocksRef, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

        // Create tree for "threadStaticBlocks = tls[offsetOfThreadStaticBlocks]"
        GenTree* threadStaticBlocksRef = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(tlsLclValueUse),
                                                       gtNewIconNode(offsetOfThreadStaticBlocksVal, TYP_I_IMPL));
        threadStaticBlocksValue = gtNewIndir(TYP_REF, threadStaticBlocksRef, GTF_IND_NONFAULTING | GTF_IND_INVARIANT);

        // Create tree for "if (maxThreadStaticBlocks <= typeIndex)"
        GenTree* maxThreadStaticBlocksCond =
            gtNewOperNode(GT_LE, TYP_INT, maxThreadStaticBlocksValue, gtCloneExpr(typeThreadStaticBlockIndexValue));
        maxThreadStaticBlocksCond = gtNewOperNode(GT_JTRUE, TYP_VOID, maxThreadStaticBlocksCond);

        // Create tree to "threadStaticBlockValue = threadStaticBlockBase[typeIndex]"
        // (the index is scaled by the pointer size to form a byte offset)
        typeThreadStaticBlockIndexValue = gtNewOperNode(GT_MUL, TYP_INT, gtCloneExpr(typeThreadStaticBlockIndexValue),
                                                        gtNewIconNode(TARGET_POINTER_SIZE, TYP_INT));
        // Usually a constant, try to fold
        typeThreadStaticBlockIndexValue = gtFoldExpr(typeThreadStaticBlockIndexValue);
#ifdef TARGET_64BIT
        // Cast the TYP_INT byte offset to TYP_I_IMPL before adding it to the base.
        typeThreadStaticBlockIndexValue = gtNewCastNode(TYP_I_IMPL, typeThreadStaticBlockIndexValue, true, TYP_I_IMPL);

        // Usually a constant, try to fold
        typeThreadStaticBlockIndexValue = gtFoldExpr(typeThreadStaticBlockIndexValue);
#endif
        GenTree* typeThreadStaticBlockRef =
            gtNewOperNode(GT_ADD, TYP_BYREF, threadStaticBlocksValue, typeThreadStaticBlockIndexValue);
        GenTree* typeThreadStaticBlockValue = gtNewIndir(TYP_BYREF, typeThreadStaticBlockRef, GTF_IND_NONFAULTING);

        // Cache the threadStaticBlock value
        unsigned threadStaticBlockBaseLclNum         = lvaGrabTemp(true DEBUGARG("ThreadStaticBlockBase access"));
        lvaTable[threadStaticBlockBaseLclNum].lvType = TYP_BYREF;
        GenTree* threadStaticBlockBaseDef =
            gtNewStoreLclVarNode(threadStaticBlockBaseLclNum, typeThreadStaticBlockValue);
        GenTree* threadStaticBlockBaseLclValueUse = gtNewLclVarNode(threadStaticBlockBaseLclNum);

        // Create tree for "if (threadStaticBlockValue != nullptr)"
        GenTree* threadStaticBlockNullCond =
            gtNewOperNode(GT_NE, TYP_INT, threadStaticBlockBaseLclValueUse, gtNewIconNode(0, TYP_I_IMPL));
        threadStaticBlockNullCond = gtNewOperNode(GT_JTRUE, TYP_VOID, threadStaticBlockNullCond);

        // prevBb (BBJ_ALWAYS):                                             [weight: 1.0]
        //      ...
        //
        // maxThreadStaticBlocksCondBB (BBJ_COND):                          [weight: 1.0]
        //      tlsValue = tls_access_code
        //      if (maxThreadStaticBlocks <= typeIndex)
        //          goto fallbackBb;
        //
        // threadStaticBlockNullCondBB (BBJ_COND):                          [weight: 1.0]
        //      fastPathValue = t_threadStaticBlocks[typeIndex]
        //      if (fastPathValue != nullptr)
        //          goto fastPathBb;
        //
        // fallbackBb (BBJ_ALWAYS):                                         [weight: 0]
        //      threadStaticBlockBase = HelperCall();
        //      goto block;
        //
        // fastPathBb (BBJ_ALWAYS):                                         [weight: 1.0]
        //      threadStaticBlockBase = fastPathValue;
        //
        // block (...):                                                     [weight: 1.0]
        //      use(threadStaticBlockBase);

        // maxThreadStaticBlocksCondBB

        // maxThreadStaticBlocksCondBB conditionally jumps to fallbackBb, but fallbackBb must be initialized last
        // so it can be placed after it. So set the jump target later.
        BasicBlock* maxThreadStaticBlocksCondBB = fgNewBBFromTreeAfter(BBJ_COND, prevBb, tlsValueDef, debugInfo);

        fgInsertStmtAfter(maxThreadStaticBlocksCondBB, maxThreadStaticBlocksCondBB->firstStmt(),
                          fgNewStmtFromTree(maxThreadStaticBlocksCond));

        // Similarly, set threadStaticBlockNullCondBB to jump to fastPathBb once the latter exists.
        BasicBlock* threadStaticBlockNullCondBB =
            fgNewBBFromTreeAfter(BBJ_COND, maxThreadStaticBlocksCondBB, threadStaticBlockBaseDef, debugInfo);
        fgInsertStmtAfter(threadStaticBlockNullCondBB, threadStaticBlockNullCondBB->firstStmt(),
                          fgNewStmtFromTree(threadStaticBlockNullCond));

        // fallbackBb: stores the original helper call's result into the result temp.
        GenTree*    fallbackValueDef = gtNewStoreLclVarNode(threadStaticBlockLclNum, call);
        BasicBlock* fallbackBb =
            fgNewBBFromTreeAfter(BBJ_ALWAYS, threadStaticBlockNullCondBB, fallbackValueDef, debugInfo, true);

        // fastPathBb: stores the cached (non-null) block base into the result temp.
        GenTree* fastPathValueDef =
            gtNewStoreLclVarNode(threadStaticBlockLclNum, gtCloneExpr(threadStaticBlockBaseLclValueUse));
        BasicBlock* fastPathBb = fgNewBBFromTreeAfter(BBJ_ALWAYS, fallbackBb, fastPathValueDef, debugInfo, true);

        //
        // Update preds in all new blocks
        //
        assert(prevBb->KindIs(BBJ_ALWAYS));
        fgRedirectEdge(prevBb->TargetEdgeRef(), maxThreadStaticBlocksCondBB);

        {
            // maxThreadStaticBlocksCondBB: true -> fallbackBb (unlikely), false -> threadStaticBlockNullCondBB
            FlowEdge* const trueEdge  = fgAddRefPred(fallbackBb, maxThreadStaticBlocksCondBB);
            FlowEdge* const falseEdge = fgAddRefPred(threadStaticBlockNullCondBB, maxThreadStaticBlocksCondBB);
            maxThreadStaticBlocksCondBB->SetTrueEdge(trueEdge);
            maxThreadStaticBlocksCondBB->SetFalseEdge(falseEdge);
            trueEdge->setLikelihood(0.0);
            falseEdge->setLikelihood(1.0);
        }

        {
            // threadStaticBlockNullCondBB: true -> fastPathBb (likely), false -> fallbackBb
            FlowEdge* const trueEdge  = fgAddRefPred(fastPathBb, threadStaticBlockNullCondBB);
            FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, threadStaticBlockNullCondBB);
            threadStaticBlockNullCondBB->SetTrueEdge(trueEdge);
            threadStaticBlockNullCondBB->SetFalseEdge(falseEdge);
            trueEdge->setLikelihood(1.0);
            falseEdge->setLikelihood(0.0);
        }

        {
            // fastPathBb -> block
            FlowEdge* const newEdge = fgAddRefPred(block, fastPathBb);
            fastPathBb->SetTargetEdge(newEdge);
        }

        {
            // fallbackBb -> block
            FlowEdge* const newEdge = fgAddRefPred(block, fallbackBb);
            fallbackBb->SetTargetEdge(newEdge);
        }

        // Inherit the weights
        block->inheritWeight(prevBb);
        maxThreadStaticBlocksCondBB->inheritWeight(prevBb);
        threadStaticBlockNullCondBB->inheritWeight(prevBb);
        fastPathBb->inheritWeight(prevBb);

        // fallback will just execute first time
        fallbackBb->inheritWeightPercentage(prevBb, 0);

        // All blocks are expected to be in the same EH region
        assert(BasicBlock::sameEHRegion(prevBb, block));
        assert(BasicBlock::sameEHRegion(prevBb, maxThreadStaticBlocksCondBB));
        assert(BasicBlock::sameEHRegion(prevBb, threadStaticBlockNullCondBB));
        assert(BasicBlock::sameEHRegion(prevBb, fastPathBb));
    }

    return true;
}

//------------------------------------------------------------------------------
// fgExpandHelper: Walk the method's blocks and expand helper calls using ExpansionFunction.
//
// Type parameters:
//    ExpansionFunction - Compiler member function that is forwarded to fgExpandHelperForBlock.
//
// Arguments:
//    skipRarelyRunBlocks - if true, blocks reporting isRunRarely() are not scanned.
//
// Returns:
//    PhaseStatus::MODIFIED_EVERYTHING if at least one expansion happened,
//    PhaseStatus::MODIFIED_NOTHING otherwise.
//
// Notes:
//    The DFS tree is invalidated whenever anything was expanded.
//
template <bool (Compiler::*ExpansionFunction)(BasicBlock**, Statement*, GenTreeCall*)>
PhaseStatus Compiler::fgExpandHelper(bool skipRarelyRunBlocks)
{
    bool expandedAny = false;

    BasicBlock* block = fgFirstBB;
    while (block != nullptr)
    {
        // Skipping rarely run blocks is purely a throughput optimization.
        if (!skipRarelyRunBlocks || !block->isRunRarely())
        {
            // Each successful expansion moves 'block' to the block produced by the split;
            // keep re-scanning that block until no more candidates are found in it.
            INDEBUG(BasicBlock* origBlock = block);
            while (fgExpandHelperForBlock<ExpansionFunction>(&block))
            {
                expandedAny = true;
#ifdef DEBUG
                assert(origBlock != block);
                origBlock = block;
#endif
            }
        }

        block = block->Next();
    }

    if (!expandedAny)
    {
        return PhaseStatus::MODIFIED_NOTHING;
    }

    fgInvalidateDfsTree();
    return PhaseStatus::MODIFIED_EVERYTHING;
}

//------------------------------------------------------------------------------
// fgExpandHelperForBlock: Scan the non-phi statements of the block and invoke
//    ExpansionFunction on every call node until one of them gets expanded.
//
// Type parameters:
//    ExpansionFunction - Compiler member function that attempts to expand a single call.
//
// Arguments:
//    pBlock - Block to scan. It is passed through to ExpansionFunction, which may
//             update it (e.g. to the new block produced by block splitting).
//
// Returns:
//    true as soon as ExpansionFunction reports an expansion, false if no call was expanded.
//
template <bool (Compiler::*ExpansionFunction)(BasicBlock**, Statement*, GenTreeCall*)>
bool Compiler::fgExpandHelperForBlock(BasicBlock** pBlock)
{
    for (Statement* const stmt : (*pBlock)->NonPhiStatements())
    {
        // TP: only walk the tree list of statements whose root is flagged as containing a call
        const bool stmtHasCall = (stmt->GetRootNode()->gtFlags & GTF_CALL) != 0;
        if (stmtHasCall)
        {
            for (GenTree* const node : stmt->TreeList())
            {
                if (node->IsCall() && (this->*ExpansionFunction)(pBlock, stmt, node->AsCall()))
                {
                    return true;
                }
            }
        }
    }

    return false;
}

//------------------------------------------------------------------------------
// fgExpandStaticInit: Partially expand static initialization calls, e.g.:
//
//    tmp = CORINFO_HELP_X_NONGCSTATIC_BASE();
//
// into:
//
//    if (!isClassAlreadyInited)
//        CORINFO_HELP_X_NONGCSTATIC_BASE();
//    tmp = fastPath;
//
// Returns:
//    PhaseStatus indicating what, if anything, was changed.
//
// Notes:
//    The expansion is skipped when the method has no static init helper calls, or when
//    optimizations are disabled and the target ABI is not NativeAOT.
//
PhaseStatus Compiler::fgExpandStaticInit()
{
    if (!doesMethodHaveStaticInit())
    {
        // TP: nothing to expand in the current method
        JITDUMP("Nothing to expand.\n")
        return PhaseStatus::MODIFIED_NOTHING;
    }

    // Always expand for NativeAOT, see
    // https://github.com/dotnet/runtime/issues/68278#issuecomment-1543322819
    const bool isNativeAOT = IsTargetAbi(CORINFO_NATIVEAOT_ABI);
    if (!isNativeAOT)
    {
        if (opts.OptimizationDisabled())
        {
            JITDUMP("Optimizations aren't allowed - bail out.\n")
            return PhaseStatus::MODIFIED_NOTHING;
        }
    }

    // Rarely run blocks are skipped only when not compiling for NativeAOT.
    const bool skipRarelyRunBlocks = !isNativeAOT;
    return fgExpandHelper<&Compiler::fgExpandStaticInitForCall>(skipRarelyRunBlocks);
}

//------------------------------------------------------------------------------
// fgExpandStaticInitForCall: Partially expand given static initialization call.
//    Also, see fgExpandStaticInit's comments.
//
// Arguments:
//    pBlock - Block containing the helper call to expand. If expansion is performed,
//             this is updated to the new block that was an outcome of block splitting.
//    stmt   - Statement containing the helper call
//    call   - The helper call
//
// Returns:
//    true if a static initialization was expanded
//
// Notes:
//    call->gtInitClsHnd is reset to NO_CLASS_HANDLE once the call has been expanded, which
//    marks the call as already visited.
//
bool Compiler::fgExpandStaticInitForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    BasicBlock* block = *pBlock;
    if (!call->IsHelperCall())
    {
        return false;
    }

    bool                    isGc       = false;
    StaticHelperReturnValue retValKind = {};
    if (!IsStaticHelperEligibleForExpansion(call, &isGc, &retValKind))
    {
        return false;
    }

    assert(!call->IsTailCall());

    if (call->gtInitClsHnd == NO_CLASS_HANDLE)
    {
        assert(!"helper call was created without gtInitClsHnd or already visited");
        return false;
    }

    // Ask the EE where the "is class initialized" flag lives.
    int                  isInitOffset = 0;
    CORINFO_CONST_LOOKUP flagAddr     = {};
    if (!info.compCompHnd->getIsClassInitedFlagAddress(call->gtInitClsHnd, &flagAddr, &isInitOffset))
    {
        JITDUMP("getIsClassInitedFlagAddress returned false - bail out.\n")
        return false;
    }

    // The static base address is only needed when the helper's return value is the static base pointer.
    CORINFO_CONST_LOOKUP staticBaseAddr = {};
    if ((retValKind == SHRV_STATIC_BASE_PTR) &&
        !info.compCompHnd->getStaticBaseAddress(call->gtInitClsHnd, isGc, &staticBaseAddr))
    {
        JITDUMP("getStaticBaseAddress returned false - bail out.\n")
        return false;
    }

    JITDUMP("Expanding static initialization for '%s', call: [%06d] in " FMT_BB "\n",
            eeGetClassName(call->gtInitClsHnd), dspTreeID(call), block->bbNum);

    DebugInfo debugInfo = stmt->GetDebugInfo();

    // Split block right before the call tree
    BasicBlock* prevBb       = block;
    GenTree**   callUse      = nullptr;
    Statement*  newFirstStmt = nullptr;
    block                    = fgSplitBlockBeforeTree(block, stmt, call, &newFirstStmt, &callUse);
    *pBlock                  = block;
    assert(prevBb != nullptr && block != nullptr);

    // Block ops inserted by the split need to be morphed here since we are after morph.
    // We cannot morph stmt yet as we may modify it further below, and the morphing
    // could invalidate callUse.
    while ((newFirstStmt != nullptr) && (newFirstStmt != stmt))
    {
        fgMorphStmtBlockOps(block, newFirstStmt);
        newFirstStmt = newFirstStmt->GetNextStmt();
    }

    //
    // Create new blocks. Essentially, we want to transform this:
    //
    //   staticBase = helperCall();
    //
    // into:
    //
    //   if (!isInitialized)
    //   {
    //       helperCall(); // we don't use its return value
    //   }
    //   staticBase = fastPath;
    //

    // The initialization check looks like this for JIT:
    //
    // *  JTRUE     void
    // \--*  EQ        int
    //    +--*  AND       int
    //    |  +--*  IND       int
    //    |  |  \--*  CNS_INT(h) long   0x.... const ptr
    //    |  \--*  CNS_INT   int    1 (bit mask)
    //    \--*  CNS_INT   int    1
    //
    // For NativeAOT it's:
    //
    // *  JTRUE     void
    // \--*  EQ        int
    //    +--*  IND       nint
    //    |  \--*  ADD       long
    //    |     +--*  CNS_INT(h) long   0x.... const ptr
    //    |     \--*  CNS_INT   int    -8 (offset)
    //    \--*  CNS_INT   int    0
    //
    // The flag address is embedded directly as a constant below, so it must not require an indirection.
    assert(flagAddr.accessType == IAT_VALUE);

    GenTree* cachedStaticBase = nullptr;
    GenTree* isInitedActualValueNode;
    GenTree* isInitedExpectedValue;
    if (IsTargetAbi(CORINFO_NATIVEAOT_ABI))
    {
        GenTree* baseAddr = gtNewIconHandleNode((size_t)flagAddr.addr, GTF_ICON_GLOBAL_PTR);

        // Save it to a temp - we'll be using its value for the replacementNode.
        // This leads to some size savings on NativeAOT
        if ((staticBaseAddr.addr == flagAddr.addr) && (staticBaseAddr.accessType == flagAddr.accessType))
        {
            cachedStaticBase = fgInsertCommaFormTemp(&baseAddr);
        }

        // Don't fold ADD(CNS1, CNS2) here since the result won't be reloc-friendly for AOT
        GenTree* offsetNode     = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseAddr, gtNewIconNode(isInitOffset, TYP_I_IMPL));
        isInitedActualValueNode = gtNewIndir(TYP_I_IMPL, offsetNode, GTF_IND_NONFAULTING | GTF_IND_VOLATILE);

        // 0 means "initialized" on NativeAOT
        isInitedExpectedValue = gtNewIconNode(0, TYP_I_IMPL);
    }
    else
    {
        assert(isInitOffset == 0);

        // Volatile load of the flag, additionally marked as having an ordering side effect.
        isInitedActualValueNode = gtNewIndOfIconHandleNode(TYP_INT, (size_t)flagAddr.addr, GTF_ICON_GLOBAL_PTR);
        isInitedActualValueNode->gtFlags |= GTF_IND_VOLATILE;
        isInitedActualValueNode->SetHasOrderingSideEffect();

        // Check ClassInitFlags::INITIALIZED_FLAG bit
        isInitedActualValueNode = gtNewOperNode(GT_AND, TYP_INT, isInitedActualValueNode, gtNewIconNode(1));
        isInitedExpectedValue   = gtNewIconNode(1);
    }

    // isInitedBb: "if (actual == expected) goto block;" - i.e. skip the helper when already initialized.
    GenTree* isInitedCmp = gtNewOperNode(GT_EQ, TYP_INT, isInitedActualValueNode, isInitedExpectedValue);
    isInitedCmp->gtFlags |= GTF_RELOP_JMP_USED;
    BasicBlock* isInitedBb =
        fgNewBBFromTreeAfter(BBJ_COND, prevBb, gtNewOperNode(GT_JTRUE, TYP_VOID, isInitedCmp), debugInfo);

    // Fallback basic block
    // TODO-CQ: for JIT we can replace the original call with CORINFO_HELP_INITCLASS
    // that only accepts a single argument
    BasicBlock* helperCallBb = fgNewBBFromTreeAfter(BBJ_ALWAYS, isInitedBb, call, debugInfo, true);

    GenTree* replacementNode = nullptr;
    if (retValKind == SHRV_STATIC_BASE_PTR)
    {
        // Replace the call with a constant pointer to the statics base
        assert(staticBaseAddr.addr != nullptr);

        // Use local if the address is already materialized and cached
        if (cachedStaticBase != nullptr)
        {
            assert(staticBaseAddr.accessType == IAT_VALUE);
            replacementNode = cachedStaticBase;
        }
        else if (staticBaseAddr.accessType == IAT_VALUE)
        {
            // The address itself is the static base.
            replacementNode = gtNewIconHandleNode((size_t)staticBaseAddr.addr, GTF_ICON_STATIC_HDL);
        }
        else
        {
            // The static base has to be loaded from the given address.
            assert(staticBaseAddr.accessType == IAT_PVALUE);
            replacementNode = gtNewIndOfIconHandleNode(TYP_I_IMPL, (size_t)staticBaseAddr.addr, GTF_ICON_GLOBAL_PTR);
        }
    }

    if (replacementNode == nullptr)
    {
        // No replacement value was produced for this helper kind: turn the call's use into a NOP.
        (*callUse)->gtBashToNOP();
    }
    else
    {
        *callUse = replacementNode;
    }

    fgMorphStmtBlockOps(block, stmt);
    gtUpdateStmtSideEffects(stmt);

    // Final block layout looks like this:
    //
    // prevBb(BBJ_ALWAYS):                  [weight: 1.0]
    //     ...
    //
    // isInitedBb(BBJ_COND):                [weight: 1.0]
    //     if (isInited)
    //         goto block;
    //
    // helperCallBb(BBJ_ALWAYS):            [weight: 0.0]
    //     helperCall();
    //
    // block(...):                          [weight: 1.0]
    //     use(staticBase);
    //
    // Whether we use helperCall's value or not depends on the helper itself.

    //
    // Update preds in all new blocks
    //

    // Redirect prevBb from block to isInitedBb
    fgRedirectEdge(prevBb->TargetEdgeRef(), isInitedBb);
    assert(prevBb->JumpsToNext());

    {
        // Block has two preds now: either isInitedBb or helperCallBb
        FlowEdge* const newEdge = fgAddRefPred(block, helperCallBb);
        helperCallBb->SetTargetEdge(newEdge);
        assert(helperCallBb->JumpsToNext());
    }

    {
        // isInitedBb branches either to block (true edge, expected to always be taken)
        // or to helperCallBb (false edge)
        FlowEdge* const trueEdge  = fgAddRefPred(block, isInitedBb);
        FlowEdge* const falseEdge = fgAddRefPred(helperCallBb, isInitedBb);
        isInitedBb->SetTrueEdge(trueEdge);
        isInitedBb->SetFalseEdge(falseEdge);
        trueEdge->setLikelihood(1.0);
        falseEdge->setLikelihood(0.0);
    }

    //
    // Re-distribute weights
    //

    block->inheritWeight(prevBb);
    isInitedBb->inheritWeight(prevBb);
    helperCallBb->inheritWeightPercentage(isInitedBb, 0);

    //
    // Update flags in all new blocks
    //

    InheritFlags(block, prevBb);
    InheritFlags(isInitedBb, prevBb);
    InheritFlags(helperCallBb, prevBb);

    // All blocks are expected to be in the same EH region
    assert(BasicBlock::sameEHRegion(prevBb, block));
    assert(BasicBlock::sameEHRegion(prevBb, isInitedBb));

    // Extra step: merge prevBb with isInitedBb if possible
    if (fgCanCompactBlock(prevBb))
    {
        fgCompactBlock(prevBb);
    }

    // Clear gtInitClsHnd as a mark that we've already visited this call
    call->gtInitClsHnd = NO_CLASS_HANDLE;

    // The blocks and statements have been modified enough to make the ending offset invalid.
    block->bbCodeOffsEnd = BAD_IL_OFFSET;

    return true;
}

//------------------------------------------------------------------------------
// fgVNBasedIntrinsicExpansion: Phase entry point that expands calls marked as
//    special intrinsics, relying on value numbers to decide what is expandable.
//
// Returns:
//    PhaseStatus indicating what, if anything, was changed.
//
// Notes:
//    Nothing is expanded when the method has no special intrinsics, when
//    optimizations are disabled, or when the JIT is asked to optimize for size.
//
PhaseStatus Compiler::fgVNBasedIntrinsicExpansion()
{
    if (doesMethodHaveSpecialIntrinsics() && !opts.OptimizationDisabled())
    {
        // TODO: Replace with opts.compCodeOpt once it's fixed
        if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_SIZE_OPT))
        {
            // The expansion makes the generated code bigger, so honor the size preference
            JITDUMP("Optimized for size - bail out.\n")
            return PhaseStatus::MODIFIED_NOTHING;
        }

        const bool skipForRarelyRunBlocks = true;
        return fgExpandHelper<&Compiler::fgVNBasedIntrinsicExpansionForCall>(skipForRarelyRunBlocks);
    }

    // Either there is nothing to expand or we are not optimizing
    return PhaseStatus::MODIFIED_NOTHING;
}

//------------------------------------------------------------------------------
// fgVNBasedIntrinsicExpansionForCall : Per-call worker of fgVNBasedIntrinsicExpansion,
//    dispatches a special intrinsic call to its dedicated expansion routine.
//
// Arguments:
//    pBlock - [in/out] Block containing the intrinsic call to expand, forwarded
//             to the intrinsic-specific expansion routine
//    stmt   - Statement containing the call
//    call   - The intrinsic call
//
// Returns:
//    True if expanded, false otherwise.
//
bool Compiler::fgVNBasedIntrinsicExpansionForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    if (call->IsSpecialIntrinsic())
    {
        switch (lookupNamedIntrinsic(call->gtCallMethHnd))
        {
            case NI_System_Text_UTF8Encoding_UTF8EncodingSealed_ReadUtf8:
                return fgVNBasedIntrinsicExpansionForCall_ReadUtf8(pBlock, stmt, call);

            default:
                // TODO: Expand IsKnownConstant here
                // Also, move various unrollings here
                break;
        }
    }

    // Not a special intrinsic, or one we don't know how to expand
    return false;
}

//------------------------------------------------------------------------------
// fgVNBasedIntrinsicExpansionForCall_ReadUtf8 : Expand NI_System_Text_UTF8Encoding_UTF8EncodingSealed_ReadUtf8
//    when src data is a string literal (UTF16) that can be converted to UTF8, e.g.:
//
//      string str = "Hello, world!";
//      int bytesWritten = ReadUtf8(ref str[0], str.Length, buffer, buffer.Length);
//
//    becomes:
//
//      bytesWritten = -1; // default value, kept when the destination is too small
//      if (buffer.Length >= 13)
//      {
//          memcpy(buffer, "Hello, world!"u8, 13); // note the u8 suffix
//          bytesWritten = 13;
//      }
//
//    The copy is emitted as a series of constant stores rather than a real memcpy call.
//
// Arguments:
//    pBlock - [in/out] Block containing the intrinsic call to expand; on successful
//             expansion it is updated to the bottom block produced by the split
//             (the one that uses the result)
//    stmt   - Statement containing the call
//    call   - The intrinsic call
//
// Returns:
//    True if expanded, false otherwise.
//
bool Compiler::fgVNBasedIntrinsicExpansionForCall_ReadUtf8(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    BasicBlock* block = *pBlock;

    // User args are, in order: srcPtr (0), srcLen (1), dstPtr (2), dstLen (3)
    assert(call->gtArgs.CountUserArgs() == 4);

    GenTree* srcPtr = call->gtArgs.GetUserArgByIndex(0)->GetNode();

    // We're interested in a case when srcPtr is a string literal and srcLen is a constant
    // srcLen doesn't have to match the srcPtr's length, but it should not exceed it.
    ssize_t               strObjOffset = 0;
    CORINFO_OBJECT_HANDLE strObj       = nullptr;
    if (!GetObjectHandleAndOffset(srcPtr, &strObjOffset, &strObj) || ((size_t)strObjOffset > INT_MAX))
    {
        // We might want to support more cases here, e.g. ROS<char> RVA data.
        // Also, we check that strObjOffset (which is in most cases is expected to be just
        // OFFSETOF__CORINFO_String__chars) doesn't exceed INT_MAX since we'll need to cast
        // it to int for getObjectContent API. The (size_t) cast makes negative offsets
        // fail the same check.
        JITDUMP("ReadUtf8: srcPtr is not an object handle\n")
        return false;
    }

    assert(strObj != nullptr);

    // We mostly expect string literal objects here, but let's be more agile just in case
    if (!info.compCompHnd->isObjectImmutable(strObj))
    {
        JITDUMP("ReadUtf8: srcPtr is not immutable (not a frozen string object?)\n")
        return false;
    }

    // The source length must be a known int32 constant (same liberal and conservative VN)
    const GenTree* srcLen = call->gtArgs.GetUserArgByIndex(1)->GetNode();
    if (!srcLen->gtVNPair.BothEqual() || !vnStore->IsVNInt32Constant(srcLen->gtVNPair.GetLiberal()))
    {
        JITDUMP("ReadUtf8: srcLen is not constant\n")
        return false;
    }

    // Source UTF16 (U16) string length in characters.
    // A negative constant becomes a huge unsigned value and is rejected by the range check below.
    const unsigned srcLenCnsU16            = (unsigned)vnStore->GetConstantInt32(srcLen->gtVNPair.GetLiberal());
    const int      MaxU16BufferSizeInChars = 256;
    if ((srcLenCnsU16 == 0) || (srcLenCnsU16 > MaxU16BufferSizeInChars))
    {
        // TODO: handle srcLenCnsU16 == 0 if it's a common case
        JITDUMP("ReadUtf8: srcLenCns is 0 or > MaxPossibleUnrollThreshold\n")
        return false;
    }

    // Stack buffer receiving the raw UTF16 content of the string object
    uint16_t bufferU16[MaxU16BufferSizeInChars];

    // getObjectContent is expected to validate the offset and length
    // NOTE: (int) casts should not overflow:
    //  * srcLenCnsU16 is <= MaxU16BufferSizeInChars
    //  * strObjOffset is already checked to be <= INT_MAX
    if (!info.compCompHnd->getObjectContent(strObj, (uint8_t*)bufferU16, (int)(srcLenCnsU16 * sizeof(uint16_t)),
                                            (int)strObjOffset))
    {
        JITDUMP("ReadUtf8: getObjectContent returned false.\n")
        return false;
    }

    // Stack buffer receiving the UTF8 form of the data; it's the source of the constant stores below
    const int MaxU8BufferSizeInBytes = 256;
    uint8_t   bufferU8[MaxU8BufferSizeInBytes];

    const int srcLenU8 = (int)minipal_convert_utf16_to_utf8((const CHAR16_T*)bufferU16, srcLenCnsU16, (char*)bufferU8,
                                                            MaxU8BufferSizeInBytes, 0);
    if (srcLenU8 <= 0)
    {
        // E.g. output buffer is too small
        JITDUMP("ReadUtf8: minipal_convert_utf16_to_utf8 returned <= 0\n")
        return false;
    }

    // The API is expected to return [1..MaxU8BufferSizeInBytes] real length of the UTF-8 value
    // stored in bufferU8
    assert((unsigned)srcLenU8 <= MaxU8BufferSizeInBytes);

    // Now that we know the exact UTF8 buffer length we can check if it's unrollable
    if (srcLenU8 > (int)getUnrollThreshold(UnrollKind::Memcpy))
    {
        JITDUMP("ReadUtf8: srcLenU8 is out of unrollable range\n")
        return false;
    }

    // All bail-out checks passed; from here on we modify the flow graph.
    DebugInfo debugInfo = stmt->GetDebugInfo();

    // Split the block at the call: prevBb receives the top block, 'block' becomes the bottom block
    // and the call's value is replaced with the resultLclNum local.
    BasicBlock*    prevBb;
    const unsigned resultLclNum = SplitAtTreeAndReplaceItWithLocal(this, block, stmt, call, &prevBb, &block);

    // Report the bottom block back to the caller
    *pBlock = block;

    // If we suddenly need to use these arguments, we'll have to reload them from the call
    // after the split, so let's null them to prevent accidental use.
    srcLen = nullptr;
    srcPtr = nullptr;

    // We don't need this flag anymore.
    call->gtCallMoreFlags &= ~GTF_CALL_M_SPECIAL_INTRINSIC;

    // srcLenU8 is the length of the converted UTF8 data in bytes (not the UTF16 length in chars);
    // we're going to use the same value as the "bytesWritten" result in the fast path and in the length check.
    GenTree* srcLenU8Node = gtNewIconNode(srcLenU8);
    fgValueNumberTreeConst(srcLenU8Node);

    // We're going to insert the following blocks:
    //
    //  prevBb:
    //
    //  lengthCheckBb:
    //      bytesWritten = -1;
    //      if (dstLen < srcLenU8)
    //          goto block;
    //
    //  fastpathBb:
    //      <unrolled block copy>
    //      bytesWritten = srcLenU8;
    //
    //  block:
    //      use(bytesWritten)
    //

    //
    // Block 1: lengthCheckBb (we check that dstLen < srcLenU8)
    //
    BasicBlock* const lengthCheckBb = fgNewBBafter(BBJ_COND, prevBb, true);
    lengthCheckBb->SetFlags(BBF_INTERNAL);

    // Set bytesWritten -1 by default, if the fast path is not taken we'll return it as the result.
    GenTree* bytesWrittenDefaultVal = gtNewStoreLclVarNode(resultLclNum, gtNewIconNode(-1));
    fgInsertStmtAtEnd(lengthCheckBb, fgNewStmtFromTree(bytesWrittenDefaultVal, debugInfo));

    // JTRUE(dstLen < srcLenU8): taken when the destination is too small for the UTF8 data
    GenTree* dstLen      = call->gtArgs.GetUserArgByIndex(3)->GetNode();
    GenTree* lengthCheck = gtNewOperNode(GT_LT, TYP_INT, gtCloneExpr(dstLen), srcLenU8Node);
    lengthCheck->gtFlags |= GTF_RELOP_JMP_USED;
    Statement* lengthCheckStmt = fgNewStmtFromTree(gtNewOperNode(GT_JTRUE, TYP_VOID, lengthCheck), debugInfo);
    fgInsertStmtAtEnd(lengthCheckBb, lengthCheckStmt);
    lengthCheckBb->bbCodeOffs    = block->bbCodeOffsEnd;
    lengthCheckBb->bbCodeOffsEnd = block->bbCodeOffsEnd;

    //
    // Block 2: fastpathBb - unrolled loop that copies the UTF8 const data to the destination
    //
    // We're going to emit a series of loads and stores to copy the data.
    // In theory, we could just emit the const U8 data to the data section and use GT_BLK here
    // but that would be a bit less efficient since we would have to load the data from memory.
    //
    BasicBlock* fastpathBb = fgNewBBafter(BBJ_ALWAYS, lengthCheckBb, true);
    fastpathBb->SetFlags(BBF_INTERNAL);

    // The widest type we can use for loads
    const var_types maxLoadType = roundDownMaxType(srcLenU8);
    assert(genTypeSize(maxLoadType) > 0);

    // How many iterations we need to copy UTF8 const data to the destination
    unsigned iterations = srcLenU8 / genTypeSize(maxLoadType);

    // Add one more iteration if we have a remainder
    iterations += (srcLenU8 % genTypeSize(maxLoadType) == 0) ? 0 : 1;

    GenTree* dstPtr = call->gtArgs.GetUserArgByIndex(2)->GetNode();
    for (unsigned i = 0; i < iterations; i++)
    {
        ssize_t offset = (ssize_t)i * genTypeSize(maxLoadType);

        // Last iteration: overlap with previous load if needed, so that the store ends
        // exactly at srcLenU8 and never goes past the UTF8 data.
        if (i == iterations - 1)
        {
            offset = (ssize_t)srcLenU8 - genTypeSize(maxLoadType);
        }

        // We're going to emit the following tree (in case of SIMD16 load):
        //
        // -A-XG------         *  STOREIND  simd16 (copy)
        // -------N---         +--*  ADD       byref
        // -----------         |  +--*  LCL_VAR   byref
        // -----------         |  \--*  CNS_INT   int
        // -----------         \--*  CNS_VEC   simd16

        GenTreeIntCon* offsetNode = gtNewIconNode(offset, TYP_I_IMPL);
        fgValueNumberTreeConst(offsetNode);

        // Grab a chunk from bufferU8 for the given offset and width
        GenTree* utf8cnsChunkNode = gtNewGenericCon(maxLoadType, bufferU8 + offset);
        fgValueNumberTreeConst(utf8cnsChunkNode);

        GenTree*   dstAddOffsetNode = gtNewOperNode(GT_ADD, dstPtr->TypeGet(), gtCloneExpr(dstPtr), offsetNode);
        GenTreeOp* storeInd         = gtNewStoreIndNode(maxLoadType, dstAddOffsetNode, utf8cnsChunkNode);
        fgInsertStmtAtEnd(fastpathBb, fgNewStmtFromTree(storeInd, debugInfo));
    }

    // Finally, store the number of bytes written to the resultLcl local
    Statement* finalStmt = fgNewStmtFromTree(gtNewStoreLclVarNode(resultLclNum, gtCloneExpr(srcLenU8Node)), debugInfo);
    fgInsertStmtAtEnd(fastpathBb, finalStmt);
    fastpathBb->bbCodeOffs    = block->bbCodeOffsEnd;
    fastpathBb->bbCodeOffsEnd = block->bbCodeOffsEnd;

    //
    // Update preds in all new blocks
    //
    // Redirect prevBb to lengthCheckBb
    fgRedirectEdge(prevBb->TargetEdgeRef(), lengthCheckBb);
    lengthCheckBb->inheritWeight(prevBb);
    assert(prevBb->JumpsToNext());

    {
        // lengthCheckBb has two successors: block and fastpathBb
        // (true edge: destination too small, skip the copy; false edge: take the fast path)
        FlowEdge* const trueEdge  = fgAddRefPred(block, lengthCheckBb);
        FlowEdge* const falseEdge = fgAddRefPred(fastpathBb, lengthCheckBb);
        lengthCheckBb->SetTrueEdge(trueEdge);
        lengthCheckBb->SetFalseEdge(falseEdge);

        // review: we assume length check always succeeds??
        // NOTE(review): with these values the "dstLen < srcLenU8" edge is treated as always taken
        // and fastpathBb gets zero likelihood (and zero profile weight below) - confirm this is intended.
        trueEdge->setLikelihood(1.0);
        falseEdge->setLikelihood(0.0);

        if (lengthCheckBb->hasProfileWeight())
        {
            fastpathBb->setBBProfileWeight(falseEdge->getLikelyWeight());
        }
    }

    {
        // fastpathBb flows into block
        FlowEdge* const newEdge = fgAddRefPred(block, fastpathBb);
        fastpathBb->SetTargetEdge(newEdge);
        assert(fastpathBb->JumpsToNext());
    }

    //
    // Ensure all flow out of prevBb converges into block
    //
    block->inheritWeight(prevBb);

    // All blocks are expected to be in the same EH region
    assert(BasicBlock::sameEHRegion(prevBb, block));
    assert(BasicBlock::sameEHRegion(prevBb, lengthCheckBb));
    assert(BasicBlock::sameEHRegion(prevBb, fastpathBb));

    // Extra step: merge prevBb with lengthCheckBb if possible
    if (fgCanCompactBlock(prevBb))
    {
        fgCompactBlock(prevBb);
    }

    JITDUMP("ReadUtf8: succesfully expanded!\n")
    return true;
}

//------------------------------------------------------------------------------
// fgLateCastExpansion: Phase entry point that partially inlines cast helpers.
//    For example:
//
//    tmp = CORINFO_HELP_ISINSTANCEOFINTERFACE(clsHandle, obj);
//
// is turned into:
//
//    tmp = obj;
//    if ((obj != null) && (obj->pMT != likelyClassHandle))
//    {
//        tmp = CORINFO_HELP_ISINSTANCEOFINTERFACE(clsHandle, obj);
//    }
//
// The goal is to move cast expansion logic from the importer to this phase, for now,
// this phase only supports "isinst" and for profiled casts only.
//
// Returns:
//    PhaseStatus indicating what, if anything, was changed.
//
PhaseStatus Compiler::fgLateCastExpansion()
{
    const bool hasWork = doesMethodHaveExpandableCasts() && !opts.OptimizationDisabled();
    if (hasWork)
    {
        // TODO-InlineCast: should we still inline some trivial cases even in cold blocks?
        return fgExpandHelper<&Compiler::fgLateCastExpansionForCall>(/* skipForRarelyRunBlocks */ true);
    }

    // Nothing to expand in the current method
    return PhaseStatus::MODIFIED_NOTHING;
}

// What the expanded cast does on the fallback path, i.e. when the fast type checks
// chosen by PickCandidatesForTypeCheck did not match.
enum class TypeCheckFailedAction
{
    // No decision was made (the value PickCandidatesForTypeCheck starts with)
    Unknown,
    // The fallback simply produces null (used for isinst against exact classes)
    ReturnNull,
    // The fallback invokes the original cast helper
    CallHelper,
    // The fallback invokes a slightly faster helper which assumes that null and
    // the "cast to" class itself were already checked
    CallHelper_Specialized,
    // The fallback invokes the helper only to throw InvalidCastException
    CallHelper_AlwaysThrows
};

// What the expanded cast does when a fast type check chosen by
// PickCandidatesForTypeCheck matches.
enum class TypeCheckPassedAction
{
    // No decision was made (the value PickCandidatesForTypeCheck starts with)
    Unknown,
    // The cast succeeds and yields the object itself (the common case)
    ReturnObj,
    // The matched candidate is known to never satisfy the cast, so isinst yields null
    ReturnNull,
    // The cast is guaranteed to fail; the type check can be skipped entirely
    // and the helper is called just to throw
    CallHelper_AlwaysThrows,
};

//------------------------------------------------------------------------------
// PickCandidatesForTypeCheck: picks classes to use as fast type checks against
//    the object being casted. The function also defines the strategy to follow
//    if the type checks fail or pass.
//
// Arguments:
//    comp               - Compiler instance
//    castHelper         - Cast helper call to expand
//    candidates         - [out] Classes (guesses) to use in the fast path (up to MAX_GDV_TYPE_CHECKS).
//                         A NO_CLASS_HANDLE candidate means "the cast-to class is not a known constant".
//    commonCls          - [out] Common denominator class for the fast and the fallback paths,
//                         NO_CLASS_HANDLE if it is not known.
//    likelihoods        - [out] Likelihoods of successful type checks [0..100]
//    typeCheckFailed    - [out] Action to perform if the type check fails
//    typeCheckPassed    - [out] Action to perform if the type check passes
//
// Returns:
//    Number of candidates written to 'candidates' and 'likelihoods'.
//    0 means there is nothing to type-check against; in that case the call is only worth
//    expanding if typeCheckPassed is CallHelper_AlwaysThrows (the cast is known to fail).
//    Whenever a non-zero value is returned, both typeCheckFailed and typeCheckPassed are
//    set to something other than Unknown.
//
static int PickCandidatesForTypeCheck(Compiler*              comp,
                                      GenTreeCall*           castHelper,
                                      CORINFO_CLASS_HANDLE*  candidates,
                                      CORINFO_CLASS_HANDLE*  commonCls,
                                      unsigned*              likelihoods,
                                      TypeCheckFailedAction* typeCheckFailed,
                                      TypeCheckPassedAction* typeCheckPassed)
{
    // Give every "strategy" out-parameter a well-defined value, even on the early-out paths.
    *typeCheckFailed = TypeCheckFailedAction::Unknown;
    *typeCheckPassed = TypeCheckPassedAction::Unknown;
    *commonCls       = NO_CLASS_HANDLE;

    if (!castHelper->IsHelperCall() || ((castHelper->gtCallMoreFlags & GTF_CALL_M_CAST_CAN_BE_EXPANDED) == 0))
    {
        // It's not eligible for expansion (already expanded in importer)
        // To be removed once we move cast expansion here completely.
        return 0;
    }

    // Helper calls are never tail calls
    assert(!castHelper->IsTailCall());

    // is it "castclass" or "isinst"?
    bool isCastClass;

    const unsigned helper = castHelper->GetHelperNum();
    switch (helper)
    {
        case CORINFO_HELP_CHKCASTARRAY:
        case CORINFO_HELP_CHKCASTANY:
        case CORINFO_HELP_CHKCASTINTERFACE:
        case CORINFO_HELP_CHKCASTCLASS:
            isCastClass = true;
            break;

        case CORINFO_HELP_ISINSTANCEOFARRAY:
        case CORINFO_HELP_ISINSTANCEOFCLASS:
        case CORINFO_HELP_ISINSTANCEOFANY:
        case CORINFO_HELP_ISINSTANCEOFINTERFACE:
            isCastClass = false;
            break;

            // These are never expanded:
            // CORINFO_HELP_ISINSTANCEOF_EXCEPTION
            // CORINFO_HELP_CHKCASTCLASS_SPECIAL
            // CORINFO_HELP_READYTORUN_ISINSTANCEOF,
            // CORINFO_HELP_READYTORUN_CHKCAST,

            // Other helper calls are not cast helpers

        default:
            return 0;
    }

    // First, let's grab the expected class we're casting to/checking instance of:
    // E.g. "call CORINFO_HELP_ISINSTANCEOFCLASS(castToCls, obj)"
    GenTree*             clsArg    = castHelper->gtArgs.GetUserArgByIndex(0)->GetNode();
    GenTree*             objArg    = castHelper->gtArgs.GetUserArgByIndex(1)->GetNode();
    CORINFO_CLASS_HANDLE castToCls = comp->gtGetHelperArgClassHandle(clsArg);
    if (castToCls == NO_CLASS_HANDLE)
    {
        // If we don't see the constant class handle, we still can speculatively expand it
        // for castclass case (we'll just take the unknown tree as a type check tree)
        switch (helper)
        {
            case CORINFO_HELP_CHKCASTCLASS:
            case CORINFO_HELP_CHKCASTARRAY:
            case CORINFO_HELP_CHKCASTANY:
                likelihoods[0] = 50; // 50% speculative guess
                candidates[0]  = NO_CLASS_HANDLE;

                // We return before the default actions are assigned further down, so set them
                // here: a candidate must never be reported together with Unknown actions.
                // The fallback calls the helper, a successful check yields the object itself.
                *typeCheckFailed = TypeCheckFailedAction::CallHelper;
                *typeCheckPassed = TypeCheckPassedAction::ReturnObj;
                return 1;

            default:
                // Otherwise, bail out. We don't expect the constant handles to be CSE'd as they normally
                // have GTF_DONT_CSE flag set on them for cast helpers.
                // TODO-InlineCast: One missing case to handle is isinst against Class<_Canon>
                return 0;
        }
    }

    if ((objArg->gtFlags & GTF_ALL_EFFECT) != 0 && comp->lvaHaveManyLocals())
    {
        // TODO: Revise this:
        //  * Some casts are profitable even when ran out of tracked locals
        //  * We might want to use a shared local in all casts (similar to what we do boxing)
        JITDUMP("lvaHaveManyLocals() is true and objArg has side effects - bail out.")
        return 0;
    }

    // Assume that in the slow path (fallback) we'll always invoke the helper.
    // In some cases we can optimize this further e.g. either mark it additionally
    // as no-return (BBJ_THROW) or simply return null.
    *typeCheckFailed = TypeCheckFailedAction::CallHelper;

    // In most cases, a successful type check will return the object itself
    // there is just one exception - see below in PGO handling.
    *typeCheckPassed = TypeCheckPassedAction::ReturnObj;

    // A common denominator class for the fast and the fallback paths
    // can be used as a class for LCL_VAR storing the result of the expansion.
    // in most cases it's just the generic "cast to" class.
    *commonCls = castToCls;

    const unsigned isAbstractFlags = CORINFO_FLG_INTERFACE | CORINFO_FLG_ABSTRACT;

    // See what we already know about the type of the object being cast.
    bool                 fromClassIsExact   = false;
    bool                 fromClassIsNonNull = false;
    CORINFO_CLASS_HANDLE fromClass          = comp->gtGetClassHandle(objArg, &fromClassIsExact, &fromClassIsNonNull);
    if ((fromClass != NO_CLASS_HANDLE) && fromClassIsExact)
    {
        if (fromClassIsNonNull)
        {
            // An additional hint for the expansion that the object is not null
            castHelper->gtCallMoreFlags |= GTF_CALL_M_CAST_OBJ_NONNULL;
        }

        const TypeCompareState castResult = comp->info.compCompHnd->compareTypesForCast(fromClass, castToCls);
        if (isCastClass && (castResult == TypeCompareState::MustNot))
        {
            // The cast is guaranteed to fail, the expansion logic can skip the type check entirely
            *typeCheckPassed = TypeCheckPassedAction::CallHelper_AlwaysThrows;
            return 0;
        }

        // TODO-InlineCast:
        // isinst and MustNot         -> just return null
        // isinst/castclass and Must  -> just return obj
    }

    //
    // Now we need to figure out what classes to use for the fast path, we have 4 options:
    //  1) If "cast to" class is already exact we can go ahead and make some decisions
    //  2) If VM can tell us the exact class for this class/interface via getExactClasses - use it
    //     e.g. NativeAOT can promise us that for e.g. "foo is IMyInterface" foo can only ever be
    //     MyImpl and no other implementation of IMyInterface can be loaded dynamically.
    //  3) If we have PGO data and there is a dominating candidate - use it.
    //  4) Try to speculate and make optimistic guesses
    //
    // NOTE: only 2) supports multiple candidates at the moment.
    //

    /////////////////////////////////////////////////////////////////////////////////////////////////////
    // 1) If "cast to" class is already exact we can go ahead and make some decisions
    /////////////////////////////////////////////////////////////////////////////////////////////////////

    const bool isCastToExact = comp->info.compCompHnd->isExactType(castToCls);
    if (isCastToExact && ((helper == CORINFO_HELP_CHKCASTCLASS) || (helper == CORINFO_HELP_CHKCASTARRAY)))
    {
        // (string)obj
        // (string[])obj
        //
        if (helper == CORINFO_HELP_CHKCASTCLASS)
        {
            // (string)obj
            //
            // Fallback for this expansion always throws InvalidCastException
            // TODO: can we do the same for string[]? (importer did not)
            *typeCheckFailed = TypeCheckFailedAction::CallHelper_AlwaysThrows;

            // Assume that exceptions are rare
            likelihoods[0] = 100;

            // Update the common denominator class to be more exact as
            // the fallback always throws anyway, so we don't have to worry about what it returns.
            *commonCls = castToCls;
        }
        else
        {
            // 50% chance of successful type check (speculative guess)
            likelihoods[0] = 50;
        }
        candidates[0] = castToCls;
        return 1;
    }
    if (isCastToExact && ((helper == CORINFO_HELP_ISINSTANCEOFARRAY) || (helper == CORINFO_HELP_ISINSTANCEOFCLASS)))
    {
        // obj is string
        // obj is string[]
        //
        // Fallbacks for these expansions simply return null
        // TODO: should we keep the helper call for ISINSTANCEOFARRAY like we do for CHKCASTARRAY above?
        // The logic is copied from the importer.
        *typeCheckFailed = TypeCheckFailedAction::ReturnNull;

        // We're done, there is no need in consulting with PGO data
        candidates[0] = castToCls;
        // 50% chance of successful type check (speculative guess)
        likelihoods[0] = 50;
        return 1;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////
    // 2) If VM can tell us the exact class for this "cast to" class - use it.
    //    Just make sure the class is truly exact.
    /////////////////////////////////////////////////////////////////////////////////////////////////////

    // Let's re-use GDV's threshold on how many guesses we can make (can be 3 by default).
    const int maxTypeChecks = min(comp->getGDVMaxTypeChecks(), MAX_GDV_TYPE_CHECKS);

    CORINFO_CLASS_HANDLE exactClasses[MAX_GDV_TYPE_CHECKS] = {};
    const int numExactClasses = comp->info.compCompHnd->getExactClasses(castToCls, maxTypeChecks, exactClasses);
    bool      allTrulyExact   = true;
    for (int i = 0; i < numExactClasses; i++)
    {
        // TODO-InlineCast: we shouldn't need isExactType check, otherwise we skip cases like this:
        //
        // if (obj is IMyInterface) { ... }
        //
        // class MyClass : IMyInterface
        // class MySubClass : MyClass
        //
        // (and both classes are the only ones implementing the interface)
        //
        if (!comp->info.compCompHnd->isExactType(exactClasses[i]))
        {
            allTrulyExact = false;
            break;
        }
    }

    if ((numExactClasses > 0) && allTrulyExact)
    {
        // We know all exact classes, but we don't know the likelihoods
        // TODO-InlineCast: check if we also have PGO data around (a rare case for getExactClasses,
        // since it's NativeAOT-only at the moment, and we rarely have static profile data for it).
        for (int i = 0; i < numExactClasses; i++)
        {
            likelihoods[i] = 100 / (numExactClasses + 1);
            // E.g. 3 classes - 25%,25%,25% and 25% for the fallback
            // A small integer rounding error is not a big deal here.
        }

        memcpy(candidates, exactClasses, numExactClasses * sizeof(CORINFO_CLASS_HANDLE));
        if ((helper == CORINFO_HELP_CHKCASTINTERFACE) || (helper == CORINFO_HELP_CHKCASTCLASS))
        {
            // Fallback call is only needed for castclass and only to throw InvalidCastException
            *typeCheckFailed = TypeCheckFailedAction::CallHelper_AlwaysThrows;

            // Since the fallback is cold, recalculate the likelihoods
            for (int i = 0; i < numExactClasses; i++)
            {
                likelihoods[i] = 100 / numExactClasses;
                // E.g. 3 classes - 33%,33%,33%
            }

            if (numExactClasses == 1)
            {
                // Update the common denominator class to be more exact
                // since the fallback always throws, and we have just one possible candidate.
                *commonCls = exactClasses[0];
            }
        }
        else if ((helper == CORINFO_HELP_ISINSTANCEOFINTERFACE) || (helper == CORINFO_HELP_ISINSTANCEOFCLASS))
        {
            // Fallback for isinst simply returns null here
            *typeCheckFailed = TypeCheckFailedAction::ReturnNull;

            if (numExactClasses == 1)
            {
                // Update the common denominator class to be more exact
                // since the fallback is just null, and we have just one possible candidate.
                *commonCls = exactClasses[0];
            }
        }
        return numExactClasses;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////
    // 3) Consult with PGO data
    /////////////////////////////////////////////////////////////////////////////////////////////////////

    CORINFO_CLASS_HANDLE likelyClasses[MAX_GDV_TYPE_CHECKS]     = {};
    unsigned             likelyLikelihoods[MAX_GDV_TYPE_CHECKS] = {};
    int                  likelyClassCount                       = 0;
    comp->pickGDV(castHelper, castHelper->gtCastHelperILOffset, false, likelyClasses, nullptr, &likelyClassCount,
                  likelyLikelihoods);

    if (likelyClassCount > 0)
    {
        // TODO-InlineCast: support multiple candidates, for now always pick the first one
        // It's already supported, but it requires some careful performance testing to enable.
        //
        likelihoods[0] = likelyLikelihoods[0];
        candidates[0]  = likelyClasses[0];

        // Protect from stale static PGO data if a candidate becomes abstract/interface
        if ((likelyClasses[0] == NO_CLASS_HANDLE) ||
            (comp->info.compCompHnd->getClassAttribs(likelyClasses[0]) & isAbstractFlags) != 0)
        {
            return 0;
        }

        // We don't fully trust PGO data, so let's check it via compareTypesForCast
        const TypeCompareState castResult = comp->info.compCompHnd->compareTypesForCast(candidates[0], castToCls);
        if (castResult == TypeCompareState::May)
        {
            // TODO-InlineCast: do we need to check for May here? Conservatively assume that we do.
            JITDUMP("compareTypesForCast returned May for this candidate\n");
            return 0;
        }
        if (castResult == TypeCompareState::Must)
        {
            // return actual object on successful type check
            *typeCheckPassed = TypeCheckPassedAction::ReturnObj;
            return 1;
        }
        if (castResult == TypeCompareState::MustNot)
        {
            // Our likely candidate never passes the type check (may happen with PGO-driven expansion),
            if (!isCastClass)
            {
                // return null on successful type check
                *typeCheckPassed = TypeCheckPassedAction::ReturnNull;
                return 1;
            }
            // give up on castclass - it's going to throw InvalidCastException anyway
            return 0;
        }
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////
    // 4) Last chance: speculative guesses
    /////////////////////////////////////////////////////////////////////////////////////////////////////

    switch (helper)
    {
        case CORINFO_HELP_CHKCASTINTERFACE:
            // Nothing to speculate here, e.g. (IDisposable)obj
            return 0;

        case CORINFO_HELP_CHKCASTARRAY:
        case CORINFO_HELP_CHKCASTCLASS:
        case CORINFO_HELP_CHKCASTANY:
            // These casts against exact classes are already handled above, so it's not exact here.
            //
            // let's use castToCls as a guess, we might regress some cases, but at least we know that unrelated
            // types are going to throw InvalidCastException, so we can assume the overhead happens rarely.
            //
            if ((comp->info.compCompHnd->getClassAttribs(castToCls) & isAbstractFlags) != 0)
            {
                // The guess is abstract - it will never pass the type check
                return 0;
            }
            candidates[0] = castToCls;
            // 50% chance of successful type check (speculative guess)
            likelihoods[0] = 50;
            //
            // A small optimization - use a slightly faster fallback which assumes that we've already checked
            // for null and for castToCls itself, so it won't do it again.
            //
            if (helper == CORINFO_HELP_CHKCASTCLASS)
            {
                *typeCheckFailed = TypeCheckFailedAction::CallHelper_Specialized;
            }
            return 1;

        case CORINFO_HELP_ISINSTANCEOFINTERFACE:
            // Nothing to speculate here, e.g. obj is IDisposable
            return 0;

        case CORINFO_HELP_ISINSTANCEOFARRAY:
            // ISINSTANCEOFARRAY against exact classes is already handled above, so it's not exact here.
            //
            //  obj is int[] - can we use int[] as a guess? No! It's an overhead if obj is uint[]
            //                 or any int-backed enum[]
            return 0;

        case CORINFO_HELP_ISINSTANCEOFCLASS:
            // ISINSTANCEOFCLASS against exact classes is already handled above, so it's not exact here.
            //
            //  obj is MyClass - can we use MyClass as a guess? No! It's an overhead for any other type except
            //                   MyClass and its subclasses - chances of hitting that overhead are too high.
            //
            return 0;

        case CORINFO_HELP_ISINSTANCEOFANY:
            // ditto + type variance, etc.
            return 0;

        default:
            unreached();
    }
}

//------------------------------------------------------------------------------
// fgLateCastExpansionForCall : Expand specific cast helper, see
//    fgLateCastExpansion's comments.
//
// Arguments:
//    pBlock - [in, out] On entry, the block containing the cast helper to expand.
//             When the helper is expanded, it is updated to the block that follows
//             the expansion (lastBb). It is left untouched when we bail out.
//    stmt   - Statement containing the cast helper
//    call   - The cast helper
//
// Returns:
//    True if expanded, false otherwise.
//
// Notes:
//    The block is split at the call, the call's value is replaced with a temp, and
//    a nullcheck block, zero or more type check blocks, a fallback block and
//    (when type checks exist) a "type check succeeded" block are inserted between
//    the two halves. See the diagram in the body for the exact shape.
//
bool Compiler::fgLateCastExpansionForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call)
{
    // Outputs of PickCandidatesForTypeCheck. Note that commonCls is currently only
    // referenced by the commented-out lvaSetClass call below.
    TypeCheckFailedAction typeCheckFailedAction;
    TypeCheckPassedAction typeCheckPassedAction;
    CORINFO_CLASS_HANDLE  commonCls;
    CORINFO_CLASS_HANDLE  expectedExactClasses[MAX_GDV_TYPE_CHECKS] = {};
    unsigned              likelihoods[MAX_GDV_TYPE_CHECKS]          = {};

    const int numOfCandidates = PickCandidatesForTypeCheck(this, call, expectedExactClasses, &commonCls, likelihoods,
                                                           &typeCheckFailedAction, &typeCheckPassedAction);

    // No candidates means there is nothing to speculate on. The only case we still
    // expand is a cast that is known to always throw (see the NOTE further below).
    if ((numOfCandidates == 0) && (typeCheckPassedAction != TypeCheckPassedAction::CallHelper_AlwaysThrows))
    {
        return false;
    }

    BasicBlock* block = *pBlock;
    JITDUMP("Expanding cast helper call in " FMT_BB "...\n", block->bbNum);
    DISPTREE(call);
    JITDUMP("\n");

    // Captured up front; reused for every statement/block created by the expansion.
    DebugInfo debugInfo = stmt->GetDebugInfo();

    BasicBlock*    firstBb;
    BasicBlock*    lastBb;
    const unsigned tmpNum = SplitAtTreeAndReplaceItWithLocal(this, block, stmt, call, &firstBb, &lastBb);

    // TODO-InlineCast: we can't set tmp's class because it's assigned to obj before we can make any assumptions
    // we need to slightly reshape the expansion to make it work. Although, it's possible that there is no value
    // in class in this late phase.
    // lvaSetClass(tmpNum, commonCls);

    GenTree* tmpNode = gtNewLclvNode(tmpNum, call->TypeGet());

    // Report the bottom half of the split back to the caller.
    *pBlock = lastBb;

    // We're going to expand this "isinst" like this:
    //
    // prevBb:                                      [weight: 1.0]
    //     ...
    //
    // nullcheckBb (BBJ_COND):                      [weight: 1.0]
    //     tmp = obj;
    //     if (tmp == null)
    //         goto lastBb;
    //
    // typeCheckBb (BBJ_COND):                      [weight: 0.5]
    //     if (tmp->pMT == likelyCls)
    //         goto typeCheckSucceedBb;
    //     goto (either next type check or fallbackBb)
    //
    // < there can be multiple typeCheckBbs >
    //
    // fallbackBb (BBJ_ALWAYS):                     [weight: <profile> or 0]
    //     tmp = helper_call(expectedCls, obj);
    //     goto lastBb;
    //     // NOTE: as an optimization we can omit the call and return null instead
    //     // or mark the call as no-return in certain cases.
    //
    //
    // typeCheckSucceedBb (BBJ_ALWAYS):             [weight: <profile>]
    //     no-op (or tmp = null; in case of 'MustNot')
    //
    // lastBb (BBJ_any):                            [weight: 1.0]
    //     use(tmp);
    //

    // NOTE: if the cast is known to always fail (TypeCheckPassedAction::CallHelper_AlwaysThrows)
    // we can omit the typeCheckBb and typeCheckSucceedBb and only have:
    //
    // if (obj == null) goto lastBb;
    // throw InvalidCastException;
    //
    // if obj is known to be non-null, then it will be just the throw block.

    // Block 1: nullcheckBb
    // TODO-InlineCast: assertionprop should leave us a mark that objArg is never null, so we can omit this check
    // it's too late to rely on upstream phases to do this for us (unless we do optRepeat).
    GenTree* nullcheckOp = gtNewOperNode(GT_EQ, TYP_INT, tmpNode, gtNewNull());
    nullcheckOp->gtFlags |= GTF_RELOP_JMP_USED;
    BasicBlock* nullcheckBb =
        fgNewBBFromTreeAfter(BBJ_COND, firstBb, gtNewOperNode(GT_JTRUE, TYP_VOID, nullcheckOp), debugInfo, true);

    // The very first statement in the whole expansion is to assign obj to tmp.
    // We assume it's the value we're going to return in most cases.
    // (User arg 1 of the helper is the object, user arg 0 is the class.)
    GenTree*   originalObj = gtCloneExpr(call->gtArgs.GetUserArgByIndex(1)->GetNode());
    Statement* assignTmp   = fgNewStmtAtBeg(nullcheckBb, gtNewTempStore(tmpNum, originalObj), debugInfo);
    gtSetStmtInfo(assignTmp);
    fgSetStmtSeq(assignTmp);

    // Block 2: typeCheckBb(s)
    // TODO-InlineCast: if likelyCls == expectedCls we can consider saving to a local to re-use.

    // Each type check block is placed right after the previous one; the first one
    // goes right after nullcheckBb.
    BasicBlock* typeChecksBbs[MAX_GDV_TYPE_CHECKS] = {};
    BasicBlock* lastTypeCheckBb                    = nullcheckBb;
    for (int candidateId = 0; candidateId < numOfCandidates; candidateId++)
    {
        const CORINFO_CLASS_HANDLE expectedCls = expectedExactClasses[candidateId];
        // if expectedCls is NO_CLASS_HANDLE, it means we should just use the original clsArg
        GenTree* expectedClsNode = expectedCls != NO_CLASS_HANDLE
                                       ? gtNewIconEmbClsHndNode(expectedCls)
                                       : gtCloneExpr(call->gtArgs.GetUserArgByIndex(0)->GetNode());

        // Manually CSE the expectedClsNode for first type check if it's the same as the original clsArg
        // TODO-InlineCast: consider not doing this if the helper call is cold
        GenTree* storeCseVal = nullptr;
        if (candidateId == 0)
        {
            // castArg is a reference into the call's arg list, so assigning to it
            // below rewrites the helper call's class argument to use the new temp.
            GenTree*& castArg = call->gtArgs.GetUserArgByIndex(0)->NodeRef();
            if (GenTree::Compare(castArg, expectedClsNode))
            {
                const unsigned clsTmp = lvaGrabTemp(true DEBUGARG("CSE for expectedClsNode"));
                storeCseVal           = gtNewTempStore(clsTmp, expectedClsNode);
                expectedClsNode       = gtNewLclvNode(clsTmp, TYP_I_IMPL);
                castArg               = gtNewLclvNode(clsTmp, TYP_I_IMPL);
            }
        }

        // if (tmp->pMT == expectedCls) ...
        GenTree* mtCheck = gtNewOperNode(GT_EQ, TYP_INT, gtNewMethodTableLookup(gtCloneExpr(tmpNode)), expectedClsNode);
        mtCheck->gtFlags |= GTF_RELOP_JMP_USED;
        GenTree* jtrue             = gtNewOperNode(GT_JTRUE, TYP_VOID, mtCheck);
        typeChecksBbs[candidateId] = fgNewBBFromTreeAfter(BBJ_COND, lastTypeCheckBb, jtrue, debugInfo, true);
        lastTypeCheckBb            = typeChecksBbs[candidateId];

        // Insert the CSE node as the first statement in the block
        // (storeCseVal can only be non-null for candidateId == 0, hence typeChecksBbs[0])
        if (storeCseVal != nullptr)
        {
            Statement* clsStmt = fgNewStmtAtBeg(typeChecksBbs[0], storeCseVal, debugInfo);
            gtSetStmtInfo(clsStmt);
            fgSetStmtSeq(clsStmt);
        }
    }

    // numOfCandidates being 0 means that we don't need any type checks
    // as we already know that the cast is going to fail.
    const bool typeCheckNotNeeded = numOfCandidates == 0;

    // Block 3: fallbackBb
    // Placed after the last type check block (or after nullcheckBb when there are none).
    BasicBlock* fallbackBb;
    if (typeCheckNotNeeded || (typeCheckFailedAction == TypeCheckFailedAction::CallHelper_AlwaysThrows))
    {
        // fallback call is used only to throw InvalidCastException
        setCallDoesNotReturn(call);
        fallbackBb = fgNewBBFromTreeAfter(BBJ_THROW, lastTypeCheckBb, call, debugInfo, true);
    }
    else if (typeCheckFailedAction == TypeCheckFailedAction::ReturnNull)
    {
        // if fallback call is not needed, we just assign null to tmp
        GenTree* fallbackTree = gtNewTempStore(tmpNum, gtNewNull());
        fallbackBb            = fgNewBBFromTreeAfter(BBJ_ALWAYS, lastTypeCheckBb, fallbackTree, debugInfo, true);
    }
    else
    {
        if (typeCheckFailedAction == TypeCheckFailedAction::CallHelper_Specialized)
        {
            // A slightly faster fallback which assumes that we've already checked
            // for null and for castToCls itself.
            call->gtCallMethHnd = eeFindHelper(CORINFO_HELP_CHKCASTCLASS_SPECIAL);
        }
        // tmp = helper_call(...)
        GenTree* fallbackTree = gtNewTempStore(tmpNum, call);
        fallbackBb            = fgNewBBFromTreeAfter(BBJ_ALWAYS, lastTypeCheckBb, fallbackTree, debugInfo, true);
    }

    // Block 4: typeCheckSucceedBb
    // Only the tree is built here; the block itself is created below, and only
    // when there is at least one type check.
    GenTree* typeCheckSucceedTree;
    if (typeCheckPassedAction == TypeCheckPassedAction::ReturnNull)
    {
        typeCheckSucceedTree = gtNewTempStore(tmpNum, gtNewNull());
    }
    else
    {
        // No-op because tmp was already assigned to obj
        typeCheckSucceedTree = gtNewNothingNode();
    }

    //
    // Wire up the blocks
    //

    // We assume obj is 50%/50% null/not-null (TODO-InlineCast: rely on PGO)
    // and rely on profile for the slow path.
    //
    // Alternatively we could profile nulls in the reservoir sample and
    // treat that as another "option".
    //
    // True out of the null check means obj is null.
    //
    const weight_t nullcheckTrueLikelihood  = 0.5;
    const weight_t nullcheckFalseLikelihood = 0.5;
    BasicBlock*    typeCheckSucceedBb;

    {
        // obj == null: skip everything and go straight to lastBb
        FlowEdge* const trueEdge = fgAddRefPred(lastBb, nullcheckBb);
        nullcheckBb->SetTrueEdge(trueEdge);
        trueEdge->setLikelihood(nullcheckTrueLikelihood);
    }

    // Set nullcheckBb's weight here, so we can propagate it to its successors below
    nullcheckBb->inheritWeight(firstBb);

    if (typeCheckNotNeeded)
    {
        // No type checks: a non-null obj flows directly into the (throwing) fallback.
        FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, nullcheckBb);
        nullcheckBb->SetFalseEdge(falseEdge);
        falseEdge->setLikelihood(nullcheckFalseLikelihood);
        fallbackBb->inheritWeight(nullcheckBb);
        fallbackBb->scaleBBWeight(nullcheckFalseLikelihood);
        lastBb->inheritWeight(nullcheckBb);
        lastBb->scaleBBWeight(nullcheckTrueLikelihood);

        typeCheckSucceedBb = nullptr;
    }
    else
    {
        // A non-null obj flows into the first type check.
        FlowEdge* const falseEdge = fgAddRefPred(typeChecksBbs[0], nullcheckBb);
        nullcheckBb->SetFalseEdge(falseEdge);
        falseEdge->setLikelihood(nullcheckFalseLikelihood);

        typeCheckSucceedBb      = fgNewBBFromTreeAfter(BBJ_ALWAYS, fallbackBb, typeCheckSucceedTree, debugInfo);
        FlowEdge* const newEdge = fgAddRefPred(lastBb, typeCheckSucceedBb);
        typeCheckSucceedBb->SetTargetEdge(newEdge);
    }

    // Tricky case - wire up multiple type check blocks (in most cases there is only one)
    // (This loop does not execute when numOfCandidates is 0, so typeCheckSucceedBb
    // is never used here while it is nullptr.)
    for (int candidateId = 0; candidateId < numOfCandidates; candidateId++)
    {
        BasicBlock* curTypeCheckBb = typeChecksBbs[candidateId];

        // All type checks jump straight to the typeCheckSucceedBb on success
        FlowEdge* const trueEdge = fgAddRefPred(typeCheckSucceedBb, curTypeCheckBb);
        curTypeCheckBb->SetTrueEdge(trueEdge);

        // or ...
        if (candidateId == numOfCandidates - 1)
        {
            // ... jump to the fallbackBb on last type check's failure
            FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, curTypeCheckBb);
            curTypeCheckBb->SetFalseEdge(falseEdge);
        }
        else
        {
            // ... jump to the next type check on failure
            FlowEdge* const falseEdge = fgAddRefPred(typeChecksBbs[candidateId + 1], curTypeCheckBb);
            curTypeCheckBb->SetFalseEdge(falseEdge);
        }
    }

    // Make the top half of the split flow into the expansion instead of lastBb.
    fgRedirectEdge(firstBb->TargetEdgeRef(), nullcheckBb);

    //
    // Re-distribute weights and set edge likelihoods.
    //
    // We have the likelihood estimate for each class to test against.
    // As with multi-guess GDV, once we've failed the first check, the
    // likelihood of the other checks will increase.
    //
    // For instance, suppose we have 3 classes A, B, C, with likelihoods 0.5, 0.2, 0.1,
    // and a residual 0.2 likelihood for none of the above.
    //
    // We first test for A, this is a 0.5 likelihood of success.
    //
    // If the test for A fails, we test for B. Because we only reach this second
    // test with relative likelihood 0.5, we need to divide (scale up) the remaining
    // likelihoods by 0.5, so we end up with 0.4, 0.2, (none of the above: 0.4).
    //
    // If the test for B also fails, we test for C. Because we only reach
    // this third test with relative likelihood 0.6, we need to divide (scale up)
    // the remaining likelihoods by 0.6, so we end up with 0.33, (none of the above: 0.67).
    //
    // In the code below, instead of updating all the likelihoods each time,
    // we remember how much we need to divide by to fix the next likelihood. This divisor is
    // 1.0 - (sumOfPreviousLikelihoods)
    //
    // So for the above, we have
    //
    //   Test  |  Likelihood  |  Sum of Previous  |   Rel. Likelihood
    //    A    |      0.5     |        0.0        |   0.5 / (1 - 0.0) = 0.50
    //    B    |      0.2     |        0.5        |   0.2 / (1 - 0.5) = 0.40
    //    C    |      0.1     |        0.7        |   0.1 / (1 - 0.7) = 0.33
    //   n/a   |      0.2     |        0.8        |   0.2 / (1 - 0.8) = 1.00
    //
    // The same goes for inherited weights -- the block where we test for B will have
    // the weight of A times the likelihood that A's test fails, etc.
    //
    weight_t sumOfPreviousLikelihood = 0;
    for (int candidateId = 0; candidateId < numOfCandidates; candidateId++)
    {
        BasicBlock* curTypeCheckBb = typeChecksBbs[candidateId];
        if (candidateId == 0)
        {
            // Predecessor is the nullcheck, control reaches on false.
            //
            curTypeCheckBb->inheritWeight(nullcheckBb);
            curTypeCheckBb->scaleBBWeight(nullcheckBb->GetFalseEdge()->getLikelihood());
        }
        else
        {
            // Predecessor is the prior type check, control reaches on false.
            //
            BasicBlock* const prevTypeCheckBb           = typeChecksBbs[candidateId - 1];
            weight_t          prevCheckFailedLikelihood = prevTypeCheckBb->GetFalseEdge()->getLikelihood();
            curTypeCheckBb->inheritWeight(prevTypeCheckBb);
            curTypeCheckBb->scaleBBWeight(prevCheckFailedLikelihood);
        }

        // Fix likelihood of block's outgoing edges.
        // (likelihoods[] holds percentages, hence the division by 100.)
        //
        weight_t likelihood    = (weight_t)likelihoods[candidateId] / 100;
        weight_t relLikelihood = likelihood / (1.0 - sumOfPreviousLikelihood);

        JITDUMP("Candidate %d: likelihood " FMT_WT " relative likelihood " FMT_WT "\n", candidateId, likelihood,
                relLikelihood);

        curTypeCheckBb->GetTrueEdge()->setLikelihood(relLikelihood);
        curTypeCheckBb->GetFalseEdge()->setLikelihood(1.0 - relLikelihood);
        sumOfPreviousLikelihood += likelihood;
    }

    // The fallback is reached when the last check fails. When there are no type
    // checks, lastTypeCheckBb is nullcheckBb and its false edge targets fallbackBb.
    fallbackBb->inheritWeight(lastTypeCheckBb);
    fallbackBb->scaleBBWeight(lastTypeCheckBb->GetFalseEdge()->getLikelihood());

    // A BBJ_THROW fallback has no successor; only the BBJ_ALWAYS flavors rejoin lastBb.
    if (fallbackBb->KindIs(BBJ_ALWAYS))
    {
        FlowEdge* const newEdge = fgAddRefPred(lastBb, fallbackBb);
        fallbackBb->SetTargetEdge(newEdge);
    }

    if (!typeCheckNotNeeded)
    {
        // Scale by the sum of the (non-relative) candidate likelihoods accumulated above.
        typeCheckSucceedBb->inheritWeight(typeChecksBbs[0]);
        typeCheckSucceedBb->scaleBBWeight(sumOfPreviousLikelihood);
        lastBb->inheritWeight(firstBb);
    }

    //
    // Validate EH regions
    //
    assert(BasicBlock::sameEHRegion(firstBb, lastBb));
    assert(BasicBlock::sameEHRegion(firstBb, nullcheckBb));
    assert(BasicBlock::sameEHRegion(firstBb, fallbackBb));

    // call guarantees that obj is never null, we can drop the nullcheck
    // by converting it to a BBJ_ALWAYS to its false target.
    if ((call->gtCallMoreFlags & GTF_CALL_M_CAST_OBJ_NONNULL) != 0)
    {
        // The JTRUE is the last statement of nullcheckBb; "tmp = obj" stays.
        fgRemoveStmt(nullcheckBb, nullcheckBb->lastStmt());
        FlowEdge* const removedEdge = nullcheckBb->GetTrueEdge();
        fgRemoveRefPred(removedEdge);
        nullcheckBb->SetKindAndTargetEdge(BBJ_ALWAYS, nullcheckBb->GetFalseEdge());

        // Locally repair profile
        if (nullcheckBb->hasProfileWeight())
        {
            BasicBlock* const removedTarget = removedEdge->getDestinationBlock();
            if (removedTarget->bbPreds == nullptr)
            {
                // Unreachable path will be removed by the next flowgraph simplification pass.
                // For now, mark the profile as inconsistent to skip post-phase checks.
                JITDUMP("fgLateCastExpansionForCall: " FMT_BB
                        " is unreachable, and will be removed later. Data %s inconsistent.\n",
                        removedTarget->bbNum, fgPgoConsistent ? "is now" : "was already");
                fgPgoConsistent = false;
            }

            // Recompute weights from incoming edges for the blocks in the range
            // starting after nullcheckBb up to lastBb.
            // NOTE(review): this loop variable shadows the outer local named 'block'.
            for (BasicBlock* const block : Blocks(nullcheckBb->Next(), lastBb))
            {
                block->setBBProfileWeight(block->computeIncomingWeight());
            }
        }
    }

    if (fallbackBb->KindIs(BBJ_THROW) && (fallbackBb->bbWeight != BB_ZERO_WEIGHT))
    {
        // This flow is disappearing.
        JITDUMP("fgLateCastExpansionForCall: fallback " FMT_BB " throws and has flow into it. Data %s inconsistent.\n",
                fallbackBb->bbNum, fgPgoConsistent ? "is now" : "was already");
        fgPgoConsistent = false;
    }

    // Bonus step: merge prevBb with nullcheckBb as they are likely to be mergeable
    if (fgCanCompactBlock(firstBb))
    {
        fgCompactBlock(firstBb);
    }

    return true;
}

//------------------------------------------------------------------------------
// fgExpandStackArrayAllocations : walk the method and expand every "new array"
//    helper call that targets a stack allocated array
//
// Returns:
//    PhaseStatus::MODIFIED_EVERYTHING if at least one helper call was expanded,
//    PhaseStatus::MODIFIED_NOTHING otherwise.
//
PhaseStatus Compiler::fgExpandStackArrayAllocations()
{
    // Cheap exit: nothing to look for if the method has no stack allocated arrays.
    if (!doesMethodHaveStackAllocatedArray())
    {
        return PhaseStatus::MODIFIED_NOTHING;
    }

    // Each allocation site is rewritten (by fgExpandStackArrayAllocation) into
    // stores of the array's method table and length, and the allocation call
    // itself is replaced with the address of the local array.
    //
    bool anyExpanded = false;

    for (BasicBlock* const curBlock : Blocks())
    {
        for (Statement* const curStmt : curBlock->Statements())
        {
            // Only statements whose root carries GTF_CALL can contain a helper call.
            const bool stmtHasCall = (curStmt->GetRootNode()->gtFlags & GTF_CALL) != 0;
            if (!stmtHasCall)
            {
                continue;
            }

            for (GenTree* const node : curStmt->TreeList())
            {
                if (node->IsCall() && fgExpandStackArrayAllocation(curBlock, curStmt, node->AsCall()))
                {
                    // Expanding splits the statement's tree, so we are
                    // done with this statement.
                    //
                    anyExpanded = true;
                    break;
                }
            }
        }
    }

    // We can't assert(anyExpanded) here, as array allocation sites may
    // have been unreachable or dead-coded.
    //
    if (anyExpanded)
    {
        return PhaseStatus::MODIFIED_EVERYTHING;
    }

    return PhaseStatus::MODIFIED_NOTHING;
}

//------------------------------------------------------------------------------
// fgExpandStackArrayAllocation: expand a new array helper call for a stack
//    allocated array
//
// Arguments:
//    block - block containing the helper call to expand
//    stmt  - statement containing the helper call
//    call  - the helper call
//
// Returns:
//    true if the call was a new array helper carrying a stack array local
//    argument and was expanded; false otherwise.
//
// Notes:
//    The expansion inserts two stores ahead of "stmt" (the array's method table
//    pointer, then its length) and replaces the call with the address of the
//    stack local.
//
bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, GenTreeCall* call)
{
    if (!call->IsHelperCall())
    {
        return false;
    }

    // Positions of the type and length arguments for the helpers we handle.
    //
    int typeArgIndex   = -1;
    int lengthArgIndex = -1;

    switch (eeGetHelperNum(call->gtCallMethHnd))
    {
        case CORINFO_HELP_NEWARR_1_DIRECT:
        case CORINFO_HELP_NEWARR_1_VC:
        case CORINFO_HELP_NEWARR_1_PTR:
        case CORINFO_HELP_NEWARR_1_ALIGN8:
            typeArgIndex   = 0;
            lengthArgIndex = 1;
            break;

        default:
            // Not a helper we know how to expand.
            return false;
    }

    // A helper call for a local (stack) array carries an extra arg with the array's address.
    //
    CallArg* const localArrayArg = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayLocal);

    if (localArrayArg == nullptr)
    {
        return false;
    }

    JITDUMP("Expanding new array helper for stack allocated array at [%06d] in " FMT_BB ":\n", dspTreeID(call),
            block->bbNum);
    DISPTREE(call);
    JITDUMP("\n");

    // Split the statement at the call, then morph block ops in every statement
    // the split produced ahead of "stmt".
    //
    Statement* firstSplitStmt = nullptr;
    GenTree**  useOfCall      = nullptr;

    if (gtSplitTree(block, stmt, call, &firstSplitStmt, &useOfCall))
    {
        for (Statement* cur = firstSplitStmt; (cur != nullptr) && (cur != stmt); cur = cur->GetNextStmt())
        {
            fgMorphStmtBlockOps(block, cur);
        }
    }

    GenTree* const localArrayAddr = localArrayArg->GetNode();

    // Store the method table pointer at the start of the array.
    //
    GenTree* const   mtValue     = call->gtArgs.GetArgByIndex(typeArgIndex)->GetNode();
    GenTree* const   mtStoreNode = gtNewStoreValueNode(TYP_I_IMPL, localArrayAddr, mtValue);
    Statement* const mtStoreStmt = fgNewStmtFromTree(mtStoreNode);

    fgInsertStmtBefore(block, stmt, mtStoreStmt);

    // Store the length (cast to int) at the array's length offset.
    //
    GenTree* const   lenValue     = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode();
    GenTree* const   lenValueInt  = fgOptimizeCast(gtNewCastNode(TYP_INT, lenValue, false, TYP_INT));
    GenTree* const   lenAddr      = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(localArrayAddr),
                                                  gtNewIconNode(OFFSETOF__CORINFO_Array__length, TYP_I_IMPL));
    GenTree* const   lenStoreNode = gtNewStoreValueNode(TYP_INT, lenAddr, lenValueInt);
    Statement* const lenStoreStmt = fgNewStmtFromTree(lenStoreNode);

    fgInsertStmtBefore(block, stmt, lenStoreStmt);

    // The call's value is now simply the address of the stack local.
    //
    *useOfCall = gtCloneExpr(localArrayAddr);
    DEBUG_DESTROY_NODE(call);

    fgMorphStmtBlockOps(block, stmt);
    gtUpdateStmtSideEffects(stmt);

    return true;
}
