/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <sys/mman.h>
#include "Dalvik.h"
#include "libdex/DexOpcodes.h"
#include "compiler/Compiler.h"
#include "compiler/CompilerIR.h"
#include "interp/Jit.h"
#include "libdex/DexFile.h"
#include "Lower.h"
#include "NcgAot.h"
#include "compiler/codegen/CompilerCodegen.h"

/* Init values when a predicted chain is initially assembled */
/* E7FE is branch to self */
#define PREDICTED_CHAIN_BX_PAIR_INIT     0xe7fe

/* Target-specific save/restore */
extern "C" void dvmJitCalleeSave(double *saveArea);
extern "C" void dvmJitCalleeRestore(double *saveArea);

/*
 * Determine the initial instruction set to be used for this trace.
 * Later components may decide to change this.
 */
//JitInstructionSetType dvmCompilerInstructionSet(CompilationUnit *cUnit)
JitInstructionSetType dvmCompilerInstructionSet(void)
{
    return DALVIK_JIT_IA32;
}

JitInstructionSetType dvmCompilerGetInterpretTemplateSet()
{
    return DALVIK_JIT_IA32;
}

/* We don't use an interpret template for IA32 */
void *dvmCompilerGetInterpretTemplate()
{
      return NULL;
}

/* Track the number of times that the code cache is patched */
#if defined(WITH_JIT_TUNING)
#define UPDATE_CODE_CACHE_PATCHES()    (gDvmJit.codeCachePatches++)
#else
#define UPDATE_CODE_CACHE_PATCHES()
#endif

bool dvmCompilerArchInit() {
    /* Target-specific configuration */
    gDvmJit.jitTableSize = 1 << 12;
    gDvmJit.jitTableMask = gDvmJit.jitTableSize - 1;
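    /*
     * Note: jitTableSize is presumably required to stay a power of two,
     * since jitTableMask is derived as (size - 1) and used as a mask when
     * hashing into the table.
     */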
    if (gDvmJit.threshold == 0) {
        gDvmJit.threshold = 255;
    }
    gDvmJit.codeCacheSize = 512*1024;
    gDvmJit.optLevel = kJitOptLevelO1;

    //Disable Method-JIT
    gDvmJit.disableOpt |= (1 << kMethodJit);

#if defined(WITH_SELF_VERIFICATION)
    /* Force into blocking mode */
    gDvmJit.blockingMode = true;
    gDvm.nativeDebuggerActive = true;
#endif

    // Make sure all threads have current values
    dvmJitUpdateThreadStateAll();

    return true;
}

void dvmCompilerPatchInlineCache(void)
{
    int i;
    PredictedChainingCell *minAddr, *maxAddr;

    /* Nothing to be done */
    if (gDvmJit.compilerICPatchIndex == 0) return;

    /*
     * Since all threads are already stopped we don't really need to acquire
     * the lock. But race conditions can easily be introduced in the future
     * without paying attention, so we still acquire the lock here.
     */
    dvmLockMutex(&gDvmJit.compilerICPatchLock);

    UNPROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed);

    //ALOGD("Number of IC patch work orders: %d", gDvmJit.compilerICPatchIndex);

    /* Initialize the min/max address range */
    minAddr = (PredictedChainingCell *)
        ((char *) gDvmJit.codeCache + gDvmJit.codeCacheSize);
    maxAddr = (PredictedChainingCell *) gDvmJit.codeCache;

    for (i = 0; i < gDvmJit.compilerICPatchIndex; i++) {
        ICPatchWorkOrder *workOrder = &gDvmJit.compilerICPatchQueue[i];
        PredictedChainingCell *cellAddr = workOrder->cellAddr;
        PredictedChainingCell *cellContent = &workOrder->cellContent;
        ClassObject *clazz = dvmFindClassNoInit(workOrder->classDescriptor,
                                                workOrder->classLoader);

        assert(clazz->serialNumber == workOrder->serialNumber);

        /* Use the newly resolved clazz pointer */
        cellContent->clazz = clazz;

        if (cellAddr->clazz == NULL) {
            COMPILER_TRACE_CHAINING(
                ALOGI("Jit Runtime: predicted chain %p to %s (%s) initialized",
                      cellAddr,
                      cellContent->clazz->descriptor,
                      cellContent->method->name));
        } else {
            COMPILER_TRACE_CHAINING(
                ALOGI("Jit Runtime: predicted chain %p from %s to %s (%s) "
                      "patched",
                      cellAddr,
                      cellAddr->clazz->descriptor,
                      cellContent->clazz->descriptor,
                      cellContent->method->name));
        }

        /* Patch the chaining cell */
        *cellAddr = *cellContent;
        minAddr = (cellAddr < minAddr) ? cellAddr : minAddr;
        maxAddr = (cellAddr > maxAddr) ? cellAddr : maxAddr;
    }

    PROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed);

    gDvmJit.compilerICPatchIndex = 0;
    dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
}

/* Target-specific cache clearing */
void dvmCompilerCacheClear(char *start, size_t size)
{
    /* "0xFF 0xFF" is an invalid opcode for x86. */
    memset(start, 0xFF, size);
}

/* for JIT debugging, to be implemented */
void dvmJitCalleeSave(double *saveArea) {
}

void dvmJitCalleeRestore(double *saveArea) {
}

void dvmJitToInterpSingleStep() {
}

JitTraceDescription *dvmCopyTraceDescriptor(const u2 *pc,
                                            const JitEntry *knownEntry) {
    return NULL;
}

void dvmCompilerCodegenDump(CompilationUnit *cUnit) //in ArchUtility.c
{
}

void dvmCompilerArchDump(void)
{
}

char *getTraceBase(const JitEntry *p)
{
    return NULL;
}

void dvmCompilerAssembleLIR(CompilationUnit *cUnit, JitTranslationInfo* info)
{
}

void dvmJitInstallClassObjectPointers(CompilationUnit *cUnit, char *codeAddress)
{
}

void dvmCompilerMethodMIR2LIR(CompilationUnit *cUnit)
{
    // Method-based JIT not supported for x86.
}

void dvmJitScanAllClassPointers(void (*callback)(void *))
{
}

/* Handy function to retrieve the profile count */
static inline int getProfileCount(const JitEntry *entry)
{
    if (entry->dPC == 0 || entry->codeAddress == 0)
        return 0;
    u4 *pExecutionCount = (u4 *) getTraceBase(entry);

    return pExecutionCount ? *pExecutionCount : 0;
}
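/*
 * Note: getTraceBase() above is a stub that returns NULL on IA32, so
 * getProfileCount() always yields 0 here and the average reported by
 * dvmCompilerSortAndPrintTraceProfiles() below is therefore 0 as well.
 */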

/* qsort callback function */
static int sortTraceProfileCount(const void *entry1, const void *entry2)
{
    const JitEntry *jitEntry1 = (const JitEntry *)entry1;
    const JitEntry *jitEntry2 = (const JitEntry *)entry2;

    JitTraceCounter_t count1 = getProfileCount(jitEntry1);
    JitTraceCounter_t count2 = getProfileCount(jitEntry2);
    return (count1 == count2) ? 0 : ((count1 > count2) ? -1 : 1);
}

/* Sort the trace profile counts and dump them */
void dvmCompilerSortAndPrintTraceProfiles() //in Assemble.c
{
    JitEntry *sortedEntries;
    int numTraces = 0;
    unsigned long counts = 0;
    unsigned int i;

    /* Make sure that the table is not changing */
    dvmLockMutex(&gDvmJit.tableLock);

    /* Sort the entries by descending order */
    sortedEntries = (JitEntry *)malloc(sizeof(JitEntry) * gDvmJit.jitTableSize);
    if (sortedEntries == NULL)
        goto done;
    memcpy(sortedEntries, gDvmJit.pJitEntryTable,
           sizeof(JitEntry) * gDvmJit.jitTableSize);
    qsort(sortedEntries, gDvmJit.jitTableSize, sizeof(JitEntry),
          sortTraceProfileCount);

    /* Count the traces and accumulate their execution counts */
    for (i=0; i < gDvmJit.jitTableSize; i++) {
        if (sortedEntries[i].dPC != 0) {
            numTraces++;
            counts += getProfileCount(&sortedEntries[i]);
        }
    }
    if (numTraces == 0)
        numTraces = 1;
    ALOGI("JIT: Average execution count -> %d",(int)(counts / numTraces));

    free(sortedEntries);
done:
    dvmUnlockMutex(&gDvmJit.tableLock);
    return;
}

void jumpWithRelOffset(char* instAddr, int relOffset) {
    stream = instAddr;
    OpndSize immSize = estOpndSizeFromImm(relOffset);
    relOffset -= getJmpCallInstSize(immSize, JmpCall_uncond);
    dump_imm(Mnemonic_JMP, immSize, relOffset);
}
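/*
 * The displacement of an x86 jmp is relative to the address of the *next*
 * instruction, which is why the helpers here subtract the jump instruction
 * size from (target - instAddr). For example, for a 5-byte "jmp rel32"
 * located at instAddr, the encoded immediate is target - (instAddr + 5).
 */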

// Works whether or not instructions for the target basic block have been generated
LowOp* jumpToBasicBlock(char* instAddr, int targetId) {
    stream = instAddr;
    bool unknown;
    OpndSize size;
    int relativeNCG = targetId;
    relativeNCG = getRelativeNCG(targetId, JmpCall_uncond, &unknown, &size);
    unconditional_jump_int(relativeNCG, size);
    return NULL;
}

LowOp* condJumpToBasicBlock(char* instAddr, ConditionCode cc, int targetId) {
    stream = instAddr;
    bool unknown;
    OpndSize size;
    int relativeNCG = targetId;
    relativeNCG = getRelativeNCG(targetId, JmpCall_cond, &unknown, &size);
    conditional_jump_int(cc, relativeNCG, size);
    return NULL;
}

/*
 * Attempt to enqueue a work order to patch an inline cache for a predicted
 * chaining cell used by virtual/interface calls.
 */
static bool inlineCachePatchEnqueue(PredictedChainingCell *cellAddr,
                                    PredictedChainingCell *newContent)
{
    bool result = true;

    /*
     * Make sure only one thread gets here since updating the cell (i.e. the
     * fast path) and queueing the request (i.e. the queued path) have to be
     * done in an atomic fashion.
     */
    dvmLockMutex(&gDvmJit.compilerICPatchLock);

    /* Fast path for an uninitialized chaining cell */
    if (cellAddr->clazz == NULL &&
        cellAddr->branch == PREDICTED_CHAIN_BX_PAIR_INIT) {
        UNPROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr));

        cellAddr->method = newContent->method;
        cellAddr->branch = newContent->branch;
        cellAddr->branch2 = newContent->branch2;

        /*
         * The update order matters - make sure clazz is updated last since it
         * will bring the uninitialized chaining cell to life.
         */
        android_atomic_release_store((int32_t)newContent->clazz,
            (volatile int32_t *)(void*) &cellAddr->clazz);
        //cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr+1), 0);
        UPDATE_CODE_CACHE_PATCHES();

        PROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr));

#if 0
        MEM_BARRIER();
        cellAddr->clazz = newContent->clazz;
        //cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr+1), 0);
#endif
#if defined(WITH_JIT_TUNING)
        gDvmJit.icPatchInit++;
#endif
        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: FAST predicted chain %p to method %s%s %p",
                  cellAddr, newContent->clazz->descriptor, newContent->method->name, newContent->method));
    /* Check if this is a frequently missed clazz */
    } else if (cellAddr->stagedClazz != newContent->clazz) {
        /* Not proven to be frequent yet - build up the filter cache */
        UNPROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr));

        cellAddr->stagedClazz = newContent->clazz;

        UPDATE_CODE_CACHE_PATCHES();
        PROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr));

#if defined(WITH_JIT_TUNING)
        gDvmJit.icPatchRejected++;
#endif
    /*
     * Different classes but same method implementation - it is safe to just
     * patch the class value without the need to stop the world.
     */
    } else if (cellAddr->method == newContent->method) {
        UNPROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr));

        cellAddr->clazz = newContent->clazz;
        /* No need to flush the cache here since the branch is not patched */
        UPDATE_CODE_CACHE_PATCHES();

        PROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr));

#if defined(WITH_JIT_TUNING)
        gDvmJit.icPatchLockFree++;
#endif
    /*
     * Cannot patch the chaining cell inline - queue it until the next safe
     * point.
     */
    } else if (gDvmJit.compilerICPatchIndex < COMPILER_IC_PATCH_QUEUE_SIZE)  {
        int index = gDvmJit.compilerICPatchIndex++;
        const ClassObject *clazz = newContent->clazz;

        gDvmJit.compilerICPatchQueue[index].cellAddr = cellAddr;
        gDvmJit.compilerICPatchQueue[index].cellContent = *newContent;
        gDvmJit.compilerICPatchQueue[index].classDescriptor = clazz->descriptor;
        gDvmJit.compilerICPatchQueue[index].classLoader = clazz->classLoader;
        /* For verification purposes only */
        gDvmJit.compilerICPatchQueue[index].serialNumber = clazz->serialNumber;

#if defined(WITH_JIT_TUNING)
        gDvmJit.icPatchQueued++;
#endif
        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: QUEUE predicted chain %p to method %s%s",
                  cellAddr, newContent->clazz->descriptor, newContent->method->name));
    } else {
        /* Queue is full - just drop this patch request */
#if defined(WITH_JIT_TUNING)
        gDvmJit.icPatchDropped++;
#endif

        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: DROP predicted chain %p to method %s%s",
                  cellAddr, newContent->clazz->descriptor, newContent->method->name));
    }

    dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
    return result;
}
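/*
 * Summary of the cases above: an uninitialized cell is patched in place right
 * away (clazz is written last, with release semantics, so a concurrent reader
 * never sees a half-initialized cell); a class seen for the first time only
 * goes into the stagedClazz filter; a class change with the same method
 * target is patched lock-free; everything else is queued and applied at the
 * next safe point by dvmCompilerPatchInlineCache(), or dropped if the queue
 * is full.
 */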

/*
 * This method is called from the invoke templates for virtual and interface
 * methods to speculatively set up a chain to the callee. The templates are
 * written in assembly and have set up method, cell, and clazz in r0, r2, and
 * r3 respectively, so there is an unused argument in the list. Upon return one
 * of the following three results may happen:
 *   1) The chain is not set up because the callee is native. Reset the rechain
 *      count to a big number so that it will take a long time before the next
 *      rechain attempt happens.
 *   2) The chain is not set up because the callee has not been compiled yet.
 *      Reset the rechain count to a small number and retry in the near future.
 *   3) Ask all other threads to stop before patching this chaining cell.
 *      This is required because another thread may have passed the class check
 *      but hasn't reached the chaining cell yet to follow the chain. If we
 *      patch the content before halting the other thread, there is a small
 *      window in which it could follow the new but wrong chain and invoke a
 *      different method.
 */
const Method *dvmJitToPatchPredictedChain(const Method *method,
                                          Thread *self,
                                          PredictedChainingCell *cell,
                                          const ClassObject *clazz)
{
    int newRechainCount = PREDICTED_CHAIN_COUNTER_RECHAIN;
    /* Don't come back here for a long time if the method is native */
    if (dvmIsNativeMethod(method)) {
        UNPROTECT_CODE_CACHE(cell, sizeof(*cell));

        /*
         * Put a non-zero/bogus value in the clazz field so that it won't
         * trigger immediate patching and will continue to fail to match with
         * a real clazz pointer.
         */
        cell->clazz = (ClassObject *) PREDICTED_CHAIN_FAKE_CLAZZ;

        UPDATE_CODE_CACHE_PATCHES();
        PROTECT_CODE_CACHE(cell, sizeof(*cell));
        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: predicted chain %p to native method %s ignored",
                  cell, method->name));
        goto done;
    }
    {
    int tgtAddr = (int) dvmJitGetTraceAddr(method->insns);

    /*
     * No compilation has been done yet for the callee. Reset the counter to a
     * small value and come back to check soon.
     */
    if ((tgtAddr == 0) ||
        ((void*)tgtAddr == dvmCompilerGetInterpretTemplate())) {
        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: predicted chain %p to method %s%s delayed",
                  cell, method->clazz->descriptor, method->name));
        goto done;
    }

    PredictedChainingCell newCell;

    if (cell->clazz == NULL) {
        newRechainCount = self->icRechainCount;
    }

    int relOffset = (int) tgtAddr - (int)cell;
    OpndSize immSize = estOpndSizeFromImm(relOffset);
    int jumpSize = getJmpCallInstSize(immSize, JmpCall_uncond);
    relOffset -= jumpSize;
    COMPILER_TRACE_CHAINING(
            ALOGI("inlineCachePatchEnqueue chain %p to method %s%s inst size %d",
                  cell, method->clazz->descriptor, method->name, jumpSize));
    //can't use stream here since it is used by the compilation thread
    dump_imm_with_codeaddr(Mnemonic_JMP, immSize, relOffset, (char*) (&newCell)); //update newCell.branch

    newCell.clazz = clazz;
    newCell.method = method;

    /*
     * Add the work order to the queue so the chaining cell will be patched
     * the next time a safe point is entered.
     *
     * If the enqueuing fails, reset the rechain count to a normal value so
     * that it won't get indefinitely delayed.
     */
    inlineCachePatchEnqueue(cell, &newCell);
    }
done:
    self->icRechainCount = newRechainCount;
    return method;
}
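/*
 * The jmp for the new cell is assembled into the local newCell via
 * dump_imm_with_codeaddr() because the global "stream" pointer belongs to the
 * compilation thread. The code cache itself is only touched by
 * inlineCachePatchEnqueue(), either immediately or later at a safe point in
 * dvmCompilerPatchInlineCache().
 */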

/*
 * Unchain a trace given the starting address of the translation
 * in the code cache.  Refer to the diagram in dvmCompilerAssembleLIR.
 * For ARM, it returns the address following the last cell unchained.
 * For IA, it returns NULL since cacheflush is not required for IA.
 */
u4* dvmJitUnchain(void* codeAddr)
{
    /* codeAddr is 4-byte aligned, so is chain cell count offset */
    u2* pChainCellCountOffset = (u2*)((char*)codeAddr - 4);
    u2 chainCellCountOffset = *pChainCellCountOffset;
    /* chain cell counts information is 4-byte aligned */
    ChainCellCounts *pChainCellCounts =
          (ChainCellCounts*)((char*)codeAddr + chainCellCountOffset);
    u2* pChainCellOffset = (u2*)((char*)codeAddr - 2);
    u2 chainCellOffset = *pChainCellOffset;
    u1* pChainCells;
    int i,j;
    PredictedChainingCell *predChainCell;
    int padding;

    /* Locate the beginning of the chain cell region */
    pChainCells = (u1 *)((char*)codeAddr + chainCellOffset);

    /* The cells are sorted in order - walk through them and reset */
    for (i = 0; i < kChainingCellGap; i++) {
        /* for hot, normal, singleton chaining:
               nop  //padding.
               jmp 0
               mov imm32, reg1
               mov imm32, reg2
               call reg2
           after chaining:
               nop
               jmp imm
               mov imm32, reg1
               mov imm32, reg2
               call reg2
           after unchaining:
               nop
               jmp 0
               mov imm32, reg1
               mov imm32, reg2
               call reg2
           Space occupied by the chaining cell in bytes: the nop is padding so
                that the target of "jmp 0" is 4-byte aligned.
           Space for a predicted chaining cell: 5 words = 20 bytes
        */
        int elemSize = 0;
        if (i == kChainingCellInvokePredicted) {
            elemSize = 20;
        }
        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: unchaining type %d count %d", i, pChainCellCounts->u.count[i]));

        for (j = 0; j < pChainCellCounts->u.count[i]; j++) {
            switch(i) {
                case kChainingCellNormal:
                case kChainingCellHot:
                case kChainingCellInvokeSingleton:
                case kChainingCellBackwardBranch:
                    COMPILER_TRACE_CHAINING(
                        ALOGI("Jit Runtime: unchaining of normal, hot, or singleton"));
                    pChainCells = (u1*) (((uint)pChainCells + 4)&(~0x03));
                    elemSize = 4+5+5+2;
                    memset(pChainCells, 0, 4);
                    break;
                case kChainingCellInvokePredicted:
                    COMPILER_TRACE_CHAINING(
                        ALOGI("Jit Runtime: unchaining of predicted"));
                    /* 4-byte aligned */
                    padding = (4 - ((u4)pChainCells & 3)) & 3;
                    pChainCells += padding;
                    predChainCell = (PredictedChainingCell *) pChainCells;
                    /*
                     * Another mutator thread may be racing to use this
                     * particular predicted cell and could already have passed
                     * the clazz comparison. So we cannot safely wipe the
                     * method and branch, but it is safe to clear the clazz,
                     * which serves as the key.
                     */
                    predChainCell->clazz = PREDICTED_CHAIN_CLAZZ_INIT;
                    break;
                default:
                    ALOGE("Unexpected chaining type: %d", i);
                    dvmAbort();  // dvmAbort OK here - can't safely recover
            }
            COMPILER_TRACE_CHAINING(
                ALOGI("Jit Runtime: unchaining 0x%x", (int)pChainCells));
            pChainCells += elemSize;  /* Advance by a fixed number of bytes */
        }
    }
    return NULL;
}

/* Unchain all translations in the cache. */
void dvmJitUnchainAll()
{
    ALOGV("Jit Runtime: unchaining all");
    if (gDvmJit.pJitEntryTable != NULL) {
        COMPILER_TRACE_CHAINING(ALOGI("Jit Runtime: unchaining all"));
        dvmLockMutex(&gDvmJit.tableLock);

        UNPROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed);

        for (size_t i = 0; i < gDvmJit.jitTableSize; i++) {
            if (gDvmJit.pJitEntryTable[i].dPC &&
                !gDvmJit.pJitEntryTable[i].u.info.isMethodEntry &&
                gDvmJit.pJitEntryTable[i].codeAddress) {
                      dvmJitUnchain(gDvmJit.pJitEntryTable[i].codeAddress);
            }
        }

        PROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed);

        dvmUnlockMutex(&gDvmJit.tableLock);
        gDvmJit.translationChains = 0;
    }
    gDvmJit.hasNewChain = false;
}

#define P_GPR_1 PhysicalReg_EBX
/* Add an additional jump instruction, keeping the jump target 4-byte aligned. */
static void insertJumpHelp()
{
    int rem = (uint)stream % 4;
    int nop_size = 3 - rem;
    dump_nop(nop_size);
    unconditional_jump_int(0, OpndSize_32);
    return;
}
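/*
 * insertJumpHelp() pads with (3 - stream % 4) nops so that the one-byte JMP
 * opcode lands at an address equal to 3 mod 4; the 32-bit displacement that
 * follows is then 4-byte aligned, presumably so that later patching by
 * dvmJitChain()/dvmJitUnchain() updates the displacement without tearing.
 */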

/* Chaining cell for code that may need warmup. */
/* ARM assembly: ldr r0, [r6, #76] (why a single instruction to access member of glue structure?)
                 blx r0
                 data 0xb23a //bytecode address: 0x5115b23a
                 data 0x5115
   IA32 assembly:
                  jmp  0 //5 bytes
                  movl address, %ebx
                  movl dvmJitToInterpNormal, %eax
                  call %eax
                  <-- return address
*/
static void handleNormalChainingCell(CompilationUnit *cUnit,
                                     unsigned int offset, int blockId, LowOpBlockLabel* labelList)
{
    ALOGV("in handleNormalChainingCell for method %s block %d BC offset %x NCG offset %x",
          cUnit->method->name, blockId, offset, stream - streamMethodStart);
    if(dump_x86_inst)
        ALOGI("LOWER NormalChainingCell at offsetPC %x offsetNCG %x @%p",
              offset, stream - streamMethodStart, stream);
    /* Add one additional "jump 0" instruction; it may be modified during jit chaining. This helps
     * resolve the multithreading issue.
     */
    insertJumpHelp();
    move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true);
    scratchRegs[0] = PhysicalReg_EAX;
    call_dvmJitToInterpNormal();
    //move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); /* used when unchaining */
}

/*
 * Chaining cell for instructions that immediately follow already translated
 * code.
 */
static void handleHotChainingCell(CompilationUnit *cUnit,
                                  unsigned int offset, int blockId, LowOpBlockLabel* labelList)
{
    ALOGV("in handleHotChainingCell for method %s block %d BC offset %x NCG offset %x",
          cUnit->method->name, blockId, offset, stream - streamMethodStart);
    if(dump_x86_inst)
        ALOGI("LOWER HotChainingCell at offsetPC %x offsetNCG %x @%p",
              offset, stream - streamMethodStart, stream);
    /* Add one additional "jump 0" instruction; it may be modified during jit chaining. This helps
     * resolve the multithreading issue.
     */
    insertJumpHelp();
    move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true);
    scratchRegs[0] = PhysicalReg_EAX;
    call_dvmJitToInterpTraceSelect();
    //move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); /* used when unchaining */
}

/* Chaining cell for branches that branch back into the same basic block */
static void handleBackwardBranchChainingCell(CompilationUnit *cUnit,
                                     unsigned int offset, int blockId, LowOpBlockLabel* labelList)
{
    ALOGV("in handleBackwardBranchChainingCell for method %s block %d BC offset %x NCG offset %x",
          cUnit->method->name, blockId, offset, stream - streamMethodStart);
    if(dump_x86_inst)
        ALOGI("LOWER BackwardBranchChainingCell at offsetPC %x offsetNCG %x @%p",
              offset, stream - streamMethodStart, stream);
    /* Add one additional "jump 0" instruction; it may be modified during jit chaining. This helps
     * resolve the multithreading issue.
     */
    insertJumpHelp();
    move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true);
    scratchRegs[0] = PhysicalReg_EAX;
    call_dvmJitToInterpNormal();
    //move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); /* used when unchaining */
}

/* Chaining cell for monomorphic method invocations. */
static void handleInvokeSingletonChainingCell(CompilationUnit *cUnit,
                                              const Method *callee, int blockId, LowOpBlockLabel* labelList)
{
    ALOGV("in handleInvokeSingletonChainingCell for method %s block %d callee %s NCG offset %x",
          cUnit->method->name, blockId, callee->name, stream - streamMethodStart);
    if(dump_x86_inst)
        ALOGI("LOWER InvokeSingletonChainingCell at block %d offsetNCG %x @%p",
              blockId, stream - streamMethodStart, stream);
    /* Add one additional "jump 0" instruction; it may be modified during jit chaining. This helps
     * resolve the multithreading issue.
     */
    insertJumpHelp();
    move_imm_to_reg(OpndSize_32, (int) (callee->insns), P_GPR_1, true);
    scratchRegs[0] = PhysicalReg_EAX;
    call_dvmJitToInterpTraceSelect();
    //move_imm_to_reg(OpndSize_32, (int) (callee->insns), P_GPR_1, true); /* used when unchaining */
}
#undef P_GPR_1

/* Chaining cell for polymorphic (predicted) method invocations. */
static void handleInvokePredictedChainingCell(CompilationUnit *cUnit, int blockId)
{
    if(dump_x86_inst)
        ALOGI("LOWER InvokePredictedChainingCell at block %d offsetNCG %x @%p",
              blockId, stream - streamMethodStart, stream);
#ifndef PREDICTED_CHAINING
    //assume rPC for callee->insns in %ebx
    scratchRegs[0] = PhysicalReg_EAX;
#if defined(WITH_JIT_TUNING)
    /* Predicted chaining is not enabled. Fall back to interpreter and
     * indicate that predicted chaining was not done.
     */
    move_imm_to_reg(OpndSize_32, kInlineCacheMiss, PhysicalReg_EDX, true);
#endif
    call_dvmJitToInterpTraceSelectNoChain();
#else
    /* make sure the section for the predicted chaining cell is 4-byte aligned */
    //int padding = (4 - ((u4)stream & 3)) & 3;
    //stream += padding;
    int* streamData = (int*)stream;
    /* Should not be executed in the initial state */
    streamData[0] = PREDICTED_CHAIN_BX_PAIR_INIT;
    streamData[1] = 0;
    /* To be filled: class */
    streamData[2] = PREDICTED_CHAIN_CLAZZ_INIT;
    /* To be filled: method */
    streamData[3] = PREDICTED_CHAIN_METHOD_INIT;
    /*
     * Rechain count. The initial value of 0 here will trigger chaining upon
     * the first invocation of this callsite.
     */
    streamData[4] = PREDICTED_CHAIN_COUNTER_INIT;
#if 0
    ALOGI("--- DATA @ %p: %x %x %x %x", stream, *((int*)stream), *((int*)(stream+4)),
          *((int*)(stream+8)), *((int*)(stream+12)));
#endif
    stream += 20; //5 words * 4 bytes
#endif
}
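/*
 * The five words emitted above form the initial state of the predicted cell:
 * [0] branch (the "branch to self" init marker), [1] second branch word,
 * [2] class key, [3] method, [4] rechain counter. The class word is the one
 * compared at the call site and patched by the inline-cache code above.
 */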

/* Load the Dalvik PC into r0 and jump to the specified target */
static void handlePCReconstruction(CompilationUnit *cUnit,
                                   LowOpBlockLabel *targetLabel)
{
#if 0
    LowOp **pcrLabel =
        (LowOp **) cUnit->pcReconstructionList.elemList;
    int numElems = cUnit->pcReconstructionList.numUsed;
    int i;
    for (i = 0; i < numElems; i++) {
        dvmCompilerAppendLIR(cUnit, (LIR *) pcrLabel[i]);
        /* r0 = dalvik PC */
        loadConstant(cUnit, r0, pcrLabel[i]->operands[0]);
        genUnconditionalBranch(cUnit, targetLabel);
    }
#endif
}

//use the O0 code generator for hoisted checks outside of the loop
/*
 * vA = arrayReg;
 * vB = idxReg;
 * vC = endConditionReg;
 * arg[0] = maxC
 * arg[1] = minC
 * arg[2] = loopBranchConditionCode
 */
#define P_GPR_1 PhysicalReg_EBX
#define P_GPR_2 PhysicalReg_ECX
static void genHoistedChecksForCountUpLoop(CompilationUnit *cUnit, MIR *mir)
{
    /*
     * NOTE: these synthesized blocks don't have ssa names assigned
     * for Dalvik registers.  However, because they dominate the following
     * blocks we can simply use the Dalvik name w/ subscript 0 as the
     * ssa name.
     */
    DecodedInstruction *dInsn = &mir->dalvikInsn;
    const int maxC = dInsn->arg[0];

    /* assign array in virtual register to P_GPR_1 */
    get_virtual_reg(mir->dalvikInsn.vA, OpndSize_32, P_GPR_1, true);
    /* assign loop end condition in virtual register to P_GPR_2 */
    get_virtual_reg(mir->dalvikInsn.vC, OpndSize_32, P_GPR_2, true);
    export_pc();
    compare_imm_reg(OpndSize_32, 0, P_GPR_1, true);
    condJumpToBasicBlock(stream, Condition_E, cUnit->exceptionBlockId);
    int delta = maxC;
    /*
     * If the loop end condition is ">=" instead of ">", then the largest value
     * of the index is "endCondition - 1".
     */
    if (dInsn->arg[2] == OP_IF_GE) {
        delta--;
    }

    if (delta < 0) { //+delta
        //if P_GPR_2 is mapped to a VR, we can't do this
        alu_binary_imm_reg(OpndSize_32, sub_opc, -delta, P_GPR_2, true);
    } else if(delta > 0) {
        alu_binary_imm_reg(OpndSize_32, add_opc, delta, P_GPR_2, true);
    }
    compare_mem_reg(OpndSize_32, offArrayObject_length, P_GPR_1, true, P_GPR_2, true);
    condJumpToBasicBlock(stream, Condition_NC, cUnit->exceptionBlockId);
}

/*
 * vA = arrayReg;
 * vB = idxReg;
 * vC = endConditionReg;
 * arg[0] = maxC
 * arg[1] = minC
 * arg[2] = loopBranchConditionCode
 */
static void genHoistedChecksForCountDownLoop(CompilationUnit *cUnit, MIR *mir)
{
    DecodedInstruction *dInsn = &mir->dalvikInsn;
    const int maxC = dInsn->arg[0];

    /* assign array in virtual register to P_GPR_1 */
    get_virtual_reg(mir->dalvikInsn.vA, OpndSize_32, P_GPR_1, true);
    /* assign index in virtual register to P_GPR_2 */
    get_virtual_reg(mir->dalvikInsn.vB, OpndSize_32, P_GPR_2, true);
    export_pc();
    compare_imm_reg(OpndSize_32, 0, P_GPR_1, true);
    condJumpToBasicBlock(stream, Condition_E, cUnit->exceptionBlockId);

    if (maxC < 0) {
        //if P_GPR_2 is mapped to a VR, we can't do this
        alu_binary_imm_reg(OpndSize_32, sub_opc, -maxC, P_GPR_2, true);
    } else if(maxC > 0) {
        alu_binary_imm_reg(OpndSize_32, add_opc, maxC, P_GPR_2, true);
    }
    compare_mem_reg(OpndSize_32, offArrayObject_length, P_GPR_1, true, P_GPR_2, true);
    condJumpToBasicBlock(stream, Condition_NC, cUnit->exceptionBlockId);

}
#undef P_GPR_1
#undef P_GPR_2

/*
 * vA = idxReg;
 * vB = minC;
 */
#define P_GPR_1 PhysicalReg_ECX
static void genHoistedLowerBoundCheck(CompilationUnit *cUnit, MIR *mir)
{
    DecodedInstruction *dInsn = &mir->dalvikInsn;
    const int minC = dInsn->vB;
    get_virtual_reg(mir->dalvikInsn.vA, OpndSize_32, P_GPR_1, true); //index
    export_pc();
    compare_imm_reg(OpndSize_32, -minC, P_GPR_1, true);
    condJumpToBasicBlock(stream, Condition_C, cUnit->exceptionBlockId);
}
#undef P_GPR_1

#ifdef WITH_JIT_INLINING
static void genValidationForPredictedInline(CompilationUnit *cUnit, MIR *mir)
{
    CallsiteInfo *callsiteInfo = mir->meta.callsiteInfo;
    if(gDvm.executionMode == kExecutionModeNcgO0) {
        get_virtual_reg(mir->dalvikInsn.vC, OpndSize_32, PhysicalReg_EBX, true);
        move_imm_to_reg(OpndSize_32, (int) callsiteInfo->clazz, PhysicalReg_ECX, true);
        compare_imm_reg(OpndSize_32, 0, PhysicalReg_EBX, true);
        export_pc(); //use %edx
        conditional_jump_global_API(Condition_E, "common_errNullObject", false);
        move_mem_to_reg(OpndSize_32, offObject_clazz, PhysicalReg_EBX, true, PhysicalReg_EAX, true);
        compare_reg_reg(PhysicalReg_ECX, true, PhysicalReg_EAX, true);
    } else {
        get_virtual_reg(mir->dalvikInsn.vC, OpndSize_32, 5, false);
        move_imm_to_reg(OpndSize_32, (int) callsiteInfo->clazz, 4, false);
        nullCheck(5, false, 1, mir->dalvikInsn.vC);
        move_mem_to_reg(OpndSize_32, offObject_clazz, 5, false, 6, false);
        compare_reg_reg(4, false, 6, false);
    }

    //immediate will be updated later in genLandingPadForMispredictedCallee
    streamMisPred = stream;
    callsiteInfo->misPredBranchOver = (LIR*)conditional_jump_int(Condition_NE, 0, OpndSize_8);
}
#endif

/* Extended MIR instructions like PHI */
void handleExtendedMIR(CompilationUnit *cUnit, MIR *mir)
{
    ExecutionMode origMode = gDvm.executionMode;
    gDvm.executionMode = kExecutionModeNcgO0;
    switch ((ExtendedMIROpcode)mir->dalvikInsn.opcode) {
        case kMirOpPhi: {
            break;
        }
        case kMirOpNullNRangeUpCheck: {
            genHoistedChecksForCountUpLoop(cUnit, mir);
            break;
        }
        case kMirOpNullNRangeDownCheck: {
            genHoistedChecksForCountDownLoop(cUnit, mir);
            break;
        }
        case kMirOpLowerBound: {
            genHoistedLowerBoundCheck(cUnit, mir);
            break;
        }
        case kMirOpPunt: {
            break;
        }
#ifdef WITH_JIT_INLINING
        case kMirOpCheckInlinePrediction: { //handled in ncg_o1_data.c
            genValidationForPredictedInline(cUnit, mir);
            break;
        }
#endif
        default:
            break;
    }
    gDvm.executionMode = origMode;
}

static void setupLoopEntryBlock(CompilationUnit *cUnit, BasicBlock *entry,
                                int bodyId)
{
    /*
     * Next, create two branches - one branch over to the loop body and the
     * other branch to the PCR cell to punt.
     */
    //LowOp* branchToBody = jumpToBasicBlock(stream, bodyId);
    //setupResourceMasks(branchToBody);
    //cUnit->loopAnalysis->branchToBody = ((LIR*)branchToBody);

#if 0
    LowOp *branchToPCR = dvmCompilerNew(sizeof(ArmLIR), true);
    branchToPCR->opCode = kThumbBUncond;
    branchToPCR->generic.target = (LIR *) pcrLabel;
    setupResourceMasks(branchToPCR);
    cUnit->loopAnalysis->branchToPCR = (LIR *) branchToPCR;
#endif
}

/* Check whether we can merge a block ending with an unconditional goto with its taken block */
bool mergeBlock(BasicBlock *bb) {
    if(bb->blockType == kDalvikByteCode &&
       bb->firstMIRInsn != NULL &&
       (bb->lastMIRInsn->dalvikInsn.opcode == OP_GOTO_16 ||
        bb->lastMIRInsn->dalvikInsn.opcode == OP_GOTO ||
        bb->lastMIRInsn->dalvikInsn.opcode == OP_GOTO_32) &&
       bb->fallThrough == NULL) {// &&
       //cUnit->hasLoop) {
        //ALOGI("merge blocks ending with goto at index %d", i);
        MIR* prevInsn = bb->lastMIRInsn->prev;
        if(bb->taken == NULL) return false;
        MIR* mergeInsn = bb->taken->firstMIRInsn;
        if(mergeInsn == NULL) return false;
        if(prevInsn == NULL) {//the block has a single instruction
            bb->firstMIRInsn = mergeInsn;
        } else {
            prevInsn->next = mergeInsn; //remove goto from the chain
        }
        mergeInsn->prev = prevInsn;
        bb->lastMIRInsn = bb->taken->lastMIRInsn;
        bb->taken->firstMIRInsn = NULL; //block being merged in
        bb->fallThrough = bb->taken->fallThrough;
        bb->taken = bb->taken->taken;
        return true;
    }
    return false;
}
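/*
 * mergeBlock() folds only one goto at a time, which is why the caller in
 * dvmCompilerMIR2LIR() below keeps calling it on the same block until it
 * returns false.
 */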

static int genTraceProfileEntry(CompilationUnit *cUnit)
{
    cUnit->headerSize = 6;
    if ((gDvmJit.profileMode == kTraceProfilingContinuous) ||
        (gDvmJit.profileMode == kTraceProfilingDisabled)) {
        return 12;
    } else {
        return 4;
    }

}

#define PRINT_BUFFER_LEN 1024
/* Print the code block in the code cache in the range [startAddr, endAddr)
 * in readable format.
 */
void printEmittedCodeBlock(unsigned char *startAddr, unsigned char *endAddr)
{
    char strbuf[PRINT_BUFFER_LEN];
    unsigned char *addr;
    unsigned char *next_addr;
    int n;

    if (gDvmJit.printBinary) {
        // print binary in bytes
        n = 0;
        for (addr = startAddr; addr < endAddr; addr++) {
            n += snprintf(&strbuf[n], PRINT_BUFFER_LEN-n, "0x%x, ", *addr);
            if (n > PRINT_BUFFER_LEN - 10) {
                ALOGD("## %s", strbuf);
                n = 0;
            }
        }
        if (n > 0)
            ALOGD("## %s", strbuf);
    }

    // print disassembled instructions
    addr = startAddr;
    while (addr < endAddr) {
        next_addr = reinterpret_cast<unsigned char*>
            (decoder_disassemble_instr(reinterpret_cast<char*>(addr),
                                       strbuf, PRINT_BUFFER_LEN));
        if (addr != next_addr) {
            ALOGD("**  %p: %s", addr, strbuf);
        } else {                // check whether this is nop padding
            if (addr[0] == 0x90) {
                ALOGD("**  %p: NOP (1 byte)", addr);
                next_addr += 1;
            } else if (addr[0] == 0x66 && addr[1] == 0x90) {
                ALOGD("**  %p: NOP (2 bytes)", addr);
                next_addr += 2;
            } else if (addr[0] == 0x0f && addr[1] == 0x1f && addr[2] == 0x00) {
                ALOGD("**  %p: NOP (3 bytes)", addr);
                next_addr += 3;
            } else {
                ALOGD("** unable to decode binary at %p", addr);
                break;
            }
        }
        addr = next_addr;
    }
}

/* 4 is the number of additional bytes needed for chaining information for a trace:
 * 2 bytes for the chaining cell count offset and 2 bytes for the chaining cell offset */
#define EXTRA_BYTES_FOR_CHAINING 4
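/*
 * Resulting layout of a trace in the code cache (written by
 * dvmCompilerMIR2LIR() below and read back by dvmJitUnchain() above):
 * the two u2 values at codeAddr - 4 and codeAddr - 2 hold the offsets, from
 * codeAddr, of the ChainCellCounts block and of the first chaining cell
 * respectively; codeAddr itself is 16-byte aligned and is followed by the
 * trace body, the chaining cells, padding to a 4-byte boundary, and finally
 * the ChainCellCounts.
 */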

/* Entry function to invoke the backend of the JIT compiler */
void dvmCompilerMIR2LIR(CompilationUnit *cUnit, JitTranslationInfo *info)
{
    dump_x86_inst = cUnit->printMe;
    /* Used to hold the labels of each block */
    LowOpBlockLabel *labelList =
        (LowOpBlockLabel *)dvmCompilerNew(sizeof(LowOpBlockLabel) * cUnit->numBlocks, true); //Utility.c
    LowOp *headLIR = NULL;
    GrowableList chainingListByType[kChainingCellLast];
    unsigned int i, padding;

    /*
     * Initialize the chaining lists, one per chaining cell type.
     */
    for (i = 0; i < kChainingCellLast; i++) {
        dvmInitGrowableList(&chainingListByType[i], 2);
    }

    /* Clear the visited flag for each block */
    dvmCompilerDataFlowAnalysisDispatcher(cUnit, dvmCompilerClearVisitedFlag,
                                          kAllNodes, false /* isIterative */);

    GrowableListIterator iterator;
    dvmGrowableListIteratorInit(&cUnit->blockList, &iterator);

    /* Traces start with a profiling entry point.  Generate it here */
    cUnit->profileCodeSize = genTraceProfileEntry(cUnit);

    //BasicBlock **blockList = cUnit->blockList;
    GrowableList *blockList = &cUnit->blockList;
    BasicBlock *bb;

    info->codeAddress = NULL;
    stream = (char*)gDvmJit.codeCache + gDvmJit.codeCacheByteUsed;

    // TODO: compile into a temporary buffer and then copy into the code cache.
    // That would let us leave the code cache unprotected for a shorter time.
    size_t unprotected_code_cache_bytes =
            gDvmJit.codeCacheSize - gDvmJit.codeCacheByteUsed - CODE_CACHE_PADDING;
    UNPROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);

    streamStart = stream; /* trace start before alignment */
    stream += EXTRA_BYTES_FOR_CHAINING; /* This is needed for chaining. Add the bytes before the alignment */
    stream = (char*)(((unsigned int)stream + 0xF) & ~0xF); /* Align trace to 16 bytes */
    streamMethodStart = stream; /* code start */
    for (i = 0; i < ((unsigned int) cUnit->numBlocks); i++) {
        labelList[i].lop.generic.offset = -1;
    }
    cUnit->exceptionBlockId = -1;
    for (i = 0; i < blockList->numUsed; i++) {
        bb = (BasicBlock *) blockList->elemList[i];
        if(bb->blockType == kExceptionHandling)
            cUnit->exceptionBlockId = i;
    }
    startOfTrace(cUnit->method, labelList, cUnit->exceptionBlockId, cUnit);
    if(gDvm.executionMode == kExecutionModeNcgO1) {
        //merge blocks ending with "goto" with the fall through block
        if (cUnit->jitMode != kJitLoop)
            for (i = 0; i < blockList->numUsed; i++) {
                bb = (BasicBlock *) blockList->elemList[i];
                bool merged = mergeBlock(bb);
                while(merged) merged = mergeBlock(bb);
            }
        for (i = 0; i < blockList->numUsed; i++) {
            bb = (BasicBlock *) blockList->elemList[i];
            if(bb->blockType == kDalvikByteCode &&
               bb->firstMIRInsn != NULL) {
                preprocessingBB(bb);
            }
        }
        preprocessingTrace();
    }

    /* Handle the content in each basic block */
    for (i = 0; ; i++) {
        MIR *mir;
        bb = (BasicBlock *) dvmGrowableListIteratorNext(&iterator);
        if (bb == NULL) break;
        if (bb->visited == true) continue;

        labelList[i].immOpnd.value = bb->startOffset;

        if (bb->blockType >= kChainingCellLast) {
            /*
             * Append the label pseudo LIR first. Chaining cells will be handled
             * separately afterwards.
             */
            dvmCompilerAppendLIR(cUnit, (LIR *) &labelList[i]);
        }

        if (bb->blockType == kEntryBlock) {
            labelList[i].lop.opCode2 = ATOM_PSEUDO_ENTRY_BLOCK;
            if (bb->firstMIRInsn == NULL) {
                continue;
            } else {
              setupLoopEntryBlock(cUnit, bb, bb->fallThrough->id);
                                  //&labelList[blockList[i]->fallThrough->id]);
            }
        } else if (bb->blockType == kExitBlock) {
            labelList[i].lop.opCode2 = ATOM_PSEUDO_EXIT_BLOCK;
            labelList[i].lop.generic.offset = (stream - streamMethodStart);
            goto gen_fallthrough;
        } else if (bb->blockType == kDalvikByteCode) {
            if (bb->hidden == true) continue;
            labelList[i].lop.opCode2 = ATOM_PSEUDO_NORMAL_BLOCK_LABEL;
            /* Reset the register state */
#if 0
            resetRegisterScoreboard(cUnit);
#endif
        } else {
            switch (bb->blockType) {
                case kChainingCellNormal:
                    labelList[i].lop.opCode2 = ATOM_PSEUDO_CHAINING_CELL_NORMAL;
                    /* handle the codegen later */
                    dvmInsertGrowableList(
                        &chainingListByType[kChainingCellNormal], i);
                    break;
                case kChainingCellInvokeSingleton:
                    labelList[i].lop.opCode2 =
                        ATOM_PSEUDO_CHAINING_CELL_INVOKE_SINGLETON;
                    labelList[i].immOpnd.value =
                        (int) bb->containingMethod;
                    /* handle the codegen later */
                    dvmInsertGrowableList(
                        &chainingListByType[kChainingCellInvokeSingleton], i);
                    break;
                case kChainingCellInvokePredicted:
                    labelList[i].lop.opCode2 =
                        ATOM_PSEUDO_CHAINING_CELL_INVOKE_PREDICTED;
                    /*
                     * Move the cached method pointer from operand 1 to 0.
                     * Operand 0 was clobbered earlier in this routine to store
                     * the block starting offset, which is not applicable to
                     * a predicted chaining cell.
                     */
                    //TODO
                    //labelList[i].operands[0] = labelList[i].operands[1];

                    /* handle the codegen later */
                    dvmInsertGrowableList(
                        &chainingListByType[kChainingCellInvokePredicted], i);
                    break;
                case kChainingCellHot:
                    labelList[i].lop.opCode2 =
                        ATOM_PSEUDO_CHAINING_CELL_HOT;
                    /* handle the codegen later */
                    dvmInsertGrowableList(
                        &chainingListByType[kChainingCellHot], i);
                    break;
                case kPCReconstruction:
                    /* Make sure exception handling block is next */
                    labelList[i].lop.opCode2 =
                        ATOM_PSEUDO_PC_RECONSTRUCTION_BLOCK_LABEL;
                    //assert (i == cUnit->numBlocks - 2);
                    labelList[i].lop.generic.offset = (stream - streamMethodStart);
                    handlePCReconstruction(cUnit,
                                           &labelList[cUnit->puntBlock->id]);
                    break;
                case kExceptionHandling:
                    labelList[i].lop.opCode2 = ATOM_PSEUDO_EH_BLOCK_LABEL;
                    labelList[i].lop.generic.offset = (stream - streamMethodStart);
                    //if (cUnit->pcReconstructionList.numUsed) {
                        scratchRegs[0] = PhysicalReg_EAX;
                        jumpToInterpPunt();
                        //call_dvmJitToInterpPunt();
                    //}
                    break;
                case kChainingCellBackwardBranch:
                    labelList[i].lop.opCode2 = ATOM_PSEUDO_CHAINING_CELL_BACKWARD_BRANCH;
                    /* handle the codegen later */
                    dvmInsertGrowableList(
                        &chainingListByType[kChainingCellBackwardBranch],
                        i);
                    break;
                default:
                    break;
            }
            continue;
        }
        {
        //LowOp *headLIR = NULL;
        const DexCode *dexCode = dvmGetMethodCode(cUnit->method);
        const u2 *startCodePtr = dexCode->insns;
        const u2 *codePtr;
        labelList[i].lop.generic.offset = (stream - streamMethodStart);
        ALOGV("get ready to handle JIT bb %d type %d hidden %d",
              bb->id, bb->blockType, bb->hidden);
        for (BasicBlock *nextBB = bb; nextBB != NULL; nextBB = cUnit->nextCodegenBlock) {
            bb = nextBB;
            bb->visited = true;
            cUnit->nextCodegenBlock = NULL;

        if(gDvm.executionMode == kExecutionModeNcgO1 &&
           bb->blockType != kEntryBlock &&
           bb->firstMIRInsn != NULL) {
            startOfBasicBlock(bb);
            int cg_ret = codeGenBasicBlockJit(cUnit->method, bb);
            endOfBasicBlock(bb);
            if(cg_ret < 0) {
                endOfTrace(true/*freeOnly*/);
                cUnit->baseAddr = NULL;
                PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);
                return;
            }
        } else {
        for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
            startOfBasicBlock(bb); //why here for O0
            Opcode dalvikOpCode = mir->dalvikInsn.opcode;
            if((int)dalvikOpCode >= (int)kMirOpFirst) {
                handleExtendedMIR(cUnit, mir);
                continue;
            }
            InstructionFormat dalvikFormat =
                dexGetFormatFromOpcode(dalvikOpCode);
            ALOGV("ready to handle bytecode at offset %x: opcode %d format %d",
                  mir->offset, dalvikOpCode, dalvikFormat);
            LowOpImm *boundaryLIR = dump_special(ATOM_PSEUDO_DALVIK_BYTECODE_BOUNDARY, mir->offset);
            /* Remember the first LIR for this block */
            if (headLIR == NULL) {
                headLIR = (LowOp*)boundaryLIR;
            }
            bool notHandled = true;
            /*
             * Debugging: screen the opcode first to see if it is in the
             * do[-not]-compile list
             */
            bool singleStepMe =
                gDvmJit.includeSelectedOp !=
                ((gDvmJit.opList[dalvikOpCode >> 3] &
                  (1 << (dalvikOpCode & 0x7))) !=
                 0);
            if (singleStepMe || cUnit->allSingleStep) {
            } else {
                codePtr = startCodePtr + mir->offset;
                //lower each byte code, update LIR
                notHandled = lowerByteCodeJit(cUnit->method, cUnit->method->insns+mir->offset, mir);
                if(gDvmJit.codeCacheByteUsed + (stream - streamStart) +
                   CODE_CACHE_PADDING > gDvmJit.codeCacheSize) {
                    ALOGI("JIT code cache full after lowerByteCodeJit (trace uses %uB)", (stream - streamStart));
                    gDvmJit.codeCacheFull = true;
                    cUnit->baseAddr = NULL;
                    endOfTrace(true/*freeOnly*/);
                    PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);
                    return;
                }
            }
            if (notHandled) {
                ALOGE("%#06x: Opcode 0x%x (%s) / Fmt %d not handled",
                     mir->offset,
                     dalvikOpCode, dexGetOpcodeName(dalvikOpCode),
                     dalvikFormat);
                dvmAbort();
                break;
            }
        } // end for
        } // end else //JIT + O0 code generator
        }
        } // end for
        /* Eliminate redundant loads/stores and delay stores into later slots */
#if 0
        dvmCompilerApplyLocalOptimizations(cUnit, (LIR *) headLIR,
                                           cUnit->lastLIRInsn);
#endif
        if (headLIR) headLIR = NULL;
gen_fallthrough:
        /*
         * Check if the block is terminated due to trace length constraint -
         * insert an unconditional branch to the chaining cell.
         */
        if (bb->needFallThroughBranch) {
            jumpToBasicBlock(stream, bb->fallThrough->id);
        }

    }

    char* streamChainingStart = (char*)stream;
    /* Handle the chaining cells in predefined order */
    for (i = 0; i < kChainingCellGap; i++) {
        size_t j;
        int *blockIdList = (int *) chainingListByType[i].elemList;

        cUnit->numChainingCells[i] = chainingListByType[i].numUsed;

        /* No chaining cells of this type */
        if (cUnit->numChainingCells[i] == 0)
            continue;

        /* Record the first LIR for a new type of chaining cell */
        cUnit->firstChainingLIR[i] = (LIR *) &labelList[blockIdList[0]];
        for (j = 0; j < chainingListByType[i].numUsed; j++) {
            int blockId = blockIdList[j];
            BasicBlock *chainingBlock =
                (BasicBlock *) dvmGrowableListGetElement(&cUnit->blockList,
                                                         blockId);

            labelList[blockId].lop.generic.offset = (stream - streamMethodStart);

            /* Align this chaining cell first */
#if 0
            newLIR0(cUnit, ATOM_PSEUDO_ALIGN4);
#endif
            /* Insert the pseudo chaining instruction */
            dvmCompilerAppendLIR(cUnit, (LIR *) &labelList[blockId]);


            switch (chainingBlock->blockType) {
                case kChainingCellNormal:
                    handleNormalChainingCell(cUnit,
                     chainingBlock->startOffset, blockId, labelList);
                    break;
                case kChainingCellInvokeSingleton:
                    handleInvokeSingletonChainingCell(cUnit,
                        chainingBlock->containingMethod, blockId, labelList);
                    break;
                case kChainingCellInvokePredicted:
                    handleInvokePredictedChainingCell(cUnit, blockId);
                    break;
                case kChainingCellHot:
                    handleHotChainingCell(cUnit,
                        chainingBlock->startOffset, blockId, labelList);
                    break;
                case kChainingCellBackwardBranch:
                    handleBackwardBranchChainingCell(cUnit,
                        chainingBlock->startOffset, blockId, labelList);
                    break;
                default:
                    ALOGE("Bad blocktype %d", chainingBlock->blockType);
                    dvmAbort();
                    break;
            }

            if (gDvmJit.codeCacheByteUsed + (stream - streamStart) + CODE_CACHE_PADDING > gDvmJit.codeCacheSize) {
                ALOGI("JIT code cache full after ChainingCell (trace uses %uB)", (stream - streamStart));
                gDvmJit.codeCacheFull = true;
                cUnit->baseAddr = NULL;
                endOfTrace(true); /* need to free structures */
                PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);
                return;
            }
        }
    }
#if 0
    dvmCompilerApplyGlobalOptimizations(cUnit);
#endif
    endOfTrace(false);

    if (gDvmJit.codeCacheFull) {
        /* We hit the code cache size limit inside endOfTrace(false).
         * Bail out for this trace!
         */
        ALOGI("JIT code cache full after endOfTrace (trace uses %uB)", (stream - streamStart));
        cUnit->baseAddr = NULL;
        PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);
        return;
    }

    /* dump section for chaining cell counts, make sure it is 4-byte aligned */
    padding = (4 - ((u4)stream & 3)) & 3;
    stream += padding;
    ChainCellCounts chainCellCounts;
    /* Install the chaining cell counts */
    for (i=0; i< kChainingCellGap; i++) {
        chainCellCounts.u.count[i] = cUnit->numChainingCells[i];
    }
    char* streamCountStart = (char*)stream;
    memcpy((char*)stream, &chainCellCounts, sizeof(chainCellCounts));
    stream += sizeof(chainCellCounts);

    cUnit->baseAddr = streamMethodStart;
    cUnit->totalSize = (stream - streamStart);
    if(gDvmJit.codeCacheByteUsed + cUnit->totalSize + CODE_CACHE_PADDING > gDvmJit.codeCacheSize) {
        ALOGI("JIT code cache full after ChainingCellCounts (trace uses %uB)", (stream - streamStart));
        gDvmJit.codeCacheFull = true;
        cUnit->baseAddr = NULL;
        PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);
        return;
    }

    /* write the chaining cell count offset & chaining cell offset */
    u2* pOffset = (u2*)(streamMethodStart - EXTRA_BYTES_FOR_CHAINING); /* space was already allocated for this purpose */
    *pOffset = streamCountStart - streamMethodStart; /* from codeAddr */
    pOffset[1] = streamChainingStart - streamMethodStart;

    PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes);

    gDvmJit.codeCacheByteUsed += (stream - streamStart);
    if (cUnit->printMe) {
        unsigned char* codeBaseAddr = (unsigned char *) cUnit->baseAddr;
        unsigned char* codeBaseAddrNext = ((unsigned char *) gDvmJit.codeCache) + gDvmJit.codeCacheByteUsed;
        ALOGD("-------- Built trace for %s%s, JIT code [%p, %p) cache start %p",
              cUnit->method->clazz->descriptor, cUnit->method->name,
              codeBaseAddr, codeBaseAddrNext, gDvmJit.codeCache);
        ALOGD("** %s%s@0x%x:", cUnit->method->clazz->descriptor,
              cUnit->method->name, cUnit->traceDesc->trace[0].info.frag.startOffset);
        printEmittedCodeBlock(codeBaseAddr, codeBaseAddrNext);
    }
    ALOGV("JIT CODE after trace %p to %p size %x START %p", cUnit->baseAddr,
          (char *) gDvmJit.codeCache + gDvmJit.codeCacheByteUsed,
          cUnit->totalSize, gDvmJit.codeCache);

    gDvmJit.numCompilations++;

    info->codeAddress = (char*)cUnit->baseAddr;// + cUnit->headerSize;
}

/*
 * Perform translation chain operation.
 */
void* dvmJitChain(void* tgtAddr, u4* branchAddr)
{
#ifdef JIT_CHAIN
    int relOffset = (int) tgtAddr - (int)branchAddr;

    if ((gDvmJit.pProfTable != NULL) && (gDvm.sumThreadSuspendCount == 0) &&
        (gDvmJit.codeCacheFull == false)) {

        gDvmJit.translationChains++;

        //OpndSize immSize = estOpndSizeFromImm(relOffset);
        //relOffset -= getJmpCallInstSize(immSize, JmpCall_uncond);
        /* The jump operand size is hard-coded to 32 bits. This instruction will replace the
         * "jmp 0" in the original code sequence.
         */
        OpndSize immSize = OpndSize_32;
        relOffset -= 5;
        //can't use stream here since it is used by the compilation thread
        UNPROTECT_CODE_CACHE(branchAddr, sizeof(*branchAddr));
        dump_imm_with_codeaddr(Mnemonic_JMP, immSize, relOffset, (char*)branchAddr); //dump to branchAddr
        PROTECT_CODE_CACHE(branchAddr, sizeof(*branchAddr));

        gDvmJit.hasNewChain = true;

        COMPILER_TRACE_CHAINING(
            ALOGI("Jit Runtime: chaining 0x%x to %p with relOffset %x",
                  (int) branchAddr, tgtAddr, relOffset));
    }
#endif
    return tgtAddr;
}
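/*
 * The "relOffset -= 5" above accounts for the fixed 5-byte encoding of
 * "jmp rel32" (1 opcode byte + 4 displacement bytes): the displacement is
 * measured from the end of the instruction, so the value written is
 * tgtAddr - (branchAddr + 5). This patches the placeholder "jmp 0" emitted by
 * insertJumpHelp() in each chaining cell.
 */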

/*
 * Accept the work and start compiling.  Returns true if compilation
 * is attempted.
 */
bool dvmCompilerDoWork(CompilerWorkOrder *work)
{
    JitTraceDescription *desc;
    bool isCompile;
    bool success = true;

    if (gDvmJit.codeCacheFull) {
        return false;
    }

    switch (work->kind) {
        case kWorkOrderTrace:
            isCompile = true;
            /* Start compilation with maximally allowed trace length */
            desc = (JitTraceDescription *)work->info;
            success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
                                        work->bailPtr, 0 /* no hints */);
            break;
        case kWorkOrderTraceDebug: {
            bool oldPrintMe = gDvmJit.printMe;
            gDvmJit.printMe = true;
            isCompile = true;
            /* Start compilation with maximally allowed trace length */
            desc = (JitTraceDescription *)work->info;
            success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
                                        work->bailPtr, 0 /* no hints */);
            gDvmJit.printMe = oldPrintMe;
            break;
        }
        case kWorkOrderProfileMode:
            dvmJitChangeProfileMode((TraceProfilingModes)(int)work->info);
            isCompile = false;
            break;
        default:
            isCompile = false;
            ALOGE("Jit: unknown work order type");
            assert(0);  // Bail if debug build, discard otherwise
    }
    if (!success)
        work->result.codeAddress = NULL;
    return isCompile;
}

void dvmCompilerCacheFlush(long start, long end, long flags) {
  /* cacheflush is needed for ARM, but not for IA32 (coherent icache) */
}

//#endif
