/*
 * Decompiled with CFR 0.152.
 */
package com.nvidia.viper.analysis;

import com.nvidia.viper.ViperException;
import com.nvidia.viper.ViperExceptionHandler;
import com.nvidia.viper.ViperMessages;
import com.nvidia.viper.activity.CuDeviceAttribute;
import com.nvidia.viper.jni.CuCacheConfig;
import com.nvidia.viper.jni.CuException;
import com.nvidia.viper.jni.CudaOccupancyDeviceProp;
import com.nvidia.viper.jni.CudaOccupancyDeviceState;
import com.nvidia.viper.jni.CudaOccupancyFuncAttribute;
import com.nvidia.viper.jni.CudaOccupancyResult;
import com.nvidia.viper.jni.NativeCuda;
import com.nvidia.viper.model.ComputeLimits;
import com.nvidia.viper.model.TimelineDevice;
import com.nvidia.viper.model.TimelineIntervalKernel;
import java.util.HashMap;
import java.util.Map;

/**
 * Computes theoretical occupancy figures for one kernel launch on one device.
 *
 * <p>The calculation itself is delegated to
 * {@link NativeCuda#cudaOccupancyMaxActiveBlocksPerMultiprocessor}. When any required
 * input is missing, or the native call fails, no result is stored: the boxed getters
 * then return {@code null} and the primitive limit getters return {@code 0}.
 *
 * <p>Per-multiprocessor device limits are cached in static maps keyed by compute
 * capability. Those maps are plain {@link HashMap}s and are not synchronized here.
 */
public class OccupancyCalculator {
    /** Device occupancy (in percent) below which a launch may be flagged as grid-size limited. */
    static final double GRID_SIZE_LIMITED_THRESHOLD = 80.0;

    private int warpsPerBlock;
    private boolean isGridSizeLimited;
    private final TimelineIntervalKernel kernel;
    private final TimelineDevice deviceTimeline;
    /** Result of the native occupancy call; {@code null} when it was not (successfully) computed. */
    private CudaOccupancyResult occupancyResult;

    // Caches keyed by (major * 10 + minor) compute capability.
    private static final Map<Integer, Integer> deviceMaxSharedMemoryPerSM = new HashMap<Integer, Integer>();
    private static final Map<Integer, Integer> deviceMaxThreadsPerSM = new HashMap<Integer, Integer>();
    private static final Map<Integer, Integer> deviceMaxRegistersPerSM = new HashMap<Integer, Integer>();

    /**
     * Creates a calculator using the launch parameters recorded on {@code kernel}.
     *
     * @param kernel         kernel interval supplying the launch configuration
     * @param deviceTimeline device the kernel ran on
     * @param showError      whether a failure of the native occupancy call is reported
     *                       through {@link ViperExceptionHandler}
     */
    public OccupancyCalculator(TimelineIntervalKernel kernel, TimelineDevice deviceTimeline, boolean showError) {
        // Static shared memory must precede dynamic shared memory to match the parameter order below.
        this(kernel, deviceTimeline, kernel.getThreadsPerBlock(), kernel.getRegistersPerThread(), kernel.getStaticSharedMemory(), kernel.getDynamicSharedMemory(), kernel.getCudaOccPartitionedGCConfig(), kernel.getCudaOccSharedMemoryCarveoutSize(), showError);
    }

    /**
     * Creates a calculator using the launch parameters recorded on {@code kernel},
     * reporting native failures through {@link ViperExceptionHandler}.
     *
     * @param kernel         kernel interval supplying the launch configuration
     * @param deviceTimeline device the kernel ran on
     */
    public OccupancyCalculator(TimelineIntervalKernel kernel, TimelineDevice deviceTimeline) {
        this(kernel, deviceTimeline, kernel.getThreadsPerBlock(), kernel.getRegistersPerThread(), kernel.getStaticSharedMemory(), kernel.getDynamicSharedMemory(), kernel.getCudaOccPartitionedGCConfig(), kernel.getCudaOccSharedMemoryCarveoutSize(), true);
    }

    /**
     * Creates a calculator with explicitly supplied launch parameters.
     *
     * <p>No occupancy result is produced when {@code threadsPerBlock},
     * {@code registersPerThread}, {@code staticSharedMemoryUsage},
     * {@code dynamicSharedMemoryUsage} or the kernel's executed cache configuration is
     * {@code null}, when the device reports no shared-memory size for that cache
     * configuration, or when the native call throws {@link CuException}.
     *
     * @param kernel                   kernel interval (source of cache config and grid size)
     * @param deviceTimeline           device whose limits are used
     * @param threadsPerBlock          block size to evaluate
     * @param registersPerThread       registers per thread to evaluate
     * @param staticSharedMemoryUsage  static shared memory, passed as a function attribute
     * @param dynamicSharedMemoryUsage dynamic shared memory, passed to the native call
     * @param partitionedGCConfig      forwarded to {@link CudaOccupancyFuncAttribute}
     *                                 (NOTE(review): not null-checked here; confirm the callee accepts null)
     * @param sharedMemoryCarveout     forwarded to {@link CudaOccupancyDeviceState}
     *                                 (NOTE(review): not null-checked here; confirm the callee accepts null)
     * @param showError                whether a native failure is reported to the user
     */
    public OccupancyCalculator(TimelineIntervalKernel kernel, TimelineDevice deviceTimeline, Integer threadsPerBlock, Integer registersPerThread, Integer staticSharedMemoryUsage, Integer dynamicSharedMemoryUsage, Integer partitionedGCConfig, Integer sharedMemoryCarveout, boolean showError) {
        this.kernel = kernel;
        this.deviceTimeline = deviceTimeline;
        this.isGridSizeLimited = false;
        this.occupancyResult = null;

        // Device limits are read unconditionally; the per-SM lookups also populate the static caches.
        int computeMajor = deviceTimeline.getComputeCapabilityMajor();
        int computeMinor = deviceTimeline.getComputeCapabilityMinor();
        int maxBlockSize = deviceTimeline.getMaxThreadsPerBlock();
        int threadsPerSM = OccupancyCalculator.getMaxThreadsPerMultiprocessor(deviceTimeline);
        int threadsPerWarp = deviceTimeline.getNumThreadsPerWarp();
        int regsPerBlock = deviceTimeline.getMaxRegistersPerBlock();
        int regsPerSM = OccupancyCalculator.getMaxRegistersPerMultiprocessor(deviceTimeline);
        int sharedMemPerBlock = deviceTimeline.getMaxSharedMemoryPerBlock();
        int sharedMemPerSM = OccupancyCalculator.getMaxSharedMemoryPerMultiprocessor(deviceTimeline);

        if (threadsPerBlock == null || registersPerThread == null || staticSharedMemoryUsage == null || dynamicSharedMemoryUsage == null) {
            return;
        }
        CuCacheConfig ccExecuted = kernel.getCacheConfigExecuted();
        if (ccExecuted == null) {
            return;
        }

        // Round up: a partially filled warp still occupies a whole warp.
        this.warpsPerBlock = (int)Math.ceil((double)threadsPerBlock.intValue() / (double)threadsPerWarp);

        CudaOccupancyDeviceProp occupancyDeviceProp = new CudaOccupancyDeviceProp(computeMajor, computeMinor, maxBlockSize, threadsPerSM, regsPerBlock, regsPerSM, threadsPerWarp, sharedMemPerBlock, sharedMemPerSM, sharedMemPerSM, deviceTimeline.getNumMultiprocessors());
        Long smemExecuted = deviceTimeline.getMaxSharedMemory(ccExecuted);
        if (smemExecuted == null) {
            return;
        }
        CudaOccupancyFuncAttribute occupancyFuncAttributes = new CudaOccupancyFuncAttribute(threadsPerBlock, registersPerThread, staticSharedMemoryUsage, partitionedGCConfig, ccExecuted.getCode(), smemExecuted.intValue());
        CudaOccupancyDeviceState occupancyDeviceState = new CudaOccupancyDeviceState(ccExecuted.getCode(), sharedMemoryCarveout);
        try {
            this.occupancyResult = NativeCuda.cudaOccupancyMaxActiveBlocksPerMultiprocessor(occupancyDeviceProp, occupancyFuncAttributes, threadsPerBlock, dynamicSharedMemoryUsage, occupancyDeviceState);
        }
        catch (CuException exception) {
            if (showError) {
                ViperExceptionHandler.handle(ViperMessages.Occupancy_Fail_Title, ViperMessages.Occupancy_Fail_Message, exception);
            }
            return;
        }
        if (this.occupancyResult == null) {
            // Treat a missing native result like a failed computation instead of dereferencing it.
            return;
        }

        int activeBlocksPerDevice = this.occupancyResult.getActiveBlocksPerSM() * deviceTimeline.getNumMultiprocessors();
        if (activeBlocksPerDevice <= 0) {
            return;
        }
        // Number of full-device "waves" needed to run the grid (ceiling division).
        long waves = (kernel.getBlocksPerGrid() + (long)activeBlocksPerDevice - 1L) / (long)activeBlocksPerDevice;
        if (waves <= 0L) {
            return;
        }
        // Share of the block slots offered by those waves that the grid actually fills, in percent.
        double deviceOccupancy = 100.0 * (double)kernel.getBlocksPerGrid() / (double)(waves * (long)activeBlocksPerDevice);
        this.isGridSizeLimited = deviceOccupancy < GRID_SIZE_LIMITED_THRESHOLD && deviceOccupancy < this.occupancyResult.getOccupancy() * 100.0;
    }

    /**
     * @return the occupancy value reported by the native result, or {@code null} when no
     *         result is available
     */
    public Double getTheoreticOccupancy() {
        return this.occupancyResult == null ? null : Double.valueOf(this.occupancyResult.getOccupancy());
    }

    /**
     * Determines which resource limits occupancy.
     *
     * <p>Returns {@link Limiter#NONE} when the warps per SM already equal the device
     * maximum. Otherwise the limiting-factor bits of the result are checked in the order
     * register, shared memory, block; if none is set, {@link Limiter#BLOCK} is still
     * reported when the warp-derived block limit exceeds the device's maximum blocks per
     * multiprocessor.
     *
     * @return the limiter, or {@code null} when no result is available
     */
    public Limiter getLimiter() {
        if (this.occupancyResult == null) {
            return null;
        }
        if (this.occupancyResult.getWarpsPerSM() == this.deviceTimeline.getMaxWarpsPerMultiprocessor()) {
            return Limiter.NONE;
        }
        int limitingFactors = this.occupancyResult.getLimitingFactors();
        if (Limiter.REGISTER.isSet(limitingFactors)) {
            return Limiter.REGISTER;
        }
        if (Limiter.SHARED_MEMORY.isSet(limitingFactors)) {
            return Limiter.SHARED_MEMORY;
        }
        if (Limiter.BLOCK.isSet(limitingFactors)) {
            return Limiter.BLOCK;
        }
        if (this.occupancyResult.getBlockLimitWarps() > this.deviceTimeline.getMaxBlocksPerMultiprocessor()) {
            return Limiter.BLOCK;
        }
        return Limiter.NONE;
    }

    /** @return active blocks per SM, or {@code null} when no result is available */
    public Integer getBlocksPerSM() {
        return this.occupancyResult == null ? null : Integer.valueOf(this.occupancyResult.getActiveBlocksPerSM());
    }

    /** @return warps per SM, or {@code null} when no result is available */
    public Integer getWarpsPerSM() {
        return this.occupancyResult == null ? null : Integer.valueOf(this.occupancyResult.getWarpsPerSM());
    }

    /** @return whether the constructor flagged this launch as grid-size limited */
    public boolean isGridSizeLimited() {
        return this.isGridSizeLimited;
    }

    /** @return the warp-derived block limit of the result, or {@code 0} when no result is available */
    public int getBlockLimit() {
        return this.occupancyResult == null ? 0 : this.occupancyResult.getBlockLimitWarps();
    }

    /** @return the register-derived block limit of the result, or {@code 0} when no result is available */
    public int getRegisterLimit() {
        return this.occupancyResult == null ? 0 : this.occupancyResult.getBlockLimitRegisters();
    }

    /** @return the shared-memory-derived block limit of the result, or {@code 0} when no result is available */
    public int getSharedMemoryLimit() {
        return this.occupancyResult == null ? 0 : this.occupancyResult.getBlockLimitSharedMem();
    }

    /** @return threads per block divided by threads per warp, rounded up; {@code 0} if inputs were missing */
    public int getWarpsPerBlock() {
        return this.warpsPerBlock;
    }

    /** @return allocated registers per block from the result, or {@code 0} when no result is available */
    public int getRegistersPerBlock() {
        return this.occupancyResult == null ? 0 : this.occupancyResult.getAllocatedRegsPerBlock();
    }

    /** Looks up a cached value for the given compute capability; {@code null} when absent. */
    private static Integer getAttribute(Map<Integer, Integer> container, int major, int minor) {
        int computeCapability = major * 10 + minor;
        return container.get(computeCapability);
    }

    /** Stores a value in the cache under the given compute capability. */
    private static void setAttribute(Map<Integer, Integer> container, int major, int minor, int value) {
        int computeCapability = major * 10 + minor;
        container.put(computeCapability, value);
    }

    /**
     * Returns the maximum registers per multiprocessor for the device's compute capability.
     *
     * <p>The value comes from {@link ComputeLimits} when an entry exists for the compute
     * capability, otherwise from the device attribute; it is cached after the first lookup.
     *
     * @param deviceTimeline device to query
     * @return maximum registers per multiprocessor
     */
    public static int getMaxRegistersPerMultiprocessor(TimelineDevice deviceTimeline) {
        int major = deviceTimeline.getComputeCapabilityMajor();
        int minor = deviceTimeline.getComputeCapabilityMinor();
        Integer cached = OccupancyCalculator.getAttribute(deviceMaxRegistersPerSM, major, minor);
        if (cached != null) {
            return cached;
        }
        ComputeLimits computeLimits = ComputeLimits.getComputeLimits(major, minor);
        int maxRegistersPerSM = computeLimits != null
                ? computeLimits.regsPerSM
                : deviceTimeline.getAttributeValue(CuDeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR).intValue();
        OccupancyCalculator.setAttribute(deviceMaxRegistersPerSM, major, minor, maxRegistersPerSM);
        return maxRegistersPerSM;
    }

    /**
     * Returns the maximum threads per multiprocessor for the device's compute capability.
     *
     * <p>The value comes from {@link ComputeLimits} when an entry exists for the compute
     * capability, otherwise from the device attribute; it is cached after the first lookup.
     *
     * @param deviceTimeline device to query
     * @return maximum threads per multiprocessor
     */
    public static int getMaxThreadsPerMultiprocessor(TimelineDevice deviceTimeline) {
        int major = deviceTimeline.getComputeCapabilityMajor();
        int minor = deviceTimeline.getComputeCapabilityMinor();
        Integer cached = OccupancyCalculator.getAttribute(deviceMaxThreadsPerSM, major, minor);
        if (cached != null) {
            return cached;
        }
        ComputeLimits computeLimits = ComputeLimits.getComputeLimits(major, minor);
        int maxThreadsPerSM = computeLimits != null
                ? computeLimits.threadsPerSM
                : deviceTimeline.getAttributeValue(CuDeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR).intValue();
        OccupancyCalculator.setAttribute(deviceMaxThreadsPerSM, major, minor, maxThreadsPerSM);
        return maxThreadsPerSM;
    }

    /**
     * Returns the maximum shared memory per multiprocessor for the device's compute capability.
     *
     * <p>The value comes from {@link ComputeLimits} when an entry exists for the compute
     * capability, otherwise from the device attribute; it is cached after the first lookup.
     *
     * @param deviceTimeline device to query
     * @return maximum shared memory per multiprocessor
     */
    public static int getMaxSharedMemoryPerMultiprocessor(TimelineDevice deviceTimeline) {
        int major = deviceTimeline.getComputeCapabilityMajor();
        int minor = deviceTimeline.getComputeCapabilityMinor();
        Integer cached = OccupancyCalculator.getAttribute(deviceMaxSharedMemoryPerSM, major, minor);
        if (cached != null) {
            return cached;
        }
        ComputeLimits computeLimits = ComputeLimits.getComputeLimits(major, minor);
        int maxSharedMemoryPerSM = computeLimits != null
                ? computeLimits.sharedMemPerSM
                : deviceTimeline.getAttributeValue(CuDeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR).intValue();
        OccupancyCalculator.setAttribute(deviceMaxSharedMemoryPerSM, major, minor, maxSharedMemoryPerSM);
        return maxSharedMemoryPerSM;
    }

    /**
     * Recomputes warps per SM for a hypothetical block size, keeping the kernel's other parameters.
     *
     * @param blockSize threads per block to evaluate
     * @return warps per SM, or {@code null} when it could not be computed
     */
    public Integer getWarpsPerSMForBlockSize(int blockSize) {
        OccupancyCalculator calc = new OccupancyCalculator(this.kernel, this.deviceTimeline, blockSize, this.kernel.getRegistersPerThread(), this.kernel.getStaticSharedMemory(), this.kernel.getDynamicSharedMemory(), this.kernel.getCudaOccPartitionedGCConfig(), this.kernel.getCudaOccSharedMemoryCarveoutSize(), true);
        return calc.getWarpsPerSM();
    }

    /**
     * Recomputes warps per SM for a hypothetical register count, keeping the kernel's other parameters.
     *
     * @param registersPerThread registers per thread to evaluate
     * @return warps per SM, or {@code null} when it could not be computed
     */
    public Integer getWarpsPerSMForRegistersPerThread(int registersPerThread) {
        OccupancyCalculator calc = new OccupancyCalculator(this.kernel, this.deviceTimeline, this.kernel.getThreadsPerBlock(), registersPerThread, this.kernel.getStaticSharedMemory(), this.kernel.getDynamicSharedMemory(), this.kernel.getCudaOccPartitionedGCConfig(), this.kernel.getCudaOccSharedMemoryCarveoutSize(), true);
        return calc.getWarpsPerSM();
    }

    /**
     * Recomputes warps per SM for a hypothetical shared memory usage. The given amount is
     * passed as static shared memory with zero dynamic shared memory.
     *
     * @param sharedMemory shared memory per block to evaluate
     * @return warps per SM, or {@code null} when it could not be computed
     * @throws ViperException declared for callers; not thrown directly by this method body
     */
    public Integer getWarpsPerSMForSharedMemoryUsage(int sharedMemory) throws ViperException {
        OccupancyCalculator calc = new OccupancyCalculator(this.kernel, this.deviceTimeline, this.kernel.getThreadsPerBlock(), this.kernel.getRegistersPerThread(), sharedMemory, 0, this.kernel.getCudaOccPartitionedGCConfig(), this.kernel.getCudaOccSharedMemoryCarveoutSize(), true);
        return calc.getWarpsPerSM();
    }

    /** @return the device this calculator was created for */
    public TimelineDevice getDeviceTimeline() {
        return this.deviceTimeline;
    }

    /**
     * Resource that limits occupancy. Each constant carries a bit code that is tested
     * against the limiting-factor mask of the occupancy result, and a display label.
     */
    public static enum Limiter {
        NONE(0, "<none>"),
        BLOCK(1, "Block Size"),
        REGISTER(2, "Registers"),
        SHARED_MEMORY(4, "Shared Memory");

        private final int code;
        /** Display label; also returned by {@link #toString()}. */
        public String label;

        private Limiter(int code, String label) {
            this.code = code;
            this.label = label;
        }

        @Override
        public String toString() {
            return this.label;
        }

        /** @return {@code true} for every constant except {@link #NONE} */
        public boolean isKnownLimit() {
            return this != NONE;
        }

        /** Tests whether this limiter's bit is present in the given mask. */
        private boolean isSet(int limiter) {
            return (this.code & limiter) != 0;
        }
    }
}

