MATLAB: Cannot utilize fully all GPUs during network training

deep learning, gpu, GPU Coder, machine learning, Parallel Computing Toolbox

It’s the performance and use of the resources installed on the Computer (Amazon Cloud EC2 instance in our case).

I am using a p3.8xlarge EC2 instance on Amazon Web Services – basically it means I am using 4 V100 GPUs.

I am training a neural network.

using:

mdl(i).Net = trainNetwork(trainData(:, :, :, 1: itStep: end), trainLabels(1: itStep: end, :), layers, options);

in options I define 'multi-gpu'

I also defined 'parallel' and tried to play with number of workers but all I see is just more processes waiting in the GPU queue on nvidia-smi.

For some reason I see that all GPUs are working (see GPU.png), but only for a limited amount of time (very high usage for 3 seconds, and then it drops to 0% for at least 10 seconds).

I looked at the htop information(htop.jpg), I see that not all threads of the CPU are in use so that is the bottleneck (I think?)

I have a xeon processor on this instance with 32 cores (16 physical, 32 logical)

When I try to utilize all threads through local profile (profile local pool.png) it seems like it still doesn’t respond .

I get more workers because of it (CPU?), but the GPU utilization still doesn't seem to improve.

Tried to increase batch size – but at some point the GPU is out of memory, so that's not the problem.

How do I utilize all cores of the CPU to transfer data to the GPUs?

I read somewhere that you can also load the data to the pool itself? will that help?

I use the MATLAB container from https://ngc.nvidia.com/catalog/containers/partners:matlab/tags (matlab:r2019a).

I scanned these already:

https://www.mathworks.com/help/deeplearning/ug/scale-up-deep-learning-with-multiple-gpus.html

https://www.mathworks.com/help/deeplearning/ug/deep-learning-with-matlab-on-multiple-gpus.html

https://www.mathworks.com/help/deeplearning/examples/train-network-using-automatic-multi-gpu-support.html

Would appreciate your help.

Tomer

Best Answer

Ok, problem solved.

As suggested by Joss and MathWorks, MathWorks Support created a custom datastore inheriting the properties involved in this procedure.

The labels are passed to the datastore constructor as a numeric vector, but they could be loaded from a .mat file as well.

For training, apparently I needed to use the 'parallel' execution environment and set the DispatchInBackground training option to 'true' (probably since I use the AWS cloud service).

classdef matFilesDatastore < matlab.io.Datastore & ...
        matlab.io.datastore.Shuffleable & ...
        matlab.io.datastore.Partitionable
    % matFilesDatastore   Datastore pairing MAT-file observations with labels.
    %
    %   ds = matFilesDatastore(folder, labels) wraps a fileDatastore over the
    %   files found in folder (subfolders included) and pairs the k-th file
    %   with labels(k). Each read returns a table with the variables
    %   'predictors' and 'responses' holding ReadSize observations.
    
    properties
        Datastore   % Underlying fileDatastore that reads one file per call
        Labels      % Labels, indexed by observation position (labels(k) <-> file k)
        ReadSize    % Number of observations returned by each call to read
    end
    
    properties(SetAccess = protected)
        NumObservations   % Number of files in the underlying fileDatastore
    end
    
    properties(Access = private)
        CurrentFileIndex  % 1-based index of the next observation to be read
    end
    
    methods
        function ds = matFilesDatastore(folder, labels)
            % ds = matFilesDatastore(folder, labels) creates a datastore
            % from the data in folder and labels.
            %
            %   folder - location handed to fileDatastore (searched
            %            recursively because IncludeSubfolders is true)
            %   labels - array indexed by observation position
            
            % Create file datastore; every file is read through readData.
            fds = fileDatastore(folder, ...
                'ReadFcn',@readData, ...
                'IncludeSubfolders',true);
            ds.Datastore = fds;
            
            numObservations = numel(fds.Files);
            
            % Labels.
            ds.Labels = labels;
            
            % Initialize datastore properties.
            ds.ReadSize = 1;
            ds.NumObservations = numObservations;
            ds.CurrentFileIndex = 1;
        end
        
        function tf = hasdata(ds)
            % tf = hasdata(ds) returns true if a full batch of ReadSize
            % observations is still available. A trailing partial batch
            % (fewer than ReadSize observations left) reports false.
            
            tf = ds.CurrentFileIndex + ds.ReadSize - 1 ...
                <= ds.NumObservations;
        end
        
        function [data,info] = read(ds)
            % [data,info] = read(ds) reads one mini-batch of data.
            %
            %   data - table with ReadSize rows and the variables
            %          'predictors' (file contents) and 'responses' (labels)
            %   info - empty struct
            
            miniBatchSize = ds.ReadSize;
            info = struct;
            
            % Preallocate instead of growing the cell arrays in the loop.
            % The variable names must stay 'predictors' and 'responses'
            % because table() derives its column names from them.
            predictors = cell(miniBatchSize,1);
            responses = cell(miniBatchSize,1);
            
            for i = 1:miniBatchSize
                predictors{i,1} = read(ds.Datastore);
                responses{i,1} = ds.Labels(ds.CurrentFileIndex);
                ds.CurrentFileIndex = ds.CurrentFileIndex + 1;
            end
            
            data = table(predictors,responses);
        end
        
        function reset(ds)
            % reset(ds) resets the datastore to the start of the data.
            reset(ds.Datastore);
            ds.CurrentFileIndex = 1;
        end
        
        function dsNew = shuffle(ds)
            % dsNew = shuffle(ds) returns a copy of the datastore whose
            % files and corresponding labels are permuted with the same
            % random permutation. The copy is positioned at the start of
            % the data; ds itself is not modified.
            
            % Create copy of datastore (deep-copy the inner fileDatastore
            % so that reordering its files does not affect ds).
            dsNew = copy(ds);
            dsNew.Datastore = copy(ds.Datastore);
            fds = dsNew.Datastore;
            
            % Shuffle files and corresponding labels.
            numObservations = dsNew.NumObservations;
            idx = randperm(numObservations);
            fds.Files = fds.Files(idx);
            dsNew.Labels = dsNew.Labels(idx);
            
            % The copy inherited CurrentFileIndex from ds. Rewind it
            % together with the file datastore so labels and files stay
            % aligned even when ds had already been partially read.
            reset(dsNew);
        end
        
        function subds = partition(ds, numPartitions, idx)
            % subds = partition(ds, numPartitions, idx) returns partition
            % idx of numPartitions as a new datastore, positioned at its
            % start. Labels are selected with pigeonHole so that they
            % follow the files assigned to the partition.
            subds = copy(ds);
            subds.Datastore = partition(ds.Datastore, numPartitions, idx);
            subds.NumObservations = numel(subds.Datastore.Files);
            indices = pigeonHole(idx, numPartitions, ds.NumObservations);
            subds.Labels = ds.Labels(indices);
            reset(subds);
        end
    end
    
    methods(Access = protected)
        function n = maxpartitions(ds)
            % n = maxpartitions(ds) returns the maximum number of
            % partitions: one per observation.
            n = ds.NumObservations;
        end
    end
    
    methods (Hidden = true)
        function frac = progress(ds)
            % frac = progress(ds) returns the fraction (0 to 1) of
            % observations already read from the datastore.
            
            frac = (ds.CurrentFileIndex - 1) / ds.NumObservations;
        end
    end
end
function data = readData(filename)
%readData   Load a MAT file and return its 'image' variable.
%   data = readData(filename) loads every variable stored in the MAT file
%   filename and hands back the one named 'image'. Any other variables in
%   the file (for example a stored label) are ignored.
matContents = load(filename);
data = matContents.image;
end
function observationIndices = pigeonHole(partitionIndex, numPartitions, numObservations)
%pigeonHole   Observation indices that belong to one partition.
%   observationIndices = pigeonHole(partitionIndex, numPartitions,
%   numObservations) assigns observation k (1-based) to partition
%   floor((k-1)*numPartitions/numObservations) + 1 and returns the indices
%   of the observations assigned to partitionIndex. When no observation
%   falls into that partition, a 0-by-1 empty double is returned.

    % Partition number owning each observation, in observation order.
    zeroBasedPositions = 0:numObservations - 1;
    owningPartition = floor(zeroBasedPositions * numPartitions / numObservations) + 1;

    % Keep only the observations owned by the requested partition.
    observationIndices = find(owningPartition == partitionIndex);

    % Normalize an empty result to a 0-by-1 column.
    if isempty(observationIndices)
        observationIndices = double.empty(0, 1);
    end
end

Best Answer

Related Solutions

MATLAB: Am I getting an error saying the Datastore is not shuffable

MATLAB: Video Labelling for Video Classification (LSTM)

Related Question