Machine learning in Julia: Digit recognizer

In this blog article I will process the CSV files from the Kaggle “Digit Recognizer” competition. The first challenge is to wrangle the data from the CSV file into a format that can be used as input for a Flux.jl neural network. The network itself is the LeNet example from the Flux.jl website, and it seems to work quite well.

The overall conclusion is that machine learning on image data is fairly straightforward in Julia. If you have a supported GPU (for example an Nvidia card with CUDA), it is also straightforward to run the neural network on it: you append "|> gpu" to move a model or an array to the GPU and "|> cpu" to bring it back. Not all operations are supported on the GPU, so if you want to do things like indexing into the data, you should first move the data back to the CPU.
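As a quick illustration of that workflow (a toy array, not part of the competition code):

using Flux, CUDA

x = rand(Float32, 28, 28)
x_gpu = x |> gpu              # becomes a no-op when no supported GPU is present

# scalar indexing on a CuArray triggers a warning or an error,
# so bring the data back to the CPU before inspecting single elements
x_cpu = x_gpu |> cpu
x_cpu[1, 1]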

using CSV, DataFrames, Flux, JLD2, CUDA, MLUtils, Statistics, Tables

struct Dataset
    x::Array{Float64, 4}
    y::Vector{Int64}
end
train_file = raw"C:\Git\juliacode\Data\digit-recognizer\train.csv"
test_file = raw"C:\Git\juliacode\Data\digit-recognizer\test.csv"
output_file = raw"C:\Git\juliacode\Data\digit-recognizer\output.csv"

function load_train_data(file)
    # load the data from the CSV file and split it into a train and a test set
    data = Tables.matrix(CSV.File(file))
    train, test = splitobs(data', at=0.8)
    train = train'
    test = test'

    # column 1 holds the label, columns 2:785 the 784 pixel values;
    # dividing by 256 scales the pixels to the range [0, 1)
    y_train = train[:, 1]
    train = train[:, 2:785] ./ 256
    # the input data for the Flux.jl network should have dimensions (28, 28, 1, number_of_rows)
    x_train = zeros(28, 28, size(train, 1))
    for i = 1:size(train, 1)
        x_train[:, :, i] = reshape(train[i, :], 28, 28)
    end
    x_train = reshape(x_train, 28, 28, 1, :)

    y_test = test[:, 1]
    test = test[:, 2:785] ./ 256
    x_test = zeros(28, 28, size(test, 1))
    for i = 1:size(test, 1)
        x_test[:, :, i] = reshape(test[i, :], 28, 28)
    end
    x_test = reshape(x_test, 28, 28, 1, :)

    train = Dataset(x_train, y_train)
    test = Dataset(x_test, y_test)
    train, test
end
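A note on the double transpose above: splitobs from MLUtils treats the last array dimension as the observation dimension, so the matrix is transposed before the split and transposed back afterwards. A toy example of that behavior:

A = reshape(1:20, 4, 5)       # five observations, one per column
a, b = splitobs(A, at=0.8)    # sizes (4, 4) and (4, 1)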

function load_kaggle_test_data(file)
    # the Kaggle test set has no label column: all 784 columns are pixel values
    data = Tables.matrix(CSV.File(file))
    data = data[:, :] ./ 256
    x = zeros(28, 28, size(data, 1))
    for i = 1:size(data, 1)
        x[:, :, i] = reshape(data[i, :], 28, 28)
    end
    x = reshape(x, 28, 28, 1, :)
    x
end

data_train, data_test = load_train_data(train_file);

After loading the data from the CSV file, we define a neural network that we will train on the MNIST data. In Python there are several packages for neural networks, such as PyTorch and TensorFlow; in Julia the most commonly used package is Flux.jl. We first create a loader that serves batches of data for training the neural network. I also print one image to validate that the data wrangling from CSV rows to images was successful.

function loader(data=data_train; batchsize::Int=64)
    yhot = Flux.onehotbatch(data.y, 0:9)  # make a OneHotMatrix
    Flux.DataLoader((data.x, yhot); batchsize, shuffle=true) |> gpu
end
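For reference, onehotbatch turns the label vector into a 10×N OneHotMatrix and onecold inverts it; a quick round trip on some toy labels:

labels = [0, 3, 9]
oh = Flux.onehotbatch(labels, 0:9)   # 10×3 OneHotMatrix
Flux.onecold(oh, 0:9)                # recovers [0, 3, 9]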

x1, y1 = first(loader()); 
println(size(x1))


using ImageCore, ImageInTerminal
image = x1[:, :, 1, 1]
# let's first print the digit the image represents and then show the image itself
imageY = (y1 |> cpu)[:, 1]
println(Flux.onecold(imageY) - 1)
image .|> Gray |> transpose |> cpu
(28, 28, 1, 64)
0

# define the Flux.jl network
lenet = Chain(
    Conv((5, 5), 1=>6, relu),
    MaxPool((2, 2)),
    Conv((5, 5), 6=>16, relu),
    MaxPool((2, 2)),
    Flux.flatten,
    Dense(256 => 120, relu),
    Dense(120 => 84, relu), 
    Dense(84 => 10),
) |> gpu
Chain(
  Conv((5, 5), 1 => 6, relu),           # 156 parameters
  MaxPool((2, 2)),
  Conv((5, 5), 6 => 16, relu),          # 2_416 parameters
  MaxPool((2, 2)),
  Flux.flatten,
  Dense(256 => 120, relu),              # 30_840 parameters
  Dense(120 => 84, relu),               # 10_164 parameters
  Dense(84 => 10),                      # 850 parameters
)                   # Total: 10 arrays, 44_426 parameters, 2.086 KiB.
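The 256 inputs of the first Dense layer follow from the convolution arithmetic: each 5×5 convolution (no padding) shrinks the image from 28 to 24 and from 12 to 8 pixels, each 2×2 max-pool halves the spatial size, and the resulting 4×4×16 feature map flattens to 4·4·16 = 256 values. If you don't want to trace the shapes by hand, Flux.outputsize can check this on a CPU copy of the convolutional stack:

Flux.outputsize(Chain(Conv((5, 5), 1 => 6, relu), MaxPool((2, 2)),
                      Conv((5, 5), 6 => 16, relu), MaxPool((2, 2))), (28, 28, 1, 1))
# (4, 4, 16, 1), i.e. 256 features after flatten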
# let's define a function that calculates the loss and accuracy

function loss_and_accuracy(model, data=data_train)
    (x1, y1) = only(loader(data, batchsize=Int(size(data.x, 4))))
    y1hat = model(x1)
    loss = Flux.logitcrossentropy(y1hat, y1)
    acc = round(100 * mean(Flux.onecold(y1hat) .== Flux.onecold(y1)); digits=2)
    (; loss, acc)
end

@show loss_and_accuracy(lenet);
loss_and_accuracy(lenet) = (loss = 2.3034132f0, acc = 15.6)
settings = (;
    eta = 3e-4,      # learning rate for Adam
    lambda = 1e-2,   # weight-decay coefficient
    batchsize = 128,
    epochs = 30,
)
train_log = []


opt_rule = OptimiserChain(WeightDecay(settings.lambda), Adam(settings.eta))
opt_state = Flux.setup(opt_rule, lenet);

for epoch in 1:settings.epochs
    for (x,y) in loader(batchsize=settings.batchsize)
        grads = Flux.gradient(m -> Flux.logitcrossentropy(m(x), y), lenet)
        Flux.update!(opt_state, lenet, grads[1])
    end

    # log every 5th epoch, starting with the first
    if epoch % 5 == 1
        loss, acc = loss_and_accuracy(lenet)
        test_loss, test_acc = loss_and_accuracy(lenet, data_test)
        @info "logging:" epoch acc test_acc
    end

end
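JLD2 was loaded at the top but has not been used yet; it is handy for checkpointing the trained network. A minimal sketch of how that could look (the file name lenet.jld2 is arbitrary):

# save: move the model to the CPU first, then store its state
jldsave("lenet.jld2"; model_state = Flux.state(lenet |> cpu))

# restore: rebuild the same architecture, then load the stored state into it
model_state = JLD2.load("lenet.jld2", "model_state")
Flux.loadmodel!(lenet, model_state)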
# let's compare the output of the trained network with the ground truth
y1hat = lenet(x1)
@show hcat(Flux.onecold(y1hat, 0:9), Flux.onecold(y1, 0:9))
hcat(Flux.onecold(y1hat, 0:9), Flux.onecold(y1, 0:9)) =
  [0 0; 8 8; 5 5; 2 2; 4 4; 8 8; 2 2; 6 6; 7 7; 9 9; 5 5; 5 5; 9 9; 1 1; 8 8; 9 4;
   3 3; 5 5; 2 7; 8 8; 1 1; 9 9; 6 6; 5 5; 7 7; 4 4; 2 2; 2 2; 1 1; 5 5; 7 7; 9 4;
   0 0; 6 6; 4 4; 6 6; 6 6; 5 5; 0 0; 3 3; 4 4; 9 5; 8 8; 7 7; 7 7; 9 9; 8 2; 4 4;
   6 6; 4 4; 0 0; 3 3; 0 0; 1 1; 3 3; 4 4; 9 9; 8 8; 3 3; 9 9; 7 7; 5 5; 8 8; 5 5]
64×2 CUDA.CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 0  0
 8  8
 5  5
 2  2
 4  4
 8  8
 2  2
 6  6
 7  7
 9  9
 ⋮  
 4  4
 9  9
 8  8
 3  3
 9  9
 7  7
 5  5
 8  8
 5  5

To check whether our trained network is any good, we can have it classify the test data provided by Kaggle and submit the predicted labels to the Kaggle website. For this we need to save the labels together with the ImageIds in a CSV file.

# compute the classification for the Kaggle test set and save it to a CSV file so it can be submitted to the Kaggle website
x_kaggle = load_kaggle_test_data(test_file);
x_kaggle = Float32.(x_kaggle) |> gpu
y_kaggle = lenet(x_kaggle);
column2 = Flux.onecold(y_kaggle, 0:9) |> cpu
column1 = collect(1:length(column2))
output = hcat(column1, column2)
df = DataFrame(output, ["ImageId", "Label"])
CSV.write(output_file, df)
"C:\Git\juliacode\Data\\digit-recognizer\output.csv"