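# Turns declarative dataset and training descriptions into development shells
# and Hydra jobs: dataset preparation, CPU test runs, permuted GPU training
# runs, and merged training outputs.
#
# Minimal usage sketch (hypothetical names, URL, hash and script):
#
#   {
#     presets.torch = true;
#     datasets.mnist.src = {
#       url = "https://example.org/mnist.zip";
#       hash = "sha256-AAAA...";
#     };
#     trainings.mnist-run = {
#       directoryPath = ./train;
#       copyDatasets = [ "mnist" ];
#       commands = "python train.py";
#       drop = [ "model.pt" ];
#       configurations = [ { model = "mnist-model"; epochs = [ 1 2 ]; } ];
#     };
#   }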
{ config, pkgs, pkgsNoCuda, lib, ... }: let
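
  # Submodule describing a single dataset: where it comes from (src) and how
  # it is prepared into the dataset.zip build product (prepare).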
  dataset = { lib, ... }: {
    options = {
      src = {
        url = lib.mkOption {
          description = "URL to download the dataset from";
          type = with lib.types; nullOr str;
          default = null;
          example = "https://huggingface.co/datasets/NousResearch/CharacterCodex/resolve/main/character_codex.json?download=true";
        };

        hash = lib.mkOption {
          description = "Hash of the dataset. Required when dataset or url is set.";
          type = with lib.types; nullOr str;
          default = null;
          example = "sha256-/HE4In/YBRYVdHXZKmqRIDWlAiUU0syWeWPJ+fwQcvk=";
        };

        path = lib.mkOption {
          description = "Local path to the dataset";
          type = with lib.types; nullOr path;
          default = null;
          example = "/path/to/dataset.zip";
        };

        dataset = lib.mkOption {
          description = "Hugging Face dataset name. Make sure to set presets.huggingface to true.";
          type = with lib.types; nullOr str;
          default = null;
          example = "HuggingFaceFW/fineweb";
        };
      };

      prepare = {
        GPU = lib.mkOption {
          description = "GPU to use. Leave empty for CPU; use 'any' for any GPU.";
          type = lib.types.str;
          default = "";
          example = "H100-1";
        };

        directoryPath = lib.mkOption {
          description = "Path to the directory with preparation scripts";
          type = with lib.types; nullOr path;
          default = null;
        };

        commands = lib.mkOption {
          description = "Commands to run to prepare the dataset";
          type = lib.types.str;
          default = "";
          example = ''
            unzip dataset.zip
          '';
        };

        drop = lib.mkOption {
          description = "Files to keep in the output archive after preparation";
          type = lib.types.listOf lib.types.str;
          default = [ ];
        };

        timeout = lib.mkOption {
          description = "Timeout in seconds for the dataset preparation";
          type = lib.types.ints.unsigned;
          default = 432000;
        };
      };
    };
  };
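
  # Submodule describing a single training: training scripts, datasets to copy
  # in, configurations to permute into individual runs, and what to keep from
  # the results.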
  training = { lib, ... }: {
    options = {
      GPU = lib.mkOption {
        description = "GPU to use. Leave empty for CPU; use 'any' for any GPU.";
        type = lib.types.str;
        default = "any";
        example = "H100-1";
      };

      directoryPath = lib.mkOption {
        description = "Path to the directory with training scripts";
        type = lib.types.path;
      };

      mergeDirectories = lib.mkOption {
        description = "Directories to merge from separate trainings";
        type = lib.types.listOf lib.types.str;
        default = [ ];
        example = [ "runs" ];
      };

      copyDatasets = lib.mkOption {
        description = "The datasets to make available in the training directory";
        type = lib.types.listOf lib.types.str;
        default = [ ];
      };

      commands = lib.mkOption {
        description = "Commands to run for the training";
        type = lib.types.str;
        default = ''
          unzip $dataset
        '';
      };

      drop = lib.mkOption {
        description = "Files to keep in the output archive after training";
        type = lib.types.listOf lib.types.str;
        default = [ ];
        example = [ "model.pt" ];
      };

      timeout = lib.mkOption {
        description = "Timeout in seconds for the training";
        type = lib.types.ints.unsigned;
        default = 432000;
      };

      configurations = lib.mkOption {
        description = "config.json configurations; list-valued attributes are permuted into one training run per combination";
        type = lib.types.listOf (lib.types.attrsOf lib.types.anything);
        default = [ ];
        example = [
          {
            model = "mnist-model";
            dataset = "mnist";
            layersA = [ 5 10 ];
            layersB = [ 5 10 ];
            epochs = 1;
          }
          {
            model = "mnist-model";
            dataset = "mnist";
            layersA = [ 1 15 ];
            layersB = [ 1 15 ];
            epochs = 1;
          }
        ];
      };

      testConfiguration = lib.mkOption {
        description = "config.json configuration for CPU testing";
        type = lib.types.attrsOf lib.types.anything;
        default = { };
        example = {
          model = "mnist-model";
          dataset = "mnist";
          layersA = 1;
          layersB = 1;
          epochs = 1;
        };
      };
    };
  };
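
  # Helpers for expanding training configurations: mkIDs tags each
  # configuration with a sequential id; permuteAttrs expands every list-valued
  # attribute into the cartesian product of its values.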
  mkIDs = list: lib.imap0 (i: cfg: { id = i; } // cfg) list;

  permuteAttrs = attrs: let
    filtered = lib.filterAttrs (n: v: (builtins.typeOf v) == "list") attrs;
  in
    map (val: val // builtins.removeAttrs attrs ((builtins.attrNames filtered) ++ [ "id" ])) (lib.cartesianProduct filtered);
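
  # For illustration (hypothetical attribute names):
  #   permuteAttrs { model = "m"; layers = [ 1 2 ]; }
  #   => [ { model = "m"; layers = 1; } { model = "m"; layers = 2; } ]
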
  pythonPackages = lib.optionals config.presets.torch [
    "numpy"
    "torch"
    "torch-tb-profiler"
    "torchinfo"
    "torchvision"
    "tensorboard"
  ] ++ lib.optionals config.presets.jupyter [
    "ipykernel"
    "jupyter"
    "notebook"
  ] ++ lib.optionals config.presets.datascience [
    "pandas"
    "scikit-learn"
    "seaborn"
    "matplotlib"
    "umap-learn"
    "plotly"
  ] ++ lib.optionals config.presets.huggingface [
    "transformers"
    "datasets"
  ] ++ config.pythonPackages;
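
  # Non-Python packages installed into every shell and build.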
  packages = [
    "wvls"
  ] ++ lib.optionals config.presets.torch [
    "gcc"
  ] ++ lib.optionals config.presets.jupyter [
    "jupyter"
  ] ++ config.packages;

  environmentVariables = { HF_HOME = ".cache/huggingface"; } // config.environmentVariables;
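
  # Runtime package sets for the GPU flavours of the shells and builds.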
  cudaPackages = (with pkgs; [
    cudatoolkit.out
    cudatoolkit.lib
    addOpenGLRunpath
    autoAddDriverRunpath
  ]) ++ (map (v: pkgs.cudaPackages.${v}) config.extraCudaPackages);

  # Qualify with pkgs.rocmPackages explicitly: under `with pkgs;` the bare name
  # rocmPackages would resolve to this let binding and recurse infinitely.
  rocmPackages = (with pkgs.rocmPackages; [
    clr
    miopen
    rocm-smi
    rocsparse
    rocsolver
    rocblas
    hipblas
    rocm-cmake
    hipfft
  ]) ++ (with pkgs; [
    addOpenGLRunpath
    autoAddDriverRunpath
  ]);

  zludaPackages = with pkgs; [
    zluda
    addOpenGLRunpath
    autoAddDriverRunpath
  ];
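
  # Build the Python environment and resolve package names against a given
  # package set (with or without CUDA).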
  pythonEnv = usePkgs: [
    ((if config.usePython311 then usePkgs.python311.withPackages else usePkgs.python3.withPackages)
      (ps: map (v: ps.${v}) pythonPackages))
  ];
  packaging = usePkgs: map (v: usePkgs.${v}) packages;

in {
  options = {
    flake = lib.mkOption {
      description = "Flake configuration";
      type = lib.types.attrsOf lib.types.anything;
      default = { };
    };

    usePython311 = lib.mkEnableOption "Use Python 3.11";

    trustRemoteCode = lib.mkOption {
      description = "Whether to trust remote code when loading Hugging Face datasets";
      type = lib.types.bool;
      default = true;
    };

    useServiceServers = lib.mkOption {
      description = "Whether to use Compute-Servers from Wavelens";
      type = lib.types.bool;
      default = true;
    };

    packages = lib.mkOption {
      description = "Packages to install";
      type = lib.types.listOf lib.types.str;
      default = [ ];
      example = [ "jq" ];
    };

    pythonPackages = lib.mkOption {
      description = "Python requirements to install";
      type = lib.types.listOf lib.types.str;
      default = [ ];
      example = [ "tqdm" ];
    };

    extraCudaPackages = lib.mkOption {
      description = "Extra CUDA packages to install on hydra";
      type = lib.types.listOf lib.types.str;
      default = [ "cudnn" "nccl" ];
    };

    environmentVariables = lib.mkOption {
      description = "Environment variables to set. Do not set HF_HOME when using Hugging Face datasets.";
      type = lib.types.attrsOf lib.types.str;
      default = { };
      example = { MY_ENV = "value"; };
    };

    presets = let
      mkEnableOption = name: lib.mkOption {
        description = "${name} preset";
        type = lib.types.bool;
        default = false;
      };
    in {
      datascience = mkEnableOption "Data Science";
      jupyter = mkEnableOption "Jupyter";
      torch = mkEnableOption "PyTorch";
      huggingface = mkEnableOption "Hugging Face";
    };

    datasets = lib.mkOption {
      description = "Datasets to prepare";
      type = lib.types.attrsOf (lib.types.submodule dataset);
      default = { };
    };

    trainings = lib.mkOption {
      description = "Trainings to run";
      type = lib.types.attrsOf (lib.types.submodule training);
      default = { };
    };
  };

  config = {
    flake = let
      forEachSystem = attr: builtins.listToAttrs (map (system: {
        name = system;
        value = attr;
      }) [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" ]);
    in {
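      # One shell per accelerator flavour: default (CPU, no CUDA), cuda, rocm
      # and zluda. The environment variables are merged into the mkShell
      # arguments so they are actually exported inside the shell.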
      devShells = forEachSystem {
        default = with pkgsNoCuda; mkShell ({
          buildInputs = [
            stdenv.cc.cc.lib
            pam
          ];

          packages = (pythonEnv pkgsNoCuda)
            ++ (packaging pkgsNoCuda);

          EXTRA_CCFLAGS = "-I/usr/include";
        } // environmentVariables);

        cuda = with pkgs; mkShell ({
          buildInputs = [
            stdenv.cc.cc.lib
            pam
          ];

          packages = (pythonEnv pkgs)
            ++ (packaging pkgs)
            ++ cudaPackages;

          CUDA_PATH = "${pkgs.cudatoolkit}";
          EXTRA_CCFLAGS = "-I/usr/include";
        } // environmentVariables);

        rocm = with pkgs; mkShell ({
          buildInputs = [
            stdenv.cc.cc.lib
            pam
          ];

          packages = (pythonEnv pkgs)
            ++ (packaging pkgs)
            ++ rocmPackages;

          EXTRA_CCFLAGS = "-I/usr/include";
        } // environmentVariables);

        zluda = with pkgs; mkShell ({
          buildInputs = [
            stdenv.cc.cc.lib
            pam
          ];

          packages = (pythonEnv pkgs)
            ++ (packaging pkgs)
            ++ zludaPackages;

          EXTRA_CCFLAGS = "-I/usr/include";
        } // environmentVariables);
      };
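
      # Hydra jobs: build the shells, prepare each dataset, smoke-test each
      # training on CPU, run the permuted training configurations, and merge
      # the results.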
      hydraJobs = {
        build.devShells.x86_64-linux.default = lib.hydraJob config.flake.devShells.x86_64-linux.default;
        build.devShells.x86_64-linux.cuda = lib.hydraJob config.flake.devShells.x86_64-linux.cuda;
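
        # One derivation per dataset: fetch the source (local path, Hugging
        # Face dataset or URL), run the preparation commands, and zip the
        # result as dataset.zip.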
        dataset = builtins.mapAttrs (na: cfg: pkgs.callPackage ({ runCommand, fetchurl, unzip, zip, util-linux }: runCommand "dataset" (let
          gpu = cfg.prepare.GPU != "";
        in {
          __contentAddressed = true;
          system = "x86_64-linux";
          requiredSystemFeatures = (lib.optionals gpu [ "cuda" ])
            ++ (lib.optional (config.useServiceServers && gpu) "service")
            ++ (lib.optional (gpu && cfg.prepare.GPU != "any") "gpu-${cfg.prepare.GPU}");

          meta = {
            inherit (cfg.prepare) timeout;
            maxSilent = cfg.prepare.timeout;
          };

          nativeBuildInputs = [
            (pythonEnv pkgs)
            util-linux
            unzip
            zip
          ] ++ (packaging pkgs)
            ++ lib.optionals gpu cudaPackages;

          CUDA_PATH = lib.optionalString gpu pkgs.cudatoolkit;
          EXTRA_CCFLAGS = "-I/usr/include";

          # Null is not a valid derivation attribute, so fall back to "".
          exec = lib.optionalString (cfg.prepare.directoryPath != null) cfg.prepare.directoryPath;

          dataset = if cfg.src.path != null then
            cfg.src.path
          else if cfg.src.dataset != null then
            pkgs.fetchgit {
              inherit (cfg.src) hash;
              url = "https://huggingface.co/datasets/${cfg.src.dataset}";
              deepClone = true;
              fetchLFS = true;
            }
          else
            fetchurl { inherit (cfg.src) url hash; };
        } // environmentVariables) (''
          mkdir -p $out/nix-support
          echo "file dataset $out/dataset.zip" >> $out/nix-support/hydra-build-products
        '' + (lib.optionalString (cfg.prepare.directoryPath != null) ''
          cp -r $exec/. ./
        '') + (lib.optionalString (cfg.src.dataset != null) ''
          # A double-quoted echo would strip the inner quotes from the Python
          # source, so write the script with a heredoc instead.
          cat > get_dataset.py <<EOF
          import requests
          from datasets import load_dataset

          API_URL = "https://datasets-server.huggingface.co/splits?dataset=$dataset"
          response = requests.get(API_URL).json()

          splits = [split["split"] for split in response["splits"]]

          dataset = load_dataset('$dataset', trust_remote_code=${ if config.trustRemoteCode then "True" else "False" }, split=splits)
          EOF

          python get_dataset.py
          echo "Dataset downloaded"
        '') + (lib.optionalString (cfg.src.path != null || cfg.src.url != null) ''
          export datasetName=$(echo $dataset | rev | cut -d'/' -f1 | rev | cut -d'-' -f2-)

          cp $dataset ./$datasetName
          chmod -R 777 .
        '') + (if (cfg.src.dataset != null && cfg.prepare.commands == "") then ''
          cp -r $dataset/. ./${na}
          zip -qr $out/dataset.zip ${na}
        '' else ''
          ${cfg.prepare.commands}

          zip -qr $out/dataset.zip ${ lib.concatStringsSep " " cfg.prepare.drop }
        ''))) { }) config.datasets;
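
        # CPU smoke test per training: run the training scripts once with the
        # cheap testConfiguration before any GPU time is spent.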
        test = lib.mapAttrs (na: cfg: (
          pkgs.callPackage ({ runCommand, fetchurl, unzip, zip }: runCommand "model-test" ({
            __contentAddressed = true;
            system = "x86_64-linux";
            meta = {
              inherit (cfg) timeout;
              maxSilent = cfg.timeout;
            };

            nativeBuildInputs = [
              (pythonEnv pkgs)
              unzip
              zip
            ] ++ (packaging pkgs);

            EXTRA_CCFLAGS = "-I/usr/include";

            conf = builtins.toFile "config.json" (builtins.toJSON (cfg.testConfiguration // { id = 0; }));
            exec = cfg.directoryPath;
          } // builtins.listToAttrs (map (data: {
            name = "dataset${data}";
            value = config.flake.hydraJobs.dataset.${data}.out;
          }) cfg.copyDatasets) // environmentVariables) ''
            mkdir -p $out/nix-support
            echo "file ai-model $out/train.zip" >> $out/nix-support/hydra-build-products

            cp -r $exec/. ./
            chmod -R 777 .
            rm -f ./config.json
            cp $conf ./config.json

            ${ lib.concatStringsSep "\n" (map (v: "unzip -q $dataset${v}/dataset.zip -d .") cfg.copyDatasets) }
            chmod -R 777 .

            ${cfg.commands}

            zip -qr $out/train.zip ${ lib.concatStringsSep " " cfg.drop } config.json
          '')
          { }
        )) (lib.filterAttrs (na: cfg: cfg.testConfiguration != { }) config.trainings);
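
        # One derivation per permuted training configuration. Tests (when
        # defined) are build inputs, so a training only starts once its CPU
        # test has passed.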
        train = lib.mapAttrs (na: cfg: builtins.listToAttrs (map (c: rec {
          name = "model-${builtins.toString c.id}";
          value = pkgs.callPackage ({ runCommand, fetchurl, unzip, zip }: runCommand name (let
            gpu = cfg.GPU != "";
          in {
            __contentAddressed = true;
            system = "x86_64-linux";
            requiredSystemFeatures = (lib.optionals gpu [ "cuda" ])
              ++ (lib.optional (config.useServiceServers && gpu) "service")
              ++ (lib.optional (gpu && cfg.GPU != "any") "gpu-${cfg.GPU}");

            meta = {
              inherit (cfg) timeout;
              maxSilent = cfg.timeout;
            };

            nativeBuildInputs = [
              (pythonEnv pkgs)
              unzip
              zip
            ] ++ (packaging pkgs)
              ++ lib.optionals gpu cudaPackages
              ++ lib.optional (cfg.testConfiguration != { }) config.flake.hydraJobs.test.${na};

            CUDA_PATH = lib.optionalString gpu pkgs.cudatoolkit;
            EXTRA_CCFLAGS = "-I/usr/include";

            conf = builtins.toFile "config.json" (builtins.toJSON c);
            exec = cfg.directoryPath;
          } // builtins.listToAttrs (map (data: {
            name = "dataset${data}";
            value = config.flake.hydraJobs.dataset.${data}.out;
          }) cfg.copyDatasets) // environmentVariables) ''
            mkdir -p $out/nix-support
            echo "file ai-model $out/train.zip" >> $out/nix-support/hydra-build-products

            cp -r $exec/. ./
            chmod -R 777 .
            rm -f ./config.json
            cp $conf ./config.json

            ${ lib.concatStringsSep "\n" (map (v: "unzip -q $dataset${v}/dataset.zip -d .") cfg.copyDatasets) }
            chmod -R 777 .

            python -c "import torch; print(f'CUDA AVAILABLE: {torch.cuda.is_available()}')"

            ${cfg.commands}

            zip -qr $out/train.zip ${ lib.concatStringsSep " " cfg.drop } config.json
          '')
          { };
        }) (mkIDs (builtins.concatMap permuteAttrs cfg.configurations)))) config.trainings;
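
        # For trainings that produced more than one model, merge the per-model
        # outputs and the mergeDirectories into a single zip.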
        train-combined = builtins.listToAttrs (map (name: {
          inherit name;
          value = pkgs.callPackage ({ stdenvNoCC, unzip, zip }: stdenvNoCC.mkDerivation {
            __contentAddressed = true;
            name = "${name}-models.zip";
            system = "x86_64-linux";

            nativeBuildInputs = [
              unzip
              zip
            ];

            models = toString (map (n: config.flake.hydraJobs.train.${name}.${n}.out) (builtins.attrNames config.flake.hydraJobs.train.${name}));

            buildCommand = ''
              mkdir -p $out/nix-support
              export img=$out/${name}-models.zip
              echo "file ai-model $img" >> $out/nix-support/hydra-build-products
              mkdir models
              for merges in ${ toString config.trainings.${name}.mergeDirectories }; do
                mkdir $merges
              done
              cd models
              for model in $models; do
                # split model name by - and take everything after the first element
                modeln=$(echo $model | cut -d'-' -f2-)
                mkdir $modeln
                cd $modeln
                unzip -q $model/train.zip
                cd ..
                for merges in ${ toString config.trainings.${name}.mergeDirectories }; do
                  cp -Tr $modeln/$merges/ $merges
                done
              done
              cd ..
              zip -qr $img models
            '';
          }) { };
        }) (builtins.filter (v: (builtins.length (builtins.attrNames config.flake.hydraJobs.train.${v})) > 1) (builtins.attrNames config.trainings)));
      };
    };
  };
}