# nix-ai/mk-flake.nix

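# Flake module that turns declarative `datasets` and `trainings` options into
# development shells (CPU, CUDA, ROCm, ZLUDA) and Hydra jobs for dataset
# preparation, CPU smoke tests, GPU training sweeps, and merged model archives.
#
# Minimal usage sketch (hypothetical paths and values):
#
#   {
#     presets.torch = true;
#     datasets.mnist.src = {
#       url = "https://example.org/mnist.zip";   # hypothetical URL
#       hash = "sha256-...";                     # fill in the real hash
#     };
#     trainings.mnist = {
#       directoryPath = ./train;                 # directory with the training scripts
#       copyDatasets = [ "mnist" ];
#       commands = "python train.py";
#       drop = [ "model.pt" ];
#       configurations = [ { epochs = [ 1 2 ]; } ];  # list values fan out into a sweep
#     };
#   }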
{ config, pkgs, pkgsNoCuda, lib, ... }: let
dataset = { lib, ... }: {
options = {
src = {
url = lib.mkOption {
description = "URL to download the dataset from";
type = with lib.types; nullOr str;
default = null;
example = "https://huggingface.co/datasets/NousResearch/CharacterCodex/resolve/main/character_codex.json?download=true";
};
hash = lib.mkOption {
description = "Hash of the dataset. Needed when dataset or url is set.";
type = with lib.types; nullOr str;
default = null;
example = "sha256-/HE4In/YBRYVdHXZKmqRIDWlAiUU0syWeWPJ+fwQcvk=";
};
path = lib.mkOption {
description = "Path to the dataset";
type = with lib.types; nullOr path;
default = null;
example = "/path/to/dataset.zip";
};
dataset = lib.mkOption {
description = "Huggingface dataset name. Make sure to set presets.huggingface to true.";
type = with lib.types; nullOr str;
default = null;
example = "HuggingFaceFW/fineweb";
};
};
prepare = {
GPU = lib.mkOption {
description = "Choose the GPU to use. Leave empty for CPU. Use 'any' for any GPU";
type = lib.types.str;
default = "";
example = "H100-1";
};
directoryPath = lib.mkOption {
description = "Path to the directory with cleanup scripts";
type = with lib.types; nullOr path;
default = null;
};
commands = lib.mkOption {
description = "Commands to run to prepare the dataset";
type = lib.types.str;
default = "";
example = ''
unzip dataset.zip
'';
};
drop = lib.mkOption {
description = "Files to save after cleanup";
type = lib.types.listOf lib.types.str;
default = [ ];
};
timeout = lib.mkOption {
description = "Timeout for the dataset preparation";
type = lib.types.ints.unsigned;
default = 432000;
};
};
};
};
training = { lib, ... }: {
options = {
GPU = lib.mkOption {
description = "Choose the GPU to use. Leave empty for CPU. Use 'any' for any GPU";
type = lib.types.str;
default = "any";
example = "H100-1";
};
directoryPath = lib.mkOption {
description = "Path to the directory with cleanup scripts";
type = lib.types.path;
};
mergeDirectories = lib.mkOption {
description = "Directories to merge from seperate trainings";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [
"runs"
];
};
copyDatasets = lib.mkOption {
description = "The datasets to be available in the training directory";
type = lib.types.listOf lib.types.str;
default = [ ];
};
commands = lib.mkOption {
description = "Commands to run to prepare the dataset";
type = lib.types.str;
default = ''
unzip $dataset
'';
};
drop = lib.mkOption {
description = "Files to save after cleanup";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [
"model.pt"
];
};
timeout = lib.mkOption {
description = "Timeout for the dataset preparation";
type = lib.types.ints.unsigned;
default = 432000;
};
configurations = lib.mkOption {
description = "config.json configurations";
type = lib.types.listOf (lib.types.attrsOf lib.types.anything);
default = [ ];
example = [
{
model = "minst-model";
dataset = "mnist";
layersA = [5 10];
layersB = [5 10];
epochs = 1;
}
{
model = "minst-model";
dataset = "mnist";
layersA = [1 15];
layersB = [1 15];
epochs = 1;
}
];
};
testConfiguration = lib.mkOption {
description = "config.json configuration for cpu testing";
type = lib.types.attrsOf lib.types.anything;
default = { };
example = {
model = "minst-model";
dataset = "mnist";
layersA = 1;
layersB = 1;
epochs = 1;
};
};
};
};
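# mkIDs tags each configuration with a sequential id, e.g.
#   mkIDs [ { a = 1; } { a = 2; } ]  =>  [ { id = 0; a = 1; } { id = 1; a = 2; } ]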
mkIDs = list: lib.imap0 (i: cfg: { id = i; } // cfg) list;
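# permuteAttrs expands every list-valued attribute into all combinations,
# re-attaching the scalar attributes to each one, e.g.
#   permuteAttrs { model = "m"; epochs = [ 1 2 ]; }
#     =>  [ { model = "m"; epochs = 1; } { model = "m"; epochs = 2; } ]
# so a single entry in `configurations` fans out into a hyperparameter sweep.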
permuteAttrs = attrs: let
filtered = lib.filterAttrs (n: v: (builtins.typeOf v) == "list") attrs;
in
map (val: val // builtins.removeAttrs attrs ((builtins.attrNames filtered) ++ [ "id" ])) (lib.cartesianProduct filtered);
pythonPackages = lib.optionals config.presets.torch [
"numpy"
"torch"
"torch-tb-profiler"
"torchinfo"
"torchvision"
"tensorboard"
] ++ lib.optionals config.presets.jupyter [
"ipykernel"
"jupyter"
"notebook"
] ++ lib.optionals config.presets.datascience [
"pandas"
"scikit-learn"
"seaborn"
"matplotlib"
"umap-learn"
"plotly"
] ++ lib.optionals config.presets.huggingface [
"transformers"
"datasets"
] ++ config.pythonPackages;
packages = [
"wvls"
] ++ lib.optionals config.presets.torch [
"gcc"
] ++ lib.optionals config.presets.jupyter [
"jupyter"
] ++ config.packages;
environmentVariables = { HF_HOME = ".cache/huggingface"; } // config.environmentVariables;
cudaPackages = (with pkgs; [
cudatoolkit.out
cudatoolkit.lib
addOpenGLRunpath
autoAddDriverRunpath
]) ++ (map (v: pkgs.cudaPackages.${v}) config.extraCudaPackages);
rocmPackages = with pkgs; [
rocmPackages.clr
rocmPackages.miopen
rocmPackages.rocm-smi
rocmPackages.rocsparse
rocmPackages.rocsolver
rocmPackages.rocblas
rocmPackages.hipblas
rocmPackages.rocm-cmake
rocmPackages.hipfft
addOpenGLRunpath
autoAddDriverRunpath
];
zludaPackages = with pkgs; [
zluda
addOpenGLRunpath
autoAddDriverRunpath
];
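# pythonEnv and packaging are parameterised over the package set, so the same
# package lists can be instantiated from `pkgs` (CUDA) or `pkgsNoCuda`.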
pythonEnv = usePkgs: [ ((if config.usePython311 then usePkgs.python311.withPackages else usePkgs.python3.withPackages) (ps: (map (v: ps.${v}) pythonPackages))) ];
packaging = usePkgs: map (v: usePkgs.${v}) packages;
in {
options = {
flake = lib.mkOption {
description = "Flake configuration";
type = lib.types.attrsOf lib.types.anything;
default = { };
};
usePython311 = lib.mkEnableOption "Use Python 3.11";
trustRemoteCode = lib.mkOption {
description = "Whether to trust remote code";
type = lib.types.bool;
default = true;
};
useServiceServers = lib.mkOption {
description = "Whether to use Compute-Servers from Wavelens";
type = lib.types.bool;
default = true;
};
packages = lib.mkOption {
description = "Packages to install";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [ "jq" ];
};
pythonPackages = lib.mkOption {
description = "Python requirements to install";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [ "tqdm" ];
};
extraCudaPackages = lib.mkOption {
description = "Extra CUDA packages to install on hydra";
type = lib.types.listOf lib.types.str;
default = [ "cudnn" "nccl" ];
};
environmentVariables = lib.mkOption {
description = "Environment variables to set. Do not set HF_HOME when using Hugging Face datasets.";
type = lib.types.attrsOf lib.types.str;
default = { };
example = { MY_ENV = "value"; };
};
presets = let
mkEnableOption = name: lib.mkOption {
description = "${name} preset";
type = lib.types.bool;
default = false;
};
in {
datascience = mkEnableOption "Data Science";
jupyter = mkEnableOption "Jupyter";
torch = mkEnableOption "PyTorch";
huggingface = mkEnableOption "Hugging Face";
};
datasets = lib.mkOption {
description = "Datasets to prepare";
type = lib.types.attrsOf (lib.types.submodule dataset);
default = { };
};
trainings = lib.mkOption {
description = "Trainings to prepare";
type = lib.types.attrsOf (lib.types.submodule training);
default = { };
};
};
config = {
flake = let
forEachSystem = attr: builtins.listToAttrs (map (system: {
name = system;
value = attr;
}) [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" ]);
in {
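# One shell per accelerator backend; all share the same Python environment
# and extra packages, differing only in the GPU runtime they pull in.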
devShells = forEachSystem {
default = with pkgsNoCuda; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgsNoCuda)
++ (packaging pkgsNoCuda);
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
cuda = with pkgs; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgs)
++ (packaging pkgs)
++ cudaPackages;
CUDA_PATH = "${pkgs.cudatoolkit}";
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
rocm = with pkgs; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgs)
++ (packaging pkgs)
++ rocmPackages;
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
zluda = with pkgs; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgs)
++ (packaging pkgs)
++ zludaPackages;
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
};
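# Hydra pipeline: `dataset` prepares and zips each dataset, `test` runs a
# cheap CPU smoke test per training, `train` builds one derivation per
# permuted configuration, and `train-combined` merges all models of a
# training into a single archive.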
hydraJobs = {
build.devShells.x86_64-linux.default = lib.hydraJob config.flake.devShells.x86_64-linux.default;
build.devShells.x86_64-linux.cuda = lib.hydraJob config.flake.devShells.x86_64-linux.cuda;
dataset = builtins.mapAttrs (na: cfg: pkgs.callPackage ({ runCommand, fetchurl, unzip, zip, util-linux }: runCommand "dataset" (let
gpu = cfg.prepare.GPU != "";
in {
__contentAddressed = true;
system = "x86_64-linux";
requiredSystemFeatures = (lib.optionals gpu [ "cuda" ])
++ (lib.optional (config.useServiceServers && gpu) "service")
++ (lib.optional (gpu && cfg.prepare.GPU != "any") "gpu-${cfg.prepare.GPU}");
meta = {
inherit (cfg.prepare) timeout;
maxSilent = cfg.prepare.timeout;
};
nativeBuildInputs = [
(pythonEnv pkgs)
util-linux
unzip
zip
] ++ (packaging pkgs)
++ lib.optionals gpu cudaPackages;
CUDA_PATH = lib.optionalString gpu pkgs.cudatoolkit;
EXTRA_CCFLAGS = "-I/usr/include";
exec = cfg.prepare.directoryPath;
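# Source precedence: an explicit local path wins over a Hugging Face
# dataset, which wins over a plain URL download.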
dataset = if cfg.src.path != null then
cfg.src.path
else if cfg.src.dataset != null then
pkgs.fetchgit {
inherit (cfg.src) hash;
url = "https://huggingface.co/datasets/${cfg.src.dataset}";
deepClone = true;
fetchLFS = true;
}
else
fetchurl { inherit (cfg.src) url hash; };
} // environmentVariables) (''
mkdir -p $out/nix-support
echo "file dataset $out/dataset.zip" >> $out/nix-support/hydra-build-products
'' + (lib.optionalString (cfg.prepare.directoryPath != null) ''
cp -r $exec/. ./
'') + (lib.optionalString (cfg.src.dataset != null) ''
echo "import requests
from datasets import load_dataset
API_URL = "https://datasets-server.huggingface.co/splits?dataset=$dataset"
response = requests.get(API_URL).json()
splits = [ split["split"] for split in response["splits"] ]
dataset = load_dataset('$dataset', trust_remote_code=${ if config.trustRemoteCode then "True" else "False" }, split=splits)
" > get_dataset.py
python get_dataset.py
echo "Dataset downloaded"
'') + (lib.optionalString (cfg.src.path != null || cfg.src.url != null) ''
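# Strip the /nix/store hash prefix to recover the original file name.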
export datasetName=$(echo $dataset | rev | cut -d'/' -f1 | rev | cut -d'-' -f2-)
cp $dataset ./$datasetName
chmod -R 777 .
'') + (if (cfg.src.dataset != null && cfg.prepare.commands == "") then ''
cp -r $dataset/. ./${na}
zip -qr $out/dataset.zip ${na}
'' else ''
${cfg.prepare.commands}
zip -qr $out/dataset.zip ${ lib.concatStringsSep " " cfg.prepare.drop }
''))) { }) config.datasets;
test = lib.mapAttrs (na: cfg: (
pkgs.callPackage ({ runCommand, fetchurl, unzip, zip }: runCommand "model-test" ({
__contentAddressed = true;
system = "x86_64-linux";
meta = {
inherit (cfg) timeout;
maxSilent = cfg.timeout;
};
nativeBuildInputs = [
(pythonEnv pkgs)
unzip
zip
] ++ (packaging pkgs);
EXTRA_CCFLAGS = "-I/usr/include";
conf = builtins.toFile "config.json" (builtins.toJSON (cfg.testConfiguration // { id = 0; }));
exec = cfg.directoryPath;
} // builtins.listToAttrs (map (data: {
name = "dataset${data}";
value = config.flake.hydraJobs.dataset.${data}.out;
}) cfg.copyDatasets) // environmentVariables) ''
mkdir -p $out/nix-support
echo "file ai-model $out/train.zip" >> $out/nix-support/hydra-build-products
cp -r $exec/. ./
chmod -R 777 .
rm -f ./config.json
cp $conf ./config.json
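# Unpack every requested dataset archive into the working directory.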
${ lib.concatStringsSep "\n" (map (v: lib.concatStringsSep "" [ "unzip -q $dataset" v "/dataset.zip -d ." ] ) cfg.copyDatasets) }
chmod -R 777 .
${cfg.commands}
zip -qr $out/train.zip ${ lib.concatStringsSep " " cfg.drop } config.json
'')
{ }
)) (lib.filterAttrs (na: cfg: cfg.testConfiguration != { }) config.trainings);
train = lib.mapAttrs (na: cfg: builtins.listToAttrs (map (c: rec {
name = "model-${builtins.toString c.id}";
value = pkgs.callPackage ({ runCommand, fetchurl, unzip, zip }: runCommand name (let
gpu = cfg.GPU != "";
in {
__contentAddressed = true;
system = "x86_64-linux";
requiredSystemFeatures = (lib.optionals gpu [ "cuda" ])
++ (lib.optional (config.useServiceServers && gpu) "service")
++ (lib.optional (gpu && cfg.GPU != "any") "gpu-${cfg.GPU}");
meta = {
inherit (cfg) timeout;
maxSilent = cfg.timeout;
};
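# Besides the build tools, depend on the matching `test` job so the CPU
# smoke test must succeed before any GPU training is attempted.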
nativeBuildInputs = [
(pythonEnv pkgs)
unzip
zip
] ++ (packaging pkgs)
++ lib.optionals gpu cudaPackages
++ lib.optional (cfg.testConfiguration != { }) config.flake.hydraJobs.test.${na};
CUDA_PATH = lib.optionalString gpu pkgs.cudatoolkit;
EXTRA_CCFLAGS = "-I/usr/include";
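# Each permuted configuration is serialised to config.json, which the
# training scripts are expected to read.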
conf = builtins.toFile "config.json" (builtins.toJSON c);
exec = cfg.directoryPath;
} // builtins.listToAttrs (map (data: {
name = "dataset${data}";
value = config.flake.hydraJobs.dataset.${data}.out;
}) cfg.copyDatasets) // environmentVariables) ''
mkdir -p $out/nix-support
echo "file ai-model $out/train.zip" >> $out/nix-support/hydra-build-products
cp -r $exec/. ./
chmod -R 777 .
rm -f ./config.json
cp $conf ./config.json
${ lib.concatStringsSep "\n" (map (v: lib.concatStringsSep "" [ "unzip -q $dataset" v "/dataset.zip -d ." ] ) cfg.copyDatasets) }
chmod -R 777 .
python -c "import torch; print(f'CUDA AVAILABLE: {torch.cuda.is_available()}')"
${cfg.commands}
zip -qr $out/train.zip ${ lib.concatStringsSep " " cfg.drop } config.json
'')
{ };
}) (mkIDs (builtins.concatMap permuteAttrs cfg.configurations)))) config.trainings;
train-combined = builtins.listToAttrs (map (name: {
inherit name;
value = pkgs.callPackage
({ stdenvNoCC, unzip, zip }: stdenvNoCC.mkDerivation
{
__contentAddressed = true;
name = "${name}-models.zip";
system = "x86_64-linux";
nativeBuildInputs = [
unzip
zip
];
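# Space-separated store paths of every per-configuration training output.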
models = toString (map (n: config.flake.hydraJobs.train.${name}.${n}.out) (builtins.attrNames config.flake.hydraJobs.train.${name}));
buildCommand = ''
mkdir -p $out/nix-support
export img=$out/${name}-models.zip
echo "file ai-model $img" >> $out/nix-support/hydra-build-products
mkdir models
cd models
for merges in ${ toString config.trainings.${name}.mergeDirectories }; do
mkdir $merges
done
for model in $models; do
# strip the store hash prefix from the path, keeping the model name (e.g. model-0)
modeln=$(echo $model | cut -d'-' -f2-)
mkdir $modeln
cd $modeln
unzip -q $model/train.zip
cd ..
for merges in ${ toString config.trainings.${name}.mergeDirectories }; do
cp -Tr $modeln/$merges/ $merges
done
done
cd ..
zip -qr $img models
'';
}) { };
}) (builtins.filter (v: (builtins.length (builtins.attrNames config.flake.hydraJobs.train.${v})) > 1) (builtins.attrNames config.trainings)));
};
};
};
}