# nix-ai/mk-flake.nix

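# Flake module that turns declarative `datasets` and `trainings` options into
# development shells (CPU, CUDA, ROCm, ZLUDA) and Hydra jobs for dataset
# preparation, CPU smoke tests, GPU training sweeps, and merged model archives.
#
# Minimal usage sketch (hypothetical paths and values):
#
#   {
#     presets.torch = true;
#     datasets.mnist.src = {
#       url = "https://example.org/mnist.zip";   # hypothetical URL
#       hash = "sha256-...";                     # fill in the real hash
#     };
#     trainings.mnist = {
#       directoryPath = ./train;                 # directory with the training scripts
#       copyDatasets = [ "mnist" ];
#       commands = "python train.py";
#       drop = [ "model.pt" ];
#       configurations = [ { epochs = [ 1 2 ]; } ];  # list values fan out into a sweep
#     };
#   }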
{ config, pkgs, pkgsNoCuda, lib, ... }: let
dataset = { lib, ... }: {
options = {
src = {
url = lib.mkOption {
description = "URL to download the dataset from";
type = with lib.types; nullOr str;
default = null;
example = "https://huggingface.co/datasets/NousResearch/CharacterCodex/resolve/main/character_codex.json?download=true";
};
hash = lib.mkOption {
description = "Hash of the dataset. Needed when dataset or url is set.";
type = with lib.types; nullOr str;
default = null;
example = "sha256-/HE4In/YBRYVdHXZKmqRIDWlAiUU0syWeWPJ+fwQcvk=";
};
path = lib.mkOption {
description = "Path to the dataset";
type = with lib.types; nullOr path;
default = null;
example = "/path/to/dataset.zip";
};
dataset = lib.mkOption {
description = "Huggingface dataset name. Make sure to set presets.huggingface to true.";
type = with lib.types; nullOr str;
default = null;
example = "HuggingFaceFW/fineweb";
};
};
prepare = {
GPU = lib.mkOption {
description = "Choose the GPU to use. Leave empty for CPU. Use 'any' for any GPU";
type = lib.types.str;
default = "";
example = "H100-1";
};
directoryPath = lib.mkOption {
description = "Path to the directory with cleanup scripts";
type = with lib.types; nullOr path;
default = null;
};
commands = lib.mkOption {
description = "Commands to run to prepare the dataset";
type = lib.types.str;
default = "";
example = ''
unzip dataset.zip
'';
};
drop = lib.mkOption {
description = "Files to save after cleanup";
type = lib.types.listOf lib.types.str;
default = [ ];
};
timeout = lib.mkOption {
description = "Timeout for the dataset preparation";
type = lib.types.ints.unsigned;
default = 432000;
};
};
};
};
training = { lib, ... }: {
options = {
GPU = lib.mkOption {
description = "Choose the GPU to use. Leave empty for CPU. Use 'any' for any GPU";
type = lib.types.str;
default = "any";
example = "H100-1";
};
directoryPath = lib.mkOption {
description = "Path to the directory with cleanup scripts";
type = lib.types.path;
};
mergeDirectories = lib.mkOption {
description = "Directories to merge from seperate trainings";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [
"runs"
];
};
copyDatasets = lib.mkOption {
description = "The datasets to be available in the training directory";
type = lib.types.listOf lib.types.str;
default = [ ];
};
commands = lib.mkOption {
description = "Commands to run to prepare the dataset";
type = lib.types.str;
default = ''
unzip $dataset
'';
};
drop = lib.mkOption {
description = "Files to save after cleanup";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [
"model.pt"
];
};
timeout = lib.mkOption {
description = "Timeout for the dataset preparation";
type = lib.types.ints.unsigned;
default = 432000;
};
configurations = lib.mkOption {
description = "config.json configurations";
type = lib.types.listOf (lib.types.attrsOf lib.types.anything);
default = [ ];
example = [
{
model = "minst-model";
dataset = "mnist";
layersA = [5 10];
layersB = [5 10];
epochs = 1;
}
{
model = "minst-model";
dataset = "mnist";
layersA = [1 15];
layersB = [1 15];
epochs = 1;
}
];
};
testConfiguration = lib.mkOption {
description = "config.json configuration for cpu testing";
type = lib.types.attrsOf lib.types.anything;
default = { };
example = {
model = "minst-model";
dataset = "mnist";
layersA = 1;
layersB = 1;
epochs = 1;
};
};
};
};
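# mkIDs tags each configuration with a sequential id, e.g.
#   mkIDs [ { a = 1; } { a = 2; } ]  =>  [ { id = 0; a = 1; } { id = 1; a = 2; } ]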
mkIDs = list: lib.imap0 (i: cfg: { id = i; } // cfg) list;
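# permuteAttrs expands every list-valued attribute into all combinations,
# re-attaching the scalar attributes to each one, e.g.
#   permuteAttrs { model = "m"; epochs = [ 1 2 ]; }
#     =>  [ { model = "m"; epochs = 1; } { model = "m"; epochs = 2; } ]
# so a single entry in `configurations` fans out into a hyperparameter sweep.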
permuteAttrs = attrs: let
filtered = lib.filterAttrs (n: v: (builtins.typeOf v) == "list") attrs;
in
map (val: val // builtins.removeAttrs attrs ((builtins.attrNames filtered) ++ [ "id" ])) (lib.cartesianProduct filtered);
pythonPackages = lib.optionals config.presets.torch [
"numpy"
"torch"
"torch-tb-profiler"
"torchinfo"
"torchvision"
"tensorboard"
] ++ lib.optionals config.presets.jupyter [
"ipykernel"
"jupyter"
"notebook"
] ++ lib.optionals config.presets.datascience [
"pandas"
"scikit-learn"
"seaborn"
"matplotlib"
"umap-learn"
"plotly"
] ++ lib.optionals config.presets.huggingface [
"transformers"
"datasets"
] ++ config.pythonPackages;
packages = [
"wvls"
] ++ lib.optionals config.presets.torch [
"gcc"
] ++ lib.optionals config.presets.jupyter [
"jupyter"
] ++ config.packages;
environmentVariables = { HF_HOME = ".cache/huggingface"; } // config.environmentVariables;
cudaPackages = (with pkgs; [
cudatoolkit.out
cudatoolkit.lib
addOpenGLRunpath
autoAddDriverRunpath
]) ++ (map (v: pkgs.cudaPackages.${v}) config.extraCudaPackages);
rocmPackages = with pkgs; [
rocmPackages.clr
rocmPackages.miopen
rocmPackages.rocm-smi
rocmPackages.rocsparse
rocmPackages.rocsolver
rocmPackages.rocblas
rocmPackages.hipblas
rocmPackages.rocm-cmake
rocmPackages.hipfft
addOpenGLRunpath
autoAddDriverRunpath
];
zludaPackages = with pkgs; [
zluda
addOpenGLRunpath
autoAddDriverRunpath
];
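# pythonEnv and packaging are parameterised over the package set, so the same
# package lists can be instantiated from `pkgs` (CUDA) or `pkgsNoCuda`.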
pythonEnv = usePkgs: [ ((if config.usePython311 then usePkgs.python311.withPackages else usePkgs.python3.withPackages) (ps: (map (v: ps.${v}) pythonPackages))) ];
packaging = usePkgs: map (v: usePkgs.${v}) packages;
in {
options = {
flake = lib.mkOption {
description = "Flake configuration";
type = lib.types.attrsOf lib.types.anything;
default = { };
};
usePython311 = lib.mkEnableOption "Use Python 3.11";
trustRemoteCode = lib.mkOption {
description = "Whether to trust remote code";
type = lib.types.bool;
default = true;
};
useServiceServers = lib.mkOption {
description = "Whether to use Compute-Servers from Wavelens";
type = lib.types.bool;
default = true;
};
packages = lib.mkOption {
description = "Packages to install";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [ "jq" ];
};
pythonPackages = lib.mkOption {
description = "Python requirements to install";
type = lib.types.listOf lib.types.str;
default = [ ];
example = [ "tqdm" ];
};
extraCudaPackages = lib.mkOption {
description = "Extra CUDA packages to install on hydra";
type = lib.types.listOf lib.types.str;
default = [ "cudnn" "nccl" ];
};
environmentVariables = lib.mkOption {
description = "Environment variables to set. Do not set HF_HOME when using Hugging Face datasets.";
type = lib.types.attrsOf lib.types.str;
default = { };
example = { MY_ENV = "value"; };
};
presets = let
mkEnableOption = name: lib.mkOption {
description = "${name} preset";
type = lib.types.bool;
default = false;
};
in {
datascience = mkEnableOption "Data Science";
jupyter = mkEnableOption "Jupyter";
torch = mkEnableOption "PyTorch";
huggingface = mkEnableOption "Hugging Face";
};
datasets = lib.mkOption {
description = "Datasets to prepare";
type = lib.types.attrsOf (lib.types.submodule dataset);
default = { };
};
trainings = lib.mkOption {
description = "Trainings to prepare";
type = lib.types.attrsOf (lib.types.submodule training);
default = { };
};
};
config = {
flake = let
forEachSystem = attr: builtins.listToAttrs (map (system: {
name = system;
value = attr;
}) [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" ]);
in {
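# One shell per accelerator backend; all share the same Python environment
# and extra packages, differing only in the GPU runtime they pull in.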
devShells = forEachSystem {
default = with pkgsNoCuda; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgsNoCuda)
++ (packaging pkgsNoCuda);
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
cuda = with pkgs; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgs)
++ (packaging pkgs)
++ cudaPackages;
CUDA_PATH = "${pkgs.cudatoolkit}";
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
rocm = with pkgs; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgs)
++ (packaging pkgs)
++ rocmPackages;
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
zluda = with pkgs; mkShell ({
buildInputs = [
stdenv.cc.cc.lib
pam
];
packages = (pythonEnv pkgs)
++ (packaging pkgs)
++ zludaPackages;
EXTRA_CCFLAGS = "-I/usr/include";
} // environmentVariables);
};
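# Hydra pipeline: `dataset` prepares and zips each dataset, `test` runs a
# cheap CPU smoke test per training, `train` builds one derivation per
# permuted configuration, and `train-combined` merges all models of a
# training into a single archive.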
hydraJobs = {
build.devShells.x86_64-linux.default = lib.hydraJob config.flake.devShells.x86_64-linux.default;
build.devShells.x86_64-linux.cuda = lib.hydraJob config.flake.devShells.x86_64-linux.cuda;
dataset = builtins.mapAttrs (na: cfg: pkgs.callPackage ({ runCommand, fetchurl, unzip, zip, util-linux }: runCommand "dataset" (let
gpu = cfg.prepare.GPU != "";
in {
__contentAddressed = true;
system = "x86_64-linux";
requiredSystemFeatures = (lib.optionals gpu [ "cuda" ])
++ (lib.optional (config.useServiceServers && gpu) "service")
++ (lib.optional (gpu && cfg.prepare.GPU != "any") "gpu-${cfg.prepare.GPU}");
meta = {
inherit (cfg.prepare) timeout;
maxSilent = cfg.prepare.timeout;
};
nativeBuildInputs = [
(pythonEnv pkgs)
util-linux
unzip
zip
] ++ (packaging pkgs)
++ lib.optionals gpu cudaPackages;
CUDA_PATH = lib.optionalString gpu pkgs.cudatoolkit;
EXTRA_CCFLAGS = "-I/usr/include";
exec = cfg.prepare.directoryPath;
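# Source precedence: an explicit local path wins over a Hugging Face
# dataset, which wins over a plain URL download.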
dataset = if cfg.src.path != null then
cfg.src.path
else if cfg.src.dataset != null then
pkgs.fetchgit {
inherit (cfg.src) hash;
url = "https://huggingface.co/datasets/${cfg.src.dataset}";
deepClone = true;
fetchLFS = true;
}
else
fetchurl { inherit (cfg.src) url hash; };
} // environmentVariables) (''
mkdir -p $out/nix-support
echo "file dataset $out/dataset.zip" >> $out/nix-support/hydra-build-products
'' + (lib.optionalString (cfg.prepare.directoryPath != null) ''
cp -r $exec/. ./
'') + (lib.optionalString (cfg.src.dataset != null) ''
echo "import requests
from datasets import load_dataset
API_URL = "https://datasets-server.huggingface.co/splits?dataset=$dataset"
response = requests.get(API_URL).json()
splits = [ split["split"] for split in response["splits"] ]
dataset = load_dataset('$dataset', trust_remote_code=${ if config.trustRemoteCode then "True" else "False" }, split=splits)
" > get_dataset.py
python get_dataset.py
echo "Dataset downloaded"
'') + (lib.optionalString (cfg.src.path != null || cfg.src.url != null) ''
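# Strip the /nix/store hash prefix to recover the original file name.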
export datasetName=$(echo $dataset | rev | cut -d'/' -f1 | rev | cut -d'-' -f2-)
cp $dataset ./$datasetName
chmod -R 777 .
'') + (if (cfg.src.dataset != null && cfg.prepare.commands == "") then ''
cp -r $dataset/. ./${na}
zip -qr $out/dataset.zip ${na}
'' else ''
${cfg.prepare.commands}
zip -qr $out/dataset.zip ${ lib.concatStringsSep " " cfg.prepare.drop }
''))) { }) config.datasets;
test = lib.mapAttrs (na: cfg: (
pkgs.callPackage ({ runCommand, fetchurl, unzip, zip }: runCommand "model-test" ({
__contentAddressed = true;
system = "x86_64-linux";
meta = {
inherit (cfg) timeout;
maxSilent = cfg.timeout;
};
nativeBuildInputs = [
(pythonEnv pkgs)
unzip
zip
] ++ (packaging pkgs);
EXTRA_CCFLAGS = "-I/usr/include";
conf = builtins.toFile "config.json" (builtins.toJSON (cfg.testConfiguration // { id = 0; }));
exec = cfg.directoryPath;
} // builtins.listToAttrs (map (data: {
name = "dataset${data}";
value = config.flake.hydraJobs.dataset.${data}.out;
}) cfg.copyDatasets) // environmentVariables) ''
mkdir -p $out/nix-support
echo "file ai-model $out/train.zip" >> $out/nix-support/hydra-build-products
cp -r $exec/. ./
chmod -R 777 .
rm -f ./config.json
cp $conf ./config.json
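# Unpack every requested dataset archive into the working directory.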
${ lib.concatStringsSep "\n" (map (v: lib.concatStringsSep "" [ "unzip -q $dataset" v "/dataset.zip -d ." ] ) cfg.copyDatasets) }
chmod -R 777 .
${cfg.commands}
zip -qr $out/train.zip ${ lib.concatStringsSep " " cfg.drop } config.json
'')
{ }
)) (lib.filterAttrs (na: cfg: cfg.testConfiguration != { }) config.trainings);
train = lib.mapAttrs (na: cfg: builtins.listToAttrs (map (c: rec {
name = "model-${builtins.toString c.id}";
value = pkgs.callPackage ({ runCommand, fetchurl, unzip, zip }: runCommand name (let
gpu = cfg.GPU != "";
in {
__contentAddressed = true;
system = "x86_64-linux";
requiredSystemFeatures = (lib.optionals gpu [ "cuda" ])
++ (lib.optional (config.useServiceServers && gpu) "service")
++ (lib.optional (gpu && cfg.GPU != "any") "gpu-${cfg.GPU}");
meta = {
inherit (cfg) timeout;
maxSilent = cfg.timeout;
};
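# Besides the build tools, depend on the matching `test` job so the CPU
# smoke test must succeed before any GPU training is attempted.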
nativeBuildInputs = [
(pythonEnv pkgs)
unzip
zip
] ++ (packaging pkgs)
++ lib.optionals gpu cudaPackages
++ lib.optional (cfg.testConfiguration != { }) config.flake.hydraJobs.test.${na};
CUDA_PATH = lib.optionalString gpu pkgs.cudatoolkit;
EXTRA_CCFLAGS = "-I/usr/include";
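# Each permuted configuration is serialised to config.json, which the
# training scripts are expected to read.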
conf = builtins.toFile "config.json" (builtins.toJSON c);
exec = cfg.directoryPath;
} // builtins.listToAttrs (map (data: {
name = "dataset${data}";
value = config.flake.hydraJobs.dataset.${data}.out;
}) cfg.copyDatasets) // environmentVariables) ''
mkdir -p $out/nix-support
echo "file ai-model $out/train.zip" >> $out/nix-support/hydra-build-products
cp -r $exec/. ./
chmod -R 777 .
rm -f ./config.json
cp $conf ./config.json
${ lib.concatStringsSep "\n" (map (v: lib.concatStringsSep "" [ "unzip -q $dataset" v "/dataset.zip -d ." ] ) cfg.copyDatasets) }
chmod -R 777 .
python -c "import torch; print(f'CUDA AVAILABLE: {torch.cuda.is_available()}')"
${cfg.commands}
zip -qr $out/train.zip ${ lib.concatStringsSep " " cfg.drop } config.json
'')
{ };
}) (mkIDs (builtins.concatMap permuteAttrs cfg.configurations)))) config.trainings;
train-combined = builtins.listToAttrs (map (name: {
inherit name;
value = pkgs.callPackage
({ stdenvNoCC, unzip, zip }: stdenvNoCC.mkDerivation
{
__contentAddressed = true;
name = "${name}-models.zip";
system = "x86_64-linux";
nativeBuildInputs = [
unzip
zip
];
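# Space-separated store paths of every per-configuration training output.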
models = toString (map (n: config.flake.hydraJobs.train.${name}.${n}.out) (builtins.attrNames config.flake.hydraJobs.train.${name}));
buildCommand = ''
mkdir -p $out/nix-support
export img=$out/${name}-models.zip
echo "file ai-model $img" >> $out/nix-support/hydra-build-products
mkdir models
cd models
for merges in ${ toString config.trainings.${name}.mergeDirectories }; do
mkdir $merges
done
for model in $models; do
# strip the store hash prefix from the path, keeping the model name (e.g. model-0)
modeln=$(echo $model | cut -d'-' -f2-)
mkdir $modeln
cd $modeln
unzip -q $model/train.zip
cd ..
for merges in ${ toString config.trainings.${name}.mergeDirectories }; do
cp -Tr $modeln/$merges/ $merges
done
done
cd ..
zip -qr $img models
'';
}) { };
}) (builtins.filter (v: (builtins.length (builtins.attrNames config.flake.hydraJobs.train.${v})) > 1) (builtins.attrNames config.trainings)));
};
};
};
}