Commit 7cba63b5 authored by Adrian Marti

Tweak packaging, realized my hashing code is bs, need to change that

parent c562fd2c
Copyright Author name here (c) 2019
Copyright Adrian Marti (c) 2019
All rights reserved.
......@@ -13,7 +13,7 @@ modification, are permitted provided that the following conditions are met:
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Author name here nor the names of other
* Neither the name of Adrian Marti nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
......
......@@ -2,4 +2,8 @@
Iterate quickly by using Haskell where it excels.
The idea is using Python as an imperative data manipulation language and moving all the higher level workflow code to Haskell.
The idea is to use Python as an imperative data manipulation language and move all the higher-level workflow code to Haskell. More concretely, you specify where your Python functions live, and you can then compose and apply those Python objects with each other by letting iterbuild generate the corresponding boilerplate code. Notice that this is strictly weaker than having a foreign function interface, since you never actually run Python code; you only generate it.
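As a rough sketch (hypothetical types and names, not the actual iterbuild API), composing Python callables from Haskell and emitting the glue code could look like this:

```haskell
import Data.Text (Text)
import qualified Data.Text as T

-- A Python callable is identified by the module it lives in and its name.
data PyFunc = PyFunc { pyModule :: Text, identifier :: Text }

-- Composing and applying callables only builds a description of the call graph.
data PyExpr = Ref PyFunc | Apply PyExpr [PyExpr]

-- Rendering emits Python source; no Python is ever executed from Haskell.
imports :: PyExpr -> [Text]
imports (Ref f)        = ["from " <> pyModule f <> " import " <> identifier f]
imports (Apply f args) = concatMap imports (f : args)

render :: PyExpr -> Text
render (Ref f)        = identifier f
render (Apply f args) = render f <> "(" <> T.intercalate ", " (map render args) <> ")"
```

The generated source can then be written out and run by whatever Python tooling you already use.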
## Development
It gets annoying that the nix function `fetchTarball` clears its cache often. To mitigate this, you can run `nix-prefetch-url --unpack [url for currently used haskell.nix commit, see default.nix]`, which stores the `haskell.nix` source in the nix store.
{ pkgs ? import <nixpkgs> (import (builtins.fetchTarball https://github.com/input-output-hk/haskell.nix/archive/master.tar.gz))
{ pkgs ? import <nixpkgs> (import (builtins.fetchTarball {
url=https://github.com/input-output-hk/haskell.nix/archive/28b23cc7de5a1a89160e83b030fe9b3ce8.tar.gz;
sha256="04z49bc7gh4c5f9mvj31zv8kjvbq3c9wihl88rzlxlpwsqsivc1w";
})),
kaggle ? (import ./requirements.nix { pkgs=pkgs; }).packages.kaggle
}:
let
kaggle = (import ./requirements.nix { pkgs=pkgs; }).packages.kaggle;
in
pkgs.haskell-nix.stackProject {
src = (pkgs.haskell-nix.haskellLib.cleanGit { src = ./.; }).outPath;
modules = [{
# Not sure pkgconfig is intended for this, but what else should I do? It works and the attribute has no documentation.
# I found it after reading the source code for a considerable amount of time.
packages.iterbuild.components.library.pkgconfig = [[pkgs.which pkgs.git kaggle]];
packages.iterbuild.components.library.postUnpack = ''
substituteInPlace iterbuild/src/Capabilities.hs --replace "\"git\"" "\"$(which git)"\"
substituteInPlace iterbuild/src/Capabilities.hs --replace "\"kaggle\"" "\"$(which kaggle)"\"
'';
packages.iterbuild.dontPatchELF = false; # this does not remove unnecessary deps, don't feel like debugging it now
}];
pkgs.haskell-nix.stackProject {
src = (pkgs.haskell-nix.haskellLib.cleanGit { src = ./.; }).outPath;
modules = [{
# Not sure pkgconfig is intended for this, but what else should I do? It works and the attribute has no documentation.
# I found it after reading the source code for a considerable amount of time.
packages.iterbuild.components.library.pkgconfig = [[pkgs.which pkgs.git pkgs.unzip kaggle]];
packages.iterbuild.components.library.postUnpack = ''
substituteInPlace iterbuild/src/Capabilities.hs --replace "\"git\"" "\"$(which git)"\"
substituteInPlace iterbuild/src/Capabilities.hs --replace "\"kaggle\"" "\"$(which kaggle)"\"
substituteInPlace iterbuild/src/Capabilities.hs --replace "\"unzip\"" "\"$(which unzip)"\"
# cat iterbuild/src/Capabilities.hs
'';
packages.iterbuild.dontPatchELF = false; # this does not remove unnecessary deps, don't feel like chasing them down now
}];
}
# For some reason, this returns a set of haskell packages (analogous to the one at
# nixpkgs.haskellPackages), which has iterbuild and all its dependencies available, the dependencies
......
......@@ -4,7 +4,7 @@ cabal-version: 1.12
--
-- see: https://github.com/sol/hpack
--
-- hash: b1d0c90d22fc2eb0af689394f4a6bd084f2c3cc3f2ce66ccf0a1b7cfe02cc16c
-- hash: 48671a87ee2f36c0303a56c4f763293b5ec2b7c4eac9bef801e7a52aff869443
name: iterbuild
version: 0.1.0.0
......@@ -38,7 +38,9 @@ library
default-extensions: NoImplicitPrelude
build-depends:
base >=4.7 && <5
, cryptonite
, directory
, directory-tree
, exceptions
, filepath
, formatting
......@@ -47,6 +49,7 @@ library
, shelly
, syb
, template-haskell
, temporary
, text
default-language: Haskell2010
......@@ -61,7 +64,9 @@ test-suite iterbuild-test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, cryptonite
, directory
, directory-tree
, exceptions
, filepath
, formatting
......@@ -71,5 +76,6 @@ test-suite iterbuild-test
, shelly
, syb
, template-haskell
, temporary
, text
default-language: Haskell2010
......@@ -24,6 +24,9 @@ dependencies:
- protolude
- directory
- filepath
- temporary
- directory-tree
- cryptonite
- text
- path
- formatting
......
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE FlexibleInstances #-}
......@@ -9,10 +11,13 @@ module Capabilities where
import Protolude hiding (writeFile, readFile, (%))
import System.Directory
import System.FilePath (combine)
import System.Directory.Tree
import System.FilePath (combine, addExtension)
import qualified System.IO
import System.IO.Temp
import Path
import Formatting
import Shelly (shelly, escaping, run)
import Formatting hiding (build)
import Shelly hiding ((</>), find)
import Data.Text.Lazy (splitOn)
import Data.Text.Lazy.IO
......@@ -21,6 +26,28 @@ import Control.Monad.Catch
import Expr
-- We will need a function for hashing directories in the format of the directory-tree library
-- The following function is copied from https://github.com/jberryman/directory-tree
-- HELPER: a non-recursive comparison
comparingConstr :: DirTree a -> DirTree a1 -> Ordering
comparingConstr (Failed _ _) (Dir _ _) = LT
comparingConstr (Failed _ _) (File _ _) = LT
comparingConstr (File _ _) (Failed _ _) = GT
comparingConstr (File _ _) (Dir _ _) = GT
comparingConstr (Dir _ _) (Failed _ _) = GT
comparingConstr (Dir _ _) (File _ _) = LT
-- else compare on the names of constructors that are the same, without
-- looking at the contents of Dir constructors:
comparingConstr t t' = compare (name t) (name t')
-- We want to sort the keys in a reasonable fashion before hashing the tree.
deriving instance Generic a => Generic (DirTree a)
instance Hashable a => Hashable (DirTree a) where -- Generic is not needed by the handwritten methods below
hashWithSalt salt (File name contents) = hashWithSalt salt (name, contents)
hashWithSalt salt (Dir name files) = hashWithSalt salt (name, sortBy comparingConstr files)
hashWithSalt salt (Failed name err) = hashWithSalt salt name
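-- A minimal illustration of the instance above (hypothetical helper, name illustrative):
-- hashing a whole directory tree as read by directory-tree's readDirectoryWithL.
hashTree :: Hashable a => AnchoredDirTree a -> Int
hashTree = hash . dirTree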
-- These are the first 2 layers of my 3 layer cake https://www.parsonsmatt.org/2018/03/22/three_layer_haskell_cake.html
-- The number of classes here should be minimized, as that code is tiresome to write. I love how
-- you can use polymorphism to dump all your 'imperative' code (layer 1) into one file and then just forget about it.
......@@ -57,10 +84,15 @@ class HasTargetPath f => HasGit f where
class HasCache f where
cachePath :: f (Path Abs Dir)
downloadKaggle :: f LText -> f ()
tmpPath :: f (Path Abs Dir)
-- The first argument is the competition, the second one a hash. The output is the path where the data was cached,
-- or a new path where the data was downloaded.
fetchKaggle :: f LText -> f LText -> f (Path Rel Dir)
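-- e.g. (values illustrative): fetchKaggle (pure "titanic") (pure "1gz5xw8")
-- would either resolve the cached copy or download and cache it.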
data Env = Env {
getTargetPath :: Path Abs Dir,
-- Setting this to /tmp may be bad because the size of /tmp may not suffice for unzipping large files
getTmpPath :: Path Abs Dir,
getGitRepoPath :: Path Abs Dir,
getCachePath :: Path Abs Dir,
getResultPath :: Path Abs Dir
......@@ -78,14 +110,28 @@ setDefaultPaths b =
-- note: the argument order must match the field order of Env (tmp/ is the second field)
<*> parseAbsDir (combine currentDir "tmp/")
<*> parseAbsDir (combine currentDir ".git/")
<*> parseAbsDir (combine currentDir "cache/")
<*> parseAbsDir (combine currentDir "results/")
runReaderT (getBaseMonad b) env
nameFile :: Hashable a => Maybe LText -> Maybe LText -> a -> LText
nameFile label extension =
maybe identity (flip $ format $ text % "." % text) extension
-- TODO: This does not produce a fixed size hash, which is stupid.
hashFunction :: Hashable a => a -> LText
hashFunction = format (base 36) . hash
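-- A fixed-width alternative could hash raw bytes with the cryptonite package added
-- to the dependencies in this commit; a hypothetical sketch (needs Crypto.Hash and
-- a ByteString of the content):
--   sha256Name :: ByteString -> LText
--   sha256Name = format shown . (hashWith SHA256 :: ByteString -> Digest SHA256)
-- SHA256 always yields a 64-character hex digest, unlike hashable's machine-word hash.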
formatFileName :: (MonadThrow m, Hashable a) => Maybe LText -> Maybe LText -> a -> m (Path Rel File)
formatFileName label extension =
parseRelFile
. toS
. maybe identity (flip $ format $ text % "." % text) extension
. maybe identity (format $ text % "-" % text) label
. format hex
. hash
. hashFunction
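-- For example (hash value illustrative): formatFileName (Just "train") (Just "csv") someContent
-- produces a relative file name of the form "train-<base36 hash>.csv".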
formatDirName :: (MonadThrow m, Hashable a) => Maybe LText -> a -> m (Path Rel Dir)
formatDirName label =
parseRelDir
. toS
. (<> "/")
. maybe identity (format $ text % "-" % text) label
. hashFunction
-- The more general version causes overlapping instances later on.
-- instance (MonadIO m, MonadThrow m, MonadReader Env m) => HasTargetPath m where
......@@ -96,13 +142,13 @@ instance HasTargetPath BaseMonad where
label <- label
extension <- extension
content <- content
let name = nameFile label extension content
filename <- formatFileName label extension content
targetPath <- targetPath
let path = toFilePath targetPath ++ toS name
let path = toFilePath (targetPath </> filename)
exists <- (liftIO . doesFileExist) path
bool (liftIO $ writeFile path content) (pure ()) exists
parseRelFile (toS name)
return filename
-- instance (MonadIO m, MonadThrow m, MonadReader Env m) => HasGit m where
......@@ -117,53 +163,97 @@ instance HasGit BaseMonad where
shelly $ escaping False $ run "git" ["archive", toS commit, "| tar -x -C", "git-" <> toS (toFilePath writePath)]
return pathEnd
-- Puts an external file, whose hash is not yet known, into the local database. It does not return the path
-- it is put into, forcing the user to explicitly look up the hash, which ensures reproducibility.
importLocalFile :: BaseMonad (Path a File) -> BaseMonad ()
importLocalFile fileDir =
importLocalFile :: BaseMonad (Path Abs File) -> BaseMonad (Path Rel File)
importLocalFile source =
do
sourceDir <- fileDir
targetName <- setFileExtension "" . filename $ sourceDir
contents <- liftIO . readFile . toFilePath $ sourceDir
let fName = nameFile (Just . toS . toFilePath $ targetName) (Just . toS . fileExtension $ sourceDir) contents
source <- source
targetName <- setFileExtension "" . filename $ source
contents <- liftIO . readFile . toFilePath $ source
filename <- formatFileName (Just . toS . toFilePath $ targetName) (Just . toS . fileExtension $ source) contents
cachePath <- cachePath
liftIO (renameFile (toFilePath sourceDir) (combine (toFilePath cachePath) (toS fName)))
print fName
liftIO (renameFile (toFilePath source) (toFilePath (cachePath </> filename)))
return filename
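-- Example use (path illustrative): importLocalFile (parseAbsFile "/data/train.csv")
-- moves the file into the cache and returns its content-addressed name, e.g. "train-<hash>.csv".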
newtype CacheLookupException = CacheLookupException { key :: LText} deriving (Show)
importLocalDir :: BaseMonad (Path Abs Dir) -> BaseMonad (Path Rel Dir)
importLocalDir source =
do
source <- source
directoryContents <- liftIO . readDirectoryWithL readFile . toFilePath $ source
-- readFile cannot read a directory, so hash the tree read above via the Hashable DirTree instance
dirname <- formatDirName (Just . toS . toFilePath . dirname $ source) (dirTree directoryContents)
cachePath <- cachePath
liftIO (renameDirectory (toFilePath source) (toFilePath (cachePath </> dirname)))
return dirname
newtype CacheLookupException = CacheLookupException { key :: LText } deriving (Show)
instance Exception CacheLookupException
-- This is used when you want to guarantee that your data has a given hash, e.g. when the data was imported with
-- importLocalFile. This is a nice way of dealing with outside data while still having reproducibility
-- guarantees.
lookupHash :: BaseMonad LText -> BaseMonad (Path Rel File)
lookupHash hash =
findCached_ :: (FilePath -> IO Bool) -> BaseMonad LText -> BaseMonad FilePath
findCached_ predicate hash =
do
cachePath <- cachePath
hash <- hash
filePaths <- fmap (fmap toS) . liftIO . listDirectory . toFilePath $ cachePath
filepath <- case find ((== Just hash) . head . splitOn "-") filePaths of
allpaths <- liftIO . listDirectory . toFilePath $ cachePath
-- listDirectory returns bare names, so re-anchor them at cachePath before testing with the predicate
filtered <- liftIO . foldlM (\acc next -> ifM (predicate (combine (toFilePath cachePath) next)) (return (next:acc)) (return acc)) [] $ allpaths
let paths = fmap toS filtered
filepath <- case find ((== Just hash) . head . splitOn "-") paths of
Just x -> return x
Nothing -> throwM (CacheLookupException hash)
parseRelFile (toS filepath)
return (toS filepath)
findCachedFile :: BaseMonad LText -> BaseMonad (Path Rel File)
findCachedFile hash = findCached_ doesFileExist hash >>= parseRelFile
findCachedDir :: BaseMonad LText -> BaseMonad (Path Rel Dir)
findCachedDir hash = findCached_ doesDirectoryExist hash >>= parseRelDir
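-- Example use (hash illustrative): findCachedFile (pure "1gz5xw8") resolves the cache entry
-- whose name starts with that hash, or throws a CacheLookupException.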
fetchKaggle_ :: BaseMonad LText -> BaseMonad (Path Rel Dir)
fetchKaggle_ competition =
do
competition <- competition
shelly . verbosely . print_stdout True . onCommandHandles (initOutputHandles (`System.IO.hSetBinaryMode` True)) $
run "kaggle" ["competitions", "download", "-c", toS competition, "-p", "/tmp/"]
let dir = ((++ "/") . toS) competition
shelly $ run "unzip" [toS . combine "/tmp" . (`addExtension` "zip") . toS $ competition, "-d", toS . combine "/tmp" $ dir]
cachePath <- cachePath
liftIO $ renameDirectory (combine "/tmp" dir) (combine (toFilePath cachePath) dir)
-- importLocalFile (parseAbsFile $ combine "/tmp" (toS competition))
undefined
instance HasCache BaseMonad where
cachePath = asks getCachePath
-- it is quite unfortunate that kaggle provides no checksums
-- OHH but it apparently checks whether the data was already downloaded, so I could maybe write directly into the cache.
downloadKaggle competition =
do
competition <- competition
shelly $ run "kaggle" ["competitions", "download", "-c", toS competition, "-p", "/tmp/"]
-- TODO: extract
let dir = ((++ "/") . toS) competition
cachePath <- cachePath
liftIO $ renameDirectory (combine "/tmp" dir) (combine (toFilePath cachePath) dir)
importLocalFile (parseAbsFile $ combine "/tmp" (toS competition))
tmpPath = asks getTmpPath
fetchKaggle = undefined
downloadKaggle competition =
do
competition <- competition
shelly . verbosely . print_stdout True . onCommandHandles (initOutputHandles (`System.IO.hSetBinaryMode` True)) $
run "kaggle" ["competitions", "download", "-c", toS competition, "-p", "/tmp/"]
let dir = ((++ "/") . toS) competition
shelly $ run "unzip" [toS . combine "/tmp" . (`addExtension` "zip") . toS $ competition, "-d", toS . combine "/tmp" $ dir]
cachePath <- cachePath
liftIO $ renameDirectory (combine "/tmp" dir) (combine (toFilePath cachePath) dir)
importLocalFile (parseAbsFile $ combine "/tmp" (toS competition))
-- What the fuck, why do I need to write this trivial code? There surely is a better
......
......@@ -51,7 +51,7 @@ cachedApplyCode cachePath obj args kwargs =
in let
objects = args ++ (snd <$> kwargs) ++ [obj]
in let
objCachePath = toFilePath cachePath </> (toS . format hex . hash . concatMap (toFilePath . pyModule . location) $ objects)
objCachePath = toFilePath cachePath </> (toS . format (base 36) . hash . concatMap (toFilePath . pyModule . location) $ objects)
in
(uncurry (format ("from " % text % " import " % text))
. (toImportPath . location &&& identifier)
......