Skip to content

Commit

Permalink
ENH Add early check for column headers in count()
Browse files Browse the repository at this point in the history
This also lead to a refactoring of the internal transformation machinery
which should make adding similar checks (or even a generic mechanism)
much simpler in the future.
  • Loading branch information
luispedro committed Jul 27, 2019
1 parent ccbfa11 commit 7f92dd6
Show file tree
Hide file tree
Showing 10 changed files with 133 additions and 33 deletions.
1 change: 1 addition & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Version 1.0.1+
* Call bwa mem so that it behaves in a deterministic way (independently of
the number of threads used)
* Add `include_fragments` option to orf_find
* Add early check for column headers in `count()`

Version 1.0.1 2019-07-05 by luispedro
* Fix bug with external modules and multiple fastQ inputs
Expand Down
6 changes: 6 additions & 0 deletions NGLess/BuiltinFunctions.hs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ builtinFunctions =
,Function (FuncName "mapstats") (Just NGLMappedReadSet) [] NGLCounts mapStatsArgs False []
,Function (FuncName "select") (Just NGLMappedReadSet) [] NGLMappedReadSet selectArgs False []
,Function (FuncName "count") (Just NGLMappedReadSet) [] NGLCounts countArgs False [FunctionCheckReturnAssigned]
,Function (FuncName "__check_count") (Just NGLMappedReadSet) [] NGLCounts countCheckArgs False []
,Function (FuncName "countfile") (Just NGLString) [ArgCheckFileReadable] NGLCounts [] False [FunctionCheckReturnAssigned]
,Function (FuncName "write") (Just NGLAny) [] NGLVoid writeArgs False []
,Function (FuncName "print") (Just NGLAny) [] NGLVoid [] False []
Expand Down Expand Up @@ -77,6 +78,11 @@ countArgs =
,ArgInformation "reference" False NGLString [ArgCheckMinVersion (0,8)]
]

countCheckArgs = countArgs ++
[ArgInformation "original_lno" False NGLInteger []
]


selectArgs =
[ArgInformation "keep_if" False (NGList NGLSymbol) [ArgCheckSymbol ["mapped", "unmapped", "unique"]]
,ArgInformation "drop_if" False (NGList NGLSymbol) [ArgCheckSymbol ["mapped", "unmapped", "unique"]]
Expand Down
4 changes: 3 additions & 1 deletion NGLess/Interpret.hs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ import NGLess.NGLEnvironment
import NGLess.NGError

import Interpretation.Map
import Interpretation.Count (executeCount)
import Interpretation.Count (executeCount, executeCountCheck)
import Interpretation.CountFile (executeCountFile)
import Interpretation.FastQ
import Interpretation.Write
Expand Down Expand Up @@ -273,6 +273,7 @@ interpretExpr (Lookup _ (Variable v)) = lookupVariable v >>= \case
Just r' -> return r'
interpretExpr (BuiltinConstant (Variable "STDIN")) = return (NGOString "/dev/stdin")
interpretExpr (BuiltinConstant (Variable "STDOUT")) = return (NGOString "/dev/stdout")
interpretExpr (BuiltinConstant (Variable "__VOID")) = return NGOVoid
interpretExpr (BuiltinConstant (Variable v)) = throwShouldNotOccur ("Unknown builtin constant '" ++ show v ++ "': it should not have been accepted.")
interpretExpr (ConstStr t) = return (NGOString t)
interpretExpr (ConstBool b) = return (NGOBool b)
Expand Down Expand Up @@ -330,6 +331,7 @@ interpretFunction' (FuncName "mapstats") expr args Nothing = runNGLessIO (execu
interpretFunction' (FuncName "__merge_samfiles") expr args Nothing = runNGLessIO (executeMergeSams expr args)
interpretFunction' (FuncName "select") expr args Nothing = runNGLessIO (executeSelect expr args)
interpretFunction' (FuncName "count") expr args Nothing = runNGLessIO (executeCount expr args)
interpretFunction' (FuncName "__check_count") expr args Nothing = runNGLessIO (executeCountCheck expr args)
interpretFunction' (FuncName "countfile") expr args Nothing = runNGLessIO (executeCountFile expr args)
interpretFunction' (FuncName "print") expr args Nothing = executePrint expr args
interpretFunction' (FuncName "paired") mate1 args Nothing = runNGLessIO (executePaired mate1 args)
Expand Down
88 changes: 59 additions & 29 deletions NGLess/Interpretation/Count.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
{-# OPTIONS_GHC -fno-full-laziness #-}
module Interpretation.Count
( executeCount
, executeCountCheck
, Annotator(..)
, CountOpts(..)
, AnnotationMode(..)
Expand Down Expand Up @@ -304,6 +305,32 @@ executeCount (NGOMappedReadSet rname istream mappedref) args = do
NGOCounts . File <$> performCount istream rname annotators opts
executeCount err _ = throwScriptError ("Invalid Type. Should be used NGOList or NGOAnnotatedSet but type was: " ++ show err)

executeCountCheck :: NGLessObject -> KwArgsValues -> NGLessIO NGLessObject
executeCountCheck _ kwargs = do
opts <- parseOptions Nothing kwargs
lno <- lookupIntegerOrScriptErrorDef (return 0) "hidden lno argument" "original_lno" kwargs
case optAnnotationMode opts of
AnnotateFunctionalMap fname -> do
columns <- C.runConduit $
conduitPossiblyCompressedFile fname
.| linesC
.| CAlg.enumerateC
.| (lastCommentOrHeader fname True >>= \case
Nothing -> return []
Just (_,ByteLine line) -> return $ B8.split '\t' line)
let missing = [f | f <- optFeatures opts, f `notElem` columns]
case missing of
[] -> return ()
ms -> do
let errormsg = [
"In call to count() [line ",
show lno,
"], missing features:"
] ++ concat [[" ", B8.unpack f] | f <- ms]
throwDataError (concat errormsg)
_ -> return ()
return NGOVoid

-- | The include_minus1 argument defaulted to False up to version 0.5. Now, it
-- defaults to true as it seems to be what most users expect.
defaultMinus1 :: NGLessIO Bool
Expand Down Expand Up @@ -557,6 +584,37 @@ normalizeCounts nmethod counts sizes
liftIO $ forM_ [1.. VUM.length counts - 1] (VUM.unsafeModify counts (* factor))
| otherwise = error "This should be unreachable code [normalizeCounts]"



-- lastCommentOrHeader :: Monad m => C.ConduitT (Int, ByteLine) () m (Maybe (Int, ByteLine))
lastCommentOrHeader fname newAPI = C.await >>= \case
Nothing -> return Nothing
Just f@(_,ByteLine line) ->
if isComment line
then lastCommentOrHeader' (Just f)
else return (Just f)
where
lastCommentOrHeader' prev = C.await >>= \case
Nothing -> return prev
Just f@(_, ByteLine line)
| isComment line ->
if newAPI
then lastCommentOrHeader' (Just f)
else do
lift $ outputListLno' WarningOutput versionChangeWarning
C.leftover f
return (Just f)
| otherwise -> do
C.leftover f
return prev
isComment line
| B.null line = True
| otherwise = B8.head line == '#'
versionChangeWarning =
["Loading '", fname, "': found several lines at the top starting with '#'.\n",
"The interpretation of these changed in NGLess 1.1 (they are now considered comment lines).\n",
"Using the older version for backwards compatibility.\n"]

{- This object keeps the state for iterating over the lines in the annotation
- file.
-}
Expand All @@ -583,7 +641,7 @@ loadFunctionalMap fname columns = do
.| linesC
.| CAlg.enumerateC
.| (do
hline <- lastCommentOrHeader (v >= NGLVersion 1 1)
hline <- lastCommentOrHeader fname (v >= NGLVersion 1 1)
(cis,tags) <- case hline of
Nothing -> throwDataError ("Empty map file: "++fname)
Just (_, ByteLine header) -> let headers = B8.split '\t' header
Expand All @@ -596,34 +654,6 @@ loadFunctionalMap fname columns = do
outputListLno' TraceOutput ["Loading of map file '", fname, "' complete"]
return $! sortOn (\(GeneMapAnnotator tag _ _) -> tag) anns
where
-- lastCommentOrHeader :: Monad m => C.ConduitT (Int, ByteLine) () m (Maybe (Int, ByteLine))
lastCommentOrHeader newAPI = C.await >>= \case
Nothing -> return Nothing
Just f@(_,ByteLine line) ->
if isComment line
then lastCommentOrHeader' (Just f)
else return (Just f)
where
lastCommentOrHeader' prev = C.await >>= \case
Nothing -> return prev
Just f@(_, ByteLine line)
| isComment line ->
if newAPI
then lastCommentOrHeader' (Just f)
else do
lift $ outputListLno' WarningOutput versionChangeWarning
C.leftover f
return (Just f)
| otherwise -> do
C.leftover f
return prev
isComment line
| B.null line = True
| otherwise = B8.head line == '#'
versionChangeWarning =
["Loading '", fname, "': found several lines at the top starting with '#'.\n",
"The interpretation of these changed in NGLess 1.1 (they are now considered comment lines).\n",
"Using the older version for backwards compatibility.\n"]

finishFunctionalMap :: B.ByteString -> LoadFunctionalMapState -> Annotator
finishFunctionalMap tag (LoadFunctionalMapState _ gmap namemap) = GeneMapAnnotator
Expand Down
43 changes: 40 additions & 3 deletions NGLess/Transform.hs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ transform mods sc = Script (nglHeader sc) <$> applyM transforms (nglBody sc)
, addFileChecks
, addIndexChecks
, addUseNewer
, addCountsCheck
]

pureRecursiveTransform :: (Expression -> Expression) -> Expression -> Expression
Expand Down Expand Up @@ -226,6 +227,7 @@ substrimReassign = pureTransform $ \case
addFileChecks :: [(Int,Expression)] -> NGLessIO [(Int, Expression)]
-- this is easier to do on the reversed script
addFileChecks sc = reverse <$> (checkIFiles (reverse sc) >>= checkOFiles)
-- convert to genericCheckUpfloat
where
-- This could be combined into a single pass
-- For script preprocessing, we generally disregard performance, however
Expand Down Expand Up @@ -301,7 +303,7 @@ addFileChecks' checkFname tag ((lno,e):rest) = do
--
-- write(input, ofile="output/"+variable+".sam")
addIndexChecks :: [(Int,Expression)] -> NGLessIO [(Int, Expression)]
addIndexChecks sc = reverse <$> addIndexChecks' (reverse sc)
addIndexChecks sc = reverse <$> addIndexChecks' (reverse sc) -- TODO: convert to genericCheckUpfloat
addIndexChecks' :: [(Int,Expression)] -> NGLessIO [(Int, Expression)]
addIndexChecks' [] = return []
addIndexChecks' ((lno,e):rest) = do
Expand All @@ -312,8 +314,6 @@ addIndexChecks' ((lno,e):rest) = do
return ((lno,e):rest')

where
-- The similarity of this code and the code for addFileChecks hints at
-- a possible merging with a good abstraction
maybeAddChecks :: [(Variable,Expression)] -> [(Int, Expression)] -> [(Int, Expression)]
maybeAddChecks [(v,ix)] [] = [(0, indexCheckExpr v ix)]
maybeAddChecks vars@[(v,ix)] ((lno',e'):rest') = case e' of
Expand All @@ -330,6 +330,31 @@ addIndexChecks' ((lno,e):rest) = do
Nothing


genericCheckUpfloat :: ((Int, Expression) -> Maybe ([Variable],Expression))
-> [(Int, Expression)]
-> NGLessIO [(Int, Expression)]
genericCheckUpfloat f exprs = reverse <$> genericCheckUpfloat' f (reverse exprs)
genericCheckUpfloat' _ [] = return []
genericCheckUpfloat' f (c:rest) = do
let rest' = case recursiveCall f c of
Nothing -> rest
Just (vars, ne) -> floatDown vars (fst c,ne) rest
rest'' <- genericCheckUpfloat' f rest'
return (c:rest'')

recursiveCall :: ((Int, Expression) -> Maybe a) -> (Int, Expression) -> Maybe a
recursiveCall f (lno, e) = evalCont $ callCC $ \exit -> do
flip recursiveAnalyse e (\sub -> case f (lno, sub) of
Nothing -> return ()
j -> exit j)
return Nothing

floatDown :: [Variable] -> (Int, Expression) -> [(Int, Expression)] -> [(Int, Expression)]
floatDown _ e [] = [e]
floatDown vars e (c:rest)
| any (`isVarUsed1` (snd c)) vars = (e:c:rest)
| otherwise = (c:floatDown vars e rest)

-- | Implements addition of temp$nn variables to simplify expressions
--
-- This allows the rest of the code to be simpler
Expand Down Expand Up @@ -478,3 +503,15 @@ addUseNewer exprs = do
mapM (secondM addUseNewer') exprs


addCountsCheck :: [(Int, Expression)] -> NGLessIO [(Int, Expression)]
addCountsCheck = genericCheckUpfloat countCheck
where
countCheck (lno, FunctionCall (FuncName "count") _ kwargs Nothing) = Just (extractVars kwargs, buildCheck lno kwargs)
countCheck _ = Nothing
buildCheck lno kwargs =
FunctionCall
(FuncName "__check_count")
(BuiltinConstant (Variable "__VOID"))
((Variable "original_lno", ConstInt (toInteger lno)):kwargs)
Nothing
extractVars kwargs = concat (usedVariables . snd <$> kwargs)
4 changes: 4 additions & 0 deletions docs/sources/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ User-visible improvements
<https://ngless.embl.de/Functions.html#count>`__ to accept multiple comment
lines at the top of the file as produced by `eggnog-mapper
<http://eggnog-mapper.embl.de/>`__.
- Faster check for column headers in ``functional_map`` argument to `count()
<https://ngless.embl.de/Functions.html#count>`__ function: now it is
performed *as soon as possible* (including at the top of the script if the
arguments are literal strings), thus NGLess can fail faster.


Version 1.0.1
Expand Down
1 change: 1 addition & 0 deletions tests/error-map-file/basic.sam
7 changes: 7 additions & 0 deletions tests/error-map-file/check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

if test -f should.not.be.created.txt; then
exit 1
else
exit 0
fi
1 change: 1 addition & 0 deletions tests/error-map-file/functional.map
11 changes: 11 additions & 0 deletions tests/error-map-file/non-existent-col.ngl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
ngless '1.1'
mapped = samfile('basic.sam')

# The first count() function is correct, but the second one contains an error
# ("kot" is a non-existent)

counted = count(mapped, features=['ko'], multiple={all1}, functional_map='functional.map')
write(counted, ofile='should.not.be.created.txt')

counted = count(mapped, features=['kot'], multiple={all1}, functional_map='functional.map')
write(counted, ofile='error.txt')

0 comments on commit 7f92dd6

Please sign in to comment.