Commit 7826f6c1 authored by Anders Sejr Hansen's avatar Anders Sejr Hansen

Add HMM classification code

parent 988916c3
% Batch_vbSPT_classify.m
% written by Anders Sejr Hansen (AndersSejrHansen@post.harvard.edu;
% @Anders_S_Hansen; https://anderssejrhansen.wordpress.com)
% License: GNU GPL v3
% Dependent functions:
% - vbSPT-1.1.3 and all associated functions.m
% - InferFrameRateFromName.m
% - EditRunInputFile_for_batch.m
clear; clc; close all;
% DESCRIPTION
% This file takes as input a list of MAT-files with SPT data, converts
% them to the vbSPT readable format and then classifies them according to
% the vbSPT protocol
% vbSPT is a sophisticated HMM-implementation for SPT data. Please see
% the citation below for full details on vbSPT:
% Person F,Lindn M, Unoson C, Elf J, Extracting intracellular reaction rates from single molecule tracking data, Nature Methods 10, 265?269 (2013). doi:10.1038/nmeth.2367
%%%%%%%%%%%%%%%%%%%%%%%% PREAMBLE %%%%%%%%%%%%%%%%%%%%%%%%
% add path for dependent functions:
addpath('Functions');
% Add paths that are needed to run the VB3 analysis
dir0= ['.', filesep, 'Functions', filesep, 'HMM_classification', filesep, 'vbSPT-1.1.3'];
addpath(genpath([dir0 filesep '.' filesep 'VB3']))
addpath(genpath([dir0 filesep '.' filesep 'HMMcore']))
addpath(genpath([dir0 filesep '.' filesep 'Tools']))
addpath(genpath([dir0 filesep '.' filesep 'external']))
addpath([dir0 filesep '.'])
disp('Added local vbSPT paths');
disp('------------------------------------');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEFINE KEY PARAMETERS
MinTrajLength = 2;
input_path = ['.', filesep, 'QC_data', filesep];
path_reformatted = ['.', filesep, 'QC_data_reformatted', filesep];
path_classified = ['.', filesep, 'HMM_first_QC_data', filesep];
maxHidden = 2;
%%%%%%%%%%%%%%%%%%%%%%%% STEP 1: DEFINE DATASETS %%%%%%%%%%%%%%%%%%%%%%%%%%
% run on all MAT-files in the input_path directory
MAT_files=dir([input_path,'*.mat']);
FileNames = ''; %for saving the actual file name
for iter = 1:length(MAT_files)
% get the relevant file name
Filenames{iter} = MAT_files(iter).name(1:end-4);
end
%%%%%%%%%%%%%%%%%%% STEP 2: REFORMAT TO vbSPT FORMAT %%%%%%%%%%%%%%%%%%%%
% loop over all MAT_files
% convert to vbSPT format
% save file in vbSPT format
disp(['Reformatting ', num2str(length(Filenames)), ' datasets to the vbSPT format']); tic;
for FileIter = 1:length(Filenames)
disp(['working on file ', num2str(FileIter), ' of ', num2str(length(Filenames)), ' total files']);
% load the data
clear trackedPar_QC CellTracks
load([input_path, Filenames{FileIter}, '.mat']);
% CONVERT TO vbSPT FORMAT
% make a cell array of trajectpries
CellTracks = cell(1);
iter = 1;
% REMOVE GAPS FROM TRAJECTORIES: vbSPT cannot handle gaps
for i = 1:length(trackedPar_QC)
if size(trackedPar_QC(i).xy,1) >= MinTrajLength
%Now slice up trajectory to get rid of gaps
currFrame = trackedPar_QC(i).Frame;
currTrack = trackedPar_QC(i).xy;
%check for gaps - expect a difference of 1 between frames if there
%are no gaps, so search for difference of 2:
Idx = find(diff(currFrame)==2);
%If there were no gaps, just save:
if isempty(Idx)
CellTracks{iter} = currTrack;
iter = iter + 1;
else
%Modify Idx:
Idx_slice = [0 Idx length(currFrame)];
%So there are gaps:
%Slice up trajectories if they satisfy MinTrajLength
for j=2:length(Idx_slice)
if size(currTrack(Idx_slice(j-1)+1:Idx_slice(j),:),1) >= MinTrajLength
CellTracks{iter} = currTrack(Idx_slice(j-1)+1:Idx_slice(j),:);
iter = iter + 1;
end
end
end
end
end
clear currFrame currTrack iter i Idx Idx_slice
% OK now you have a cell array in the vbSPT format, so now you can save
% the file, but first get the frame rate
LagTime = InferFrameRateFromName( Filenames{FileIter} );
% SAVE TRAJECTORIES IN THE vbSPT REFORMATTED FORM
% save the reformatted dataset
save([path_reformatted,Filenames{FileIter}, '_reformatted.mat'], 'CellTracks', 'LagTime');
end
toc;
%%%%%%%%%%%%%%%%% STEP 3: PERFORM vbSPT CLASSIFICATION %%%%%%%%%%%%%%%%%%%%
% loop over all MAT_files
% edit the "vbSPT_RunInputFileBatch" according to files names, paths and
% frame rates
% Perform vbSPT classification
% save the output
disp(['Performing vbSPT classification of ', num2str(length(Filenames)), ' datasets']);
for FileIter = 1:length(Filenames)
disp(['working on HMM-classification of file number ', num2str(FileIter), ' of ', num2str(length(Filenames)), ' total files']);
% load the data
clear LagTime CellTracks CellTrackViterbiClass
load([path_reformatted,Filenames{FileIter}, '_reformatted.mat']);
% UPDATE 2017-10-09
% I am getting a lot of weird conflicts, where iteration n, get's parts
% of its information from the RunInputFile from the previous run. So to
% deal with this, make a new RunInputFile for each run.
% Copy the master-file to directory 'FolderWithRunInputFiles' and then
% give each file a new number.
% first see if the file already exists:
if exist(['./FolderWithRunInputFiles/vbSPT_RunInputFileBatch', num2str(FileIter), '.m'])
delete(['./FolderWithRunInputFiles/vbSPT_RunInputFileBatch', num2str(FileIter), '.m']);
end
% now edit the RunInputFileBatch
% EDIT THE RunInputFile
inputfile = [path_reformatted,Filenames{FileIter}, '_reformatted.mat'];
outputfile = [path_classified,Filenames{FileIter}, '_classified.mat'];
% now edit the RunInputFile using a function
Finish = EditRunInputFile_for_batch( 'vbSPT_RunInputFileBatch.m', inputfile, outputfile, LagTime );
% now copy it to the directory:
copyfile('vbSPT_RunInputFileBatch.m', ['./FolderWithRunInputFiles/vbSPT_RunInputFileBatch', num2str(FileIter), '.m']);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%% Run vbSPT on the data %%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
R=VB3_HMManalysis(['./FolderWithRunInputFiles/vbSPT_RunInputFileBatch', num2str(FileIter), '.m']);
%Now distill out the key things:
CellTrackViterbiClass = R.Wbest.est2.viterbi;
%Display the output:
disp('====================================================');
%VB3_getResult('vbSPT_RunInputFileBatch');
% save the key vbSPT results to a metadata structure array
vbSPT_metadata = struct();
vbSPT_metadata.NumberOfStates = R.Wbest.N;
vbSPT_metadata.DiffusionConstants = R.Wbest.est.DdtMean/LagTime;
vbSPT_metadata.Occupancy = R.Wbest.est.Ptot;
vbSPT_metadata.TransitionMatrix = R.Wbest.est.Amean;
vbSPT_metadata.LagTime = LagTime;
% re-load CellTracks:
load([path_reformatted,Filenames{FileIter}, '_reformatted.mat'], 'CellTracks');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%% SAVE THE DATA %%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
save([path_classified,Filenames{FileIter}, '_classified.mat'], 'CellTracks', 'CellTrackViterbiClass', 'vbSPT_metadata', 'LagTime');
end
% Anders Sejr Hansen, October 2017
% Input fule for vbSPT to run in batch mode
% To run the HMM analysis manually type:
% >> VB3_HMManalysis('runinputfilename')
% Inputs
inputfile = './QC_data_reformatted/U2OS_C32_Halo-hCTCF_133Hz_pooled_QC_CD2_reformatted.mat';
trajectoryfield = 'CellTracks';
% Computing strategy
parallelize_config = 1;
parallel_start = 'theVBpool=gcp'; % executed before the parallelizable loop.
parallel_end = 'delete(theVBpool)'; % executed after the parallelizable loop.
% Saving options
outputfile = './HMM_first_QC_data/U2OS_C32_Halo-hCTCF_133Hz_pooled_QC_CD2_classified.mat';
jobID = 'Anders: vbSPT job is running';
% Data properties
timestep = 0.0075188; %timesep in [s]
dim = 2;
trjLmin = 2;
% Convergence and computation alternatives
runs = 3;
maxHidden = 2;
% Evaluate extra estimates including Viterbi paths
stateEstimate = 1; %set to 1 if you want each tracjectory classified
maxIter = []; % maximum number of VB iterations ([]: use default values).
relTolF = 1e-8; % convergence criterion for relative change in likelihood bound.
tolPar = []; % convergence criterion for M-step parameters (leave non-strict).
% Bootstrapping
bootstrapNum = 10;
fullBootstrap = 0;
% Limits for initial conditions
init_D = [0.001, 16]; % interval for diffusion constant initial guess [length^2/time] in same length units as the input data.
init_tD = [2, 20]*timestep; % interval for mean dwell time initial guess in [s].
% It is recommended to keep the initial tD guesses on the lower end of the expected spectrum.
% Prior distributions
% The priors are generated by taking the geometrical average of the initial guess range.
% units: same time units as timestep, same length unit as input data.
prior_type_D = 'mean_strength';
prior_D = 1; % prior diffusion constant [length^2/time] in same length units as the input data.
prior_Dstrength = 5; % strength of diffusion constant prior, number of pseudocounts (positive).
%% default prior choices (according nat. meth. 2013 paper)
prior_type_Pi = 'natmet13';
prior_piStrength = 5; % prior strength of initial state distribution (assumed uniform) in pseudocounts.
prior_type_A = 'natmet13';
prior_tD = 10*timestep; % prior dwell time in [s].
prior_tDstrength = 2*prior_tD/timestep; % transition rate strength (number of pseudocounts). Recommended to be at least 2*prior_tD/timestep.
%% new prior choices (for advanced users, as they are not yet systematically tested)
%prior_type_Pi = 'flat';
%prior_type_A = 'dwell_Bflat';
%prior_tD = 10*timestep; % prior dwell time in [s]. Must be greater than timestep (recommended > 2*timestep)
%prior_tDstd = 100*prior_tD; % standard deviation of prior dwell times [s].