Skip to content

Json exporting #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
EllipticalSliceSampling = "cad2338a-1db2-11e9-3401-43bc07c9ede2"
FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down Expand Up @@ -43,6 +44,7 @@ Distributions = "0.25"
EllipticalSliceSampling = "2.0.0"
FASTX = "2"
Interpolations = "0.15"
JSON = "0.21.4"
KrylovKit = "0.9"
LaTeXStrings = "1"
MCMCChains = "6"
Expand Down
6 changes: 4 additions & 2 deletions src/CodonMolecularEvolution.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module CodonMolecularEvolution

using FASTX, MolecularEvolution, StatsBase, Distributions, DataFrames, CSV, NLopt, ParameterHandling, LinearAlgebra, LaTeXStrings, Random
using NNlib, Distributions,SimpleUnPack, AbstractMCMC, Interpolations, MCMCChains
using FASTX, MolecularEvolution, StatsBase, Distributions, DataFrames, CSV, JSON, NLopt, ParameterHandling, LinearAlgebra, LaTeXStrings, Random
using NNlib, Distributions, SimpleUnPack, AbstractMCMC, Interpolations, MCMCChains
using PDMats, BenchmarkTools
using EllipticalSliceSampling
using KrylovKit
Expand All @@ -10,6 +10,7 @@ struct PlotsExtDummy end

include("shared/shared.jl")
include("shared/emptyplot.jl")
include("shared/dNdS2JSON.jl")
include("difFUBAR/difFUBAR.jl")
include("difFUBAR/grids.jl")
include("../test/benchmark_difFUBAR.jl")
Expand All @@ -19,6 +20,7 @@ include("simulations/alphabeta/alphabeta.jl")
include("simulations/ou_hb.jl")
include("FUBAR/gaussianFUBAR.jl")
include("FUBAR/grid_utilities.jl")

export
difFUBARBaseline,
difFUBARParallel,
Expand Down
2 changes: 1 addition & 1 deletion src/FUBAR/FUBAR.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ end
function alphabetagrid(seqnames::Vector{String}, seqs, treestring::String;
verbosity=1, code=MolecularEvolution.universal_code, optimize_branch_lengths=false)
tree = FUBAR_init(treestring, verbosity=verbosity)
tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit_2steps(seqnames, seqs, tree, x -> x, code, verbosity=verbosity, optimize_branch_lengths=optimize_branch_lengths)
tree, LL, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit_2steps(seqnames, seqs, tree, x -> x, code, verbosity=verbosity, optimize_branch_lengths=optimize_branch_lengths)
return FUBAR_grid(tree, GTRmat, F3x4_freqs, code, verbosity=verbosity)
end
function init_path(analysis_name)
Expand Down
32 changes: 20 additions & 12 deletions src/difFUBAR/difFUBAR.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,13 @@ function difFUBAR_global_fit(seqnames, seqs, tree, leaf_name_transform, code; ve

verbosity > 0 && println("Step 2: Optimizing global codon model parameters.")

tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = optimize_MG94_F3x4(seqnames, seqs, tree, leaf_name_transform=leaf_name_transform, genetic_code=code)
tree, LL, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = optimize_MG94_F3x4(seqnames, seqs, tree, leaf_name_transform=leaf_name_transform, genetic_code=code)

######
#optionally polish branch lengths and topology
######

return tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs
return tree, LL, alpha, beta, GTRmat, F3x4_freqs, eq_freqs
end

function difFUBAR_global_fit_2steps(seqnames, seqs, tree, leaf_name_transform, code; verbosity=1, optimize_branch_lengths=false)
Expand All @@ -113,20 +113,21 @@ function difFUBAR_global_fit_2steps(seqnames, seqs, tree, leaf_name_transform, c
end

GTRmat = reversibleQ(nuc_mu, ones(4))
tree, alpha, beta, F3x4_freqs, eq_freqs = optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat, leaf_name_transform=leaf_name_transform, genetic_code=code, verbosity=verbosity)
tree, LL,alpha, beta, F3x4_freqs, eq_freqs = optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat, leaf_name_transform=leaf_name_transform, genetic_code=code, verbosity=verbosity)

rescale_branchlengths!(tree, alpha) #rescale such that the ML value of alpha is 1

######
#optionally polish branch lengths and topology
######

return tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs
return tree, LL, alpha, beta, GTRmat, F3x4_freqs, eq_freqs
end

#foreground_grid and background_grid control the number of categories below 1.0
function difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4, version::Union{difFUBARGrid,Nothing}=nothing, t=0)

shallow_tree = copy_tree(tree, true)
log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, background_omega_grid, param_kinds, is_background, num_groups, num_sites = gridprep(tree, tags;
verbosity=verbosity,
foreground_grid=foreground_grid,
Expand All @@ -140,12 +141,12 @@ function difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foregr
verbosity=verbosity,
foreground_grid=foreground_grid,
background_grid=background_grid
)
)..., shallow_tree # In case of future tree surgery that may prune away nodes
end

function difFUBAR_sample(con_lik_matrix, iters; verbosity=1)
function difFUBAR_sample(con_lik_matrix, iters; burnin::Int=div(iters, 5), concentration=0.1, verbosity=1)
verbosity > 0 && println("Step 4: Running Gibbs sampler to infer site categories.")
alloc_grid, theta = LDA_gibbs_track_allocation_vec(con_lik_matrix, 0.1, iters=iters)
alloc_grid, theta = LDA_gibbs_track_allocation_vec(con_lik_matrix, concentration, iters=iters, burnin=burnin)
return alloc_grid, theta
end

Expand Down Expand Up @@ -291,12 +292,15 @@ Consistent with the docs of [`difFUBAR_tabulate_and_plot`](@ref), `results_tuple
- `tag_colors=DIFFUBAR_TAG_COLORS[sortperm(tags)]`: vector of tag colors (hex format). The default option is consistent with the difFUBAR paper (Foreground 1: red, Foreground 2: blue).
- `pos_thresh=0.95`: threshold of significance for the posteriors.
- `iters=2500`: iterations used in the Gibbs sampler.
- `burnin=div(iters, 5)`: burnin used in the Gibbs sampler.
- `concentration=0.1`: concentration parameter used for the Dirichlet prior.
- `binarize=false`: if true, the tree is binarized before the analysis.
- `verbosity=1`: as verbosity increases, prints are added accumulatively.
- 0 - no prints
- 1 - show current step and where output files are exported
- 2 - show the chosen `difFUBAR_grid` version and amount of parallel threads.
- `exports=true`: if true, output files are exported.
- `exports2json=false`: if true, the results are exported to a JSON file (HyPhy format).
- `code=MolecularEvolution.universal_code`: genetic code used for the analysis.
- `optimize_branch_lengths=false`: if true, the branch lengths of the phylogenetic tree are optimized.
- `version::Union{difFUBARGrid, Nothing}=nothing`: explicitly choose the version of `difFUBAR_grid` to use. If `nothing`, the version is heuristically chosen based on the available RAM and Julia threads.
Expand All @@ -305,16 +309,20 @@ Consistent with the docs of [`difFUBAR_tabulate_and_plot`](@ref), `results_tuple
!!! note
Julia starts up with a single thread of execution, by default. See [Starting Julia with multiple threads](https://docs.julialang.org/en/v1/manual/multi-threading/#Starting-Julia-with-multiple-threads).
"""
function difFUBAR(seqnames, seqs, treestring, tags, outpath; tag_colors=DIFFUBAR_TAG_COLORS[sortperm(tags)], pos_thresh=0.95, iters=2500, binarize=false, verbosity=1, exports=true, code=MolecularEvolution.universal_code, optimize_branch_lengths=false, version::Union{difFUBARGrid,Nothing}=nothing, t=0)
function difFUBAR(seqnames, seqs, treestring, tags, outpath; tag_colors=DIFFUBAR_TAG_COLORS[sortperm(tags)], pos_thresh=0.95, iters=2500, burnin::Int=div(iters, 5), concentration=0.1, binarize=false, verbosity=1, exports=true, exports2json=false, code=MolecularEvolution.universal_code, optimize_branch_lengths=false, version::Union{difFUBARGrid,Nothing}=nothing, t=0)
total_time = @elapsed begin
analysis_name = outpath
leaf_name_transform = generate_tag_stripper(tags)
plot_collection = NamedTuple[]
tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors=tag_colors, exports=exports, verbosity=verbosity, disable_binarize=!binarize, plot_collection = plot_collection)
tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit_2steps(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity, optimize_branch_lengths=optimize_branch_lengths)
con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code,
((tree, LL, alpha, beta, GTRmat, F3x4_freqs, eq_freqs), fit_time) = @timed difFUBAR_global_fit_2steps(seqnames, seqs, tree, leaf_name_transform, code, verbosity=verbosity, optimize_branch_lengths=optimize_branch_lengths)
((con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _, shallow_tree), grid_time) = @timed difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code,
verbosity=verbosity, foreground_grid=6, background_grid=4, version=version, t=t)
alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity)
((alloc_grid, theta), sample_time) = @timed difFUBAR_sample(con_lik_matrix, iters, burnin=burnin, concentration=concentration, verbosity=verbosity)
df, plots_named_tuple = difFUBAR_tabulate_and_plot(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors, verbosity=verbosity, exports=exports)

# If possible, bake this into tabulate
end
json = dNdS2JSON(difFUBAR2JSON(), (outpath=analysis_name, df=df, θ=theta, posterior_mat=alloc_grid ./ sum(alloc_grid, dims=1), categories=reduce(hcat, codon_param_vec)', tags=tags, tree=shallow_tree, LL=LL, timers = (total_time, fit_time, grid_time, sample_time), treestring=treestring, seqnames=seqnames, seqs=seqs, leaf_name_transform=leaf_name_transform, pos_thresh=pos_thresh, iters=iters, burnin=burnin, concentration=concentration, binarize=binarize, exports=exports2json))
#Return df, (tuple of partial calculations needed to re-run tablulate), plots
push!(plot_collection, plots_named_tuple)
return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors), merge(plot_collection...)
Expand Down
179 changes: 179 additions & 0 deletions src/shared/dNdS2JSON.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# We assume order doesn't matter so we just use `Dict`, otherwise `DataStructures.OrderedDict` could be used.
# I assume there's only one partition (HyPhy) "0" (can be generalized later)

const SHAREDJSONFIELDS = Dict(
"analysis" => ("info", "version", "citation", "authors", "contact", "requirements"),
"input" => ("file name", "number of sequences", "number of sites", "partition count", "trees"),
"fits" => ("Log Likelihood", "estimated parameters", "AIC-c", "Rate Distributions", "Nucleotide GTR"),
"data partitions" => ("0",),
"branch attributes" => ("0",),
"tested" => nothing, # specified later
"timers" => nothing # specified later
)

#=
Must contain the field `d`
A NamedTuple with (atleast) the fields: seqs, seqnames, treestring, timers should be passed along downstream
Must contain the method `model_stages(::Type{ConcretedNdS2JSON})` (should be static)
=#
abstract type dNdS2JSON end

# Rely on input2JSON! already having been called
sites(method::dNdS2JSON) = method.d["input"]["number of sites"]

#=
FUBAR-esque dNdS methods
Let us denote a concrete type by `ConcretedNdS2JSON <: FUBAREsque2JSON`
Must contain the method `expanded_headers(::Type{ConcretedNdS2JSON})`
A NamedTuple with (atleast) the fields: df, θ, posterior_mat, categories should be passed along downstream
=#
abstract type FUBAREsque2JSON <: dNdS2JSON end

struct difFUBAR2JSON <: FUBAREsque2JSON
d::Dict{String, Any}
model_stages::Tuple{Vararg{String}}

function difFUBAR2JSON()
new(Dict{String, Any}())
#We might need this... checking with Steve
#new(Dict{String, Any}("analysis" => Dict{String, Any}(zip(SHAREDJSONFIELDS["analysis"], analysis_info(difFUBAR2JSON)))))
end
end

analysis_info(::Type{difFUBAR2JSON}) = (
"Perform a site-wise comparison of evolutionary pressure between two selected sets of branches.",
"version...",
"citation...",
"authors...",
"contact...",
"requirements..."
)

expanded_headers(::Type{difFUBAR2JSON}) = (
"The codon site of interest",
"Posterior probability of dNdS for branch group 1 > dNdS for branch group 2",
"Posterior probability of dNdS for branch group 2 > dNdS for branch group 1",
"Posterior probability of positive selection for branch group 1 at a site",
"Posterior probability of positive selection for branch group 2 at a site",
"Mean posterior synonymous substitution rate at a site",
"Mean posterior dNdS for branch group 1 at a site",
"Mean posterior dNdS for branch group 2 at a site"
)

model_stages(::Type{difFUBAR2JSON}) = (
"Overall",
"Global Fit",
"Grid Calculations",
"MCMC on grid weight allocations"
)

function dNdS2JSON!(method::dNdS2JSON, nt::NamedTuple)
shared2JSON!(method, nt)
specialized2JSON!(method, nt)
end

function dNdS2JSON(method::dNdS2JSON, nt::NamedTuple)
dNdS2JSON!(method, nt)
json = JSON.json(method.d, 2)
if nt.exports
open(nt.outpath * ".json", "w") do io
write(io, json)
end
end
return json
end

function input2JSON!(method::dNdS2JSON, nt::NamedTuple)
seqnames, seqs, treestring = nt.seqnames, nt.seqs, nt.treestring
input_info = ("<CodonMolEv difFUBAR doesn't know>", length(seqnames), Int(length(seqs[1]) / 3), 1, Dict("0" => treestring))
method.d["input"] = Dict{String, Any}(zip(SHAREDJSONFIELDS["input"], input_info))
end

# 1 partition as default/fallback
function datapartitions2JSON!(method::dNdS2JSON, nt::NamedTuple)
num_sites = sites(method)
method.d["data partitions"] = Dict{String, Any}(
"0" => Dict{String, Any}(
"name" => "default",
"sites" => collect(1:num_sites)
)
)
end

function tested2JSON!(method::dNdS2JSON, nt::NamedTuple)
method.d["tested"] = 0
end

function timers2JSON!(method::dNdS2JSON, nt::NamedTuple)
inner_dict = Dict{String, Any}()
for (i, (stage, timer)) in enumerate(zip(model_stages(typeof(method)), nt.timers))
inner_dict[stage] = Dict("timer" => timer, "order" => i-1)
end
method.d["timers"] = inner_dict
end

function shared2JSON!(method::dNdS2JSON, nt::NamedTuple)
input2JSON!(method, nt)
fits2JSON!(method, nt)
datapartitions2JSON!(method, nt)
branchattributes2JSON!(method, nt)
tested2JSON!(method, nt)
timers2JSON!(method, nt)
end

function canonical_specialized2JSON!(method::FUBAREsque2JSON, nt::NamedTuple)
df, θ, posterior_mat, categories = nt.df, nt.θ, nt.posterior_mat, nt.categories
# MLE
headers = Tuple(zip(names(df), expanded_headers(typeof(method))))
method.d["MLE"] = Dict("headers" => headers, "content" => Dict("0" => eachrow(Matrix(df))))

# Grid
method.d["grid"] = eachrow(hcat(categories, θ))

# Posterior
method.d["posterior"] = Dict("0" => Dict(zip(0:sites(method)-1, eachcol(posterior_mat))))
end

function specialized2JSON!(method::difFUBAR2JSON, nt::NamedTuple)
canonical_specialized2JSON!(method, nt)
# Settings
method.d["settings"] = Dict{String, Any}(
"chain-length" => nt.iters,
"burn-in" => nt.burnin,
"concentration" => nt.concentration,
"posterior" => nt.pos_thresh
)
end

function fits2JSON!(method::difFUBAR2JSON, nt::NamedTuple)
method.d["fits"] = Dict{String, Any}("Log Likelihood" => nt.LL)
end


function branchattributes2JSON!(method::difFUBAR2JSON, nt::NamedTuple)
# Branch attributes
node_dict = Dict{String, Any}()
for n in nodes(nt.tree)
nd = Dict{String, Any}("Nucleotide GTR" => n.branchlength)
model_ind = CodonMolecularEvolution.model_ind(n.name, nt.tags)
nd["Branch group"] = model_ind > length(nt.tags) ? "background" : string(model_ind)
if isleafnode(n)
nd["original name"] = nt.leaf_name_transform(n.name)
end
node_dict[n.name] = nd
end
# Attributes description
attr_dict = Dict{String, Any}(
"Nucleotide GTR" => Dict("attribute type" => "branch length", "display order" => 0),
"Branch group" => Dict("attribute type" => "node label", "display order" => 1),
"original name" => Dict("attribute type" => "node label", "display order" => -1)
)
method.d["branch attributes"] = Dict("0" => node_dict, "attributes" => attr_dict)
end

function tested2JSON!(method::difFUBAR2JSON, nt::NamedTuple)
tags = nt.tags
f(s::String) = ifelse(CodonMolecularEvolution.model_ind(s, tags) > length(tags), "background", "test")
n = map(x -> x.name, nodes(nt.tree))
method.d["tested"] = Dict("0" => Dict(zip(n, f.(n))))
end
13 changes: 6 additions & 7 deletions src/shared/shared.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ end
"""
optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code)

Optimizes the MG94+F3x4 model on a tree, given a set of sequences and a tree. Returns the optimized tree, alpha, beta, nuc_matrix, F3x4, and eq_freqs.
Optimizes the MG94+F3x4 model on a tree, given a set of sequences and a tree. Returns the optimized tree, LL, alpha, beta, nuc_matrix, F3x4, and eq_freqs.
The leaf_name_transform kwarg can be used to transform the leaf names in the tree to match the seqnames.
"""
function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code)
Expand Down Expand Up @@ -241,12 +241,12 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge
lower_bounds!(opt, [-5.0 for i in 1:num_params])
upper_bounds!(opt, [5.0 for i in 1:num_params])
xtol_rel!(opt, 1e-12)
_, mini, _ = NLopt.optimize(opt, flat_initial_params)
LL, mini, _ = NLopt.optimize(opt, flat_initial_params)

final_params = unflatten(mini)

#tree, alpha, beta, nuc_matrix, F3x4, eq_freqs
return tree, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs
#tree, LL, alpha, beta, nuc_matrix, F3x4, eq_freqs
return tree, LL, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs
end

export optimize_MG94_F3x4
Expand Down Expand Up @@ -369,7 +369,7 @@ function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_t

verbosity > 0 && println("Optimized single α,β LL=$(max_obj) with α=$(alpha) and β=$(beta).")
#tree, alpha, beta, F3x4, eq_freqs
return tree, alpha, beta, f3x4, eq_freqs
return tree, max_obj, alpha, beta, f3x4, eq_freqs
end

function rescale_branchlengths!(tree, scale)
Expand All @@ -383,15 +383,14 @@ end
##########################################

#This should be in the main repo
function LDA_gibbs_track_allocation_vec(conditionals::Array{Float64,2}, alpha::Float64; iters=10000)
function LDA_gibbs_track_allocation_vec(conditionals::Array{Float64,2}, alpha::Float64; iters=10000, burnin::Int=div(iters, 5))
grid_size, num_sites = size(conditionals)
#This should instead track an integer allocation vec per MCMC iteration - will be smaller
alloc_grid = zeros(Int64, (grid_size, num_sites))
φ = zeros(grid_size)
θ = ones(grid_size) ./ grid_size
v = zeros(grid_size)
θsum = zeros(grid_size)
burnin::Int = div(iters, 5)
for iter = 1:iters
φ .= alpha
for i = 1:num_sites
Expand Down
2 changes: 1 addition & 1 deletion src/simulations/alphabeta/alphabeta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
function alphabeta_setup(seqnames, seqs, treestring;
verbosity=1, code=MolecularEvolution.universal_code, optimize_branch_lengths=false)
tree, _ = FUBAR_init("", treestring, exports=false, verbosity=verbosity)
tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit_2steps(seqnames, seqs, tree, x -> x, code, verbosity=verbosity, optimize_branch_lengths=optimize_branch_lengths)
tree, LL, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit_2steps(seqnames, seqs, tree, x -> x, code, verbosity=verbosity, optimize_branch_lengths=optimize_branch_lengths)
return tree, GTRmat, F3x4_freqs
end

Expand Down
Loading
Loading