finemo.main

Main CLI module for the Fi-NeMo motif instance calling pipeline.

This module provides the command-line interface for all Fi-NeMo operations:

  • Data preprocessing from various genomic formats
  • Motif hit calling using the Fi-NeMo algorithm
  • Report generation and result visualization
  • Post-processing operations (hit collapsing, intersection)

The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet), and TF-MoDISco format.

   1"""Main CLI module for the Fi-NeMo motif instance calling pipeline.
   2
   3This module provides the command-line interface for all Fi-NeMo operations:
   4- Data preprocessing from various genomic formats
   5- Motif hit calling using the Fi-NeMo algorithm
   6- Report generation and result visualization
   7- Post-processing operations (hit collapsing, intersection)
   8
   9The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet),
  10and TF-MoDISco format.
  11"""
  12
  13from . import data_io
  14
  15import os
  16import argparse
  17import warnings
  18import inspect
  19from typing import Optional, List
  20
  21import polars as pl
  22
  23
  24def extract_regions_bw(
  25    peaks_path: str,
  26    chrom_order_path: Optional[str],
  27    fa_path: str,
  28    bw_paths: List[str],
  29    out_path: str,
  30    region_width: int = 1000,
  31) -> None:
  32    """Extract genomic regions and contribution scores from bigWig and FASTA files.
  33
  34    Parameters
  35    ----------
  36    peaks_path : str
  37        Path to ENCODE NarrowPeak format file.
  38    chrom_order_path : str, optional
  39        Path to chromosome ordering file.
  40    fa_path : str
  41        Path to genome FASTA file.
  42    bw_paths : List[str]
  43        List of bigWig file paths containing contribution scores.
  44    out_path : str
  45        Output path for NPZ file.
  46    region_width : int, default 1000
  47        Width of regions to extract around peak summits.
  48
  49    Notes
  50    -----
  51    BigWig files only provide projected contribution scores.
  52    """
  53    half_width = region_width // 2
  54
  55    # Load peak regions and extract sequences/contributions
  56    peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
  57    sequences, contribs = data_io.load_regions_from_bw(
  58        peaks_df, fa_path, bw_paths, half_width
  59    )
  60
  61    # Save processed data to NPZ format
  62    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
  63
  64
  65def extract_regions_chrombpnet_h5(
  66    peaks_path: Optional[str],
  67    chrom_order_path: Optional[str],
  68    h5_paths: List[str],
  69    out_path: str,
  70    region_width: int = 1000,
  71) -> None:
  72    """Extract genomic regions and contribution scores from ChromBPNet HDF5 files.
  73
  74    Parameters
  75    ----------
  76    peaks_path : str, optional
  77        Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
  78    chrom_order_path : str, optional
  79        Path to chromosome ordering file.
  80    h5_paths : List[str]
  81        List of ChromBPNet HDF5 file paths.
  82    out_path : str
  83        Output path for NPZ file.
  84    region_width : int, default 1000
  85        Width of regions to extract around peak summits.
  86    """
  87    half_width = region_width // 2
  88
  89    if peaks_path is not None:
  90        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
  91    else:
  92        peaks_df = None
  93
  94    sequences, contribs = data_io.load_regions_from_chrombpnet_h5(h5_paths, half_width)
  95
  96    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
  97
  98
  99def extract_regions_bpnet_h5(
 100    peaks_path: Optional[str],
 101    chrom_order_path: Optional[str],
 102    h5_paths: List[str],
 103    out_path: str,
 104    region_width: int = 1000,
 105) -> None:
 106    """Extract genomic regions and contribution scores from BPNet HDF5 files.
 107
 108    Parameters
 109    ----------
 110    peaks_path : str, optional
 111        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
 112    chrom_order_path : str, optional
 113        Path to chromosome ordering file.
 114    h5_paths : List[str]
 115        List of BPNet HDF5 file paths.
 116    out_path : str
 117        Output path for NPZ file.
 118    region_width : int, default 1000
 119        Width of regions to extract around peak summits.
 120    """
 121    half_width = region_width // 2
 122
 123    if peaks_path is not None:
 124        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
 125    else:
 126        peaks_df = None
 127
 128    sequences, contribs = data_io.load_regions_from_bpnet_h5(h5_paths, half_width)
 129
 130    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
 131
 132
 133def extract_regions_modisco_fmt(
 134    peaks_path: Optional[str],
 135    chrom_order_path: Optional[str],
 136    shaps_paths: List[str],
 137    ohe_path: str,
 138    out_path: str,
 139    region_width: int = 1000,
 140) -> None:
 141    """Extract genomic regions and contribution scores from TF-MoDISco format files.
 142
 143    Parameters
 144    ----------
 145    peaks_path : str, optional
 146        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
 147    chrom_order_path : str, optional
 148        Path to chromosome ordering file.
 149    shaps_paths : List[str]
 150        List of paths to .npy/.npz files containing SHAP/attribution scores.
 151    ohe_path : str
 152        Path to .npy/.npz file containing one-hot encoded sequences.
 153    out_path : str
 154        Output path for NPZ file.
 155    region_width : int, default 1000
 156        Width of regions to extract around peak summits.
 157    """
 158    half_width = region_width // 2
 159
 160    if peaks_path is not None:
 161        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
 162    else:
 163        peaks_df = None
 164
 165    sequences, contribs = data_io.load_regions_from_modisco_fmt(
 166        shaps_paths, ohe_path, half_width
 167    )
 168
 169    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
 170
 171
 172def call_hits(
 173    regions_path: str,
 174    peaks_path: Optional[str],
 175    modisco_h5_path: str,
 176    chrom_order_path: Optional[str],
 177    motifs_include_path: Optional[str],
 178    motif_names_path: Optional[str],
 179    motif_lambdas_path: Optional[str],
 180    out_dir: str,
 181    cwm_trim_coords_path: Optional[str] = None,
 182    cwm_trim_thresholds_path: Optional[str] = None,
 183    cwm_trim_threshold_default: float = 0.3,
 184    lambda_default: float = 0.7,
 185    step_size_max: float = 3.0,
 186    step_size_min: float = 0.08,
 187    sqrt_transform: bool = False,
 188    convergence_tol: float = 0.0005,
 189    max_steps: int = 10000,
 190    batch_size: int = 2000,
 191    step_adjust: float = 0.7,
 192    device: Optional[str] = None,
 193    mode: str = "pp",
 194    no_post_filter: bool = False,
 195    compile_optimizer: bool = False,
 196) -> None:
 197    """Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.
 198
 199    This function implements the core Fi-NeMo hit calling pipeline, which identifies
 200    motif instances by solving a sparse reconstruction problem using proximal gradient
 201    descent. The algorithm represents contribution scores as weighted combinations of
 202    motif CWMs at specific positions.
 203
 204    Parameters
 205    ----------
 206    regions_path : str
 207        Path to NPZ file containing preprocessed regions (sequences, contributions,
 208        and optional peak coordinates).
 209    peaks_path : str, optional
 210        DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be
 211        included during preprocessing instead.
 212    modisco_h5_path : str
 213        Path to TF-MoDISco H5 file containing motif CWMs.
 214    chrom_order_path : str, optional
 215        DEPRECATED. Path to chromosome ordering file.
 216    motifs_include_path : str, optional
 217        Path to file listing motif names to include in analysis.
 218    motif_names_path : str, optional
 219        Path to file mapping motif IDs to custom names.
 220    motif_lambdas_path : str, optional
 221        Path to file specifying per-motif lambda values.
 222    out_dir : str
 223        Output directory for results.
 224    cwm_trim_coords_path : str, optional
 225        Path to file specifying custom motif trimming coordinates.
 226    cwm_trim_thresholds_path : str, optional
 227        Path to file specifying custom motif trimming thresholds.
 228    cwm_trim_threshold_default : float, default 0.3
 229        Default threshold for motif trimming.
 230    lambda_default : float, default 0.7
 231        Default L1 regularization weight.
 232    step_size_max : float, default 3.0
 233        Maximum optimization step size.
 234    step_size_min : float, default 0.08
 235        Minimum optimization step size.
 236    sqrt_transform : bool, default False
 237        Whether to apply signed square root transform to contributions.
 238    convergence_tol : float, default 0.0005
 239        Convergence tolerance for duality gap.
 240    max_steps : int, default 10000
 241        Maximum number of optimization steps.
 242    batch_size : int, default 2000
 243        Batch size for GPU processing.
 244    step_adjust : float, default 0.7
 245        Step size adjustment factor on divergence.
 246    device : str, optional
 247        DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
 248    mode : str, default "pp"
 249        Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
 250    no_post_filter : bool, default False
 251        If True, skip post-hit-calling similarity filtering.
 252    compile_optimizer : bool, default False
 253        Whether to JIT-compile the optimizer for speed.
 254
 255    Notes
 256    -----
 257    The Fi-NeMo algorithm solves the optimization problem:
 258    minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁
 259    subject to: c ≥ 0
 260
 261    where c represents motif hit coefficients and reconstruction uses convolution
 262    with motif CWMs.
 263    """
 264
 265    params = locals()
 266    import torch
 267    from . import hitcaller
 268
 269    if device is not None:
 270        warnings.warn(
 271            "The `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device."
 272        )
 273
 274    sequences, contribs, peaks_df, has_peaks = data_io.load_regions_npz(regions_path)
 275
 276    region_width = sequences.shape[2]
 277    if region_width % 2 != 0:
 278        raise ValueError(f"Region width of {region_width} is not divisible by 2.")
 279
 280    half_width = region_width // 2
 281    num_regions = contribs.shape[0]
 282
 283    if peaks_path is not None:
 284        warnings.warn(
 285            "Providing a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
 286        )
 287        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
 288        has_peaks = True
 289
 290    if not has_peaks:
 291        warnings.warn(
 292            "No peak region data provided. Output hits will lack absolute genomic coordinates."
 293        )
 294
 295    if mode == "pp":
 296        motif_type = "cwm"
 297        use_hypothetical_contribs = False
 298    elif mode == "ph":
 299        motif_type = "cwm"
 300        use_hypothetical_contribs = True
 301    elif mode == "hp":
 302        motif_type = "hcwm"
 303        use_hypothetical_contribs = False
 304    elif mode == "hh":
 305        motif_type = "hcwm"
 306        use_hypothetical_contribs = True
 307    else:
 308        raise ValueError(
 309            f"Invalid mode: {mode}. Must be one of 'pp', 'ph', 'hp', 'hh'."
 310        )
 311
 312    if motifs_include_path is not None:
 313        motifs_include = data_io.load_txt(motifs_include_path)
 314    else:
 315        motifs_include = None
 316
 317    if motif_names_path is not None:
 318        motif_name_map = data_io.load_mapping(motif_names_path, str)
 319    else:
 320        motif_name_map = None
 321
 322    if motif_lambdas_path is not None:
 323        motif_lambdas = data_io.load_mapping(motif_lambdas_path, float)
 324    else:
 325        motif_lambdas = None
 326
 327    if cwm_trim_coords_path is not None:
 328        trim_coords = data_io.load_mapping_tuple(cwm_trim_coords_path, int)
 329    else:
 330        trim_coords = None
 331
 332    if cwm_trim_thresholds_path is not None:
 333        trim_thresholds = data_io.load_mapping(cwm_trim_thresholds_path, float)
 334    else:
 335        trim_thresholds = None
 336
 337    motifs_df, cwms, trim_masks, _ = data_io.load_modisco_motifs(
 338        modisco_h5_path,
 339        trim_coords,
 340        trim_thresholds,
 341        cwm_trim_threshold_default,
 342        motif_type,
 343        motifs_include,
 344        motif_name_map,
 345        motif_lambdas,
 346        lambda_default,
 347        True,
 348    )
 349    num_motifs = cwms.shape[0]
 350    motif_width = cwms.shape[2]
 351    lambdas = motifs_df.get_column("lambda").to_numpy(writable=True)
 352
 353    device_obj = torch.device(device) if device is not None else None
 354    hits_df, qc_df = hitcaller.fit_contribs(
 355        cwms,
 356        contribs,
 357        sequences,
 358        trim_masks,
 359        use_hypothetical_contribs,
 360        lambdas,
 361        step_size_max,
 362        step_size_min,
 363        sqrt_transform,
 364        convergence_tol,
 365        max_steps,
 366        batch_size,
 367        step_adjust,
 368        not no_post_filter,
 369        device_obj,
 370        compile_optimizer,
 371    )
 372
 373    os.makedirs(out_dir, exist_ok=True)
 374    out_path_qc = os.path.join(out_dir, "peaks_qc.tsv")
 375    data_io.write_hits(hits_df, peaks_df, motifs_df, qc_df, out_dir, motif_width)
 376    data_io.write_qc(qc_df, peaks_df, out_path_qc)
 377
 378    out_path_motif_df = os.path.join(out_dir, "motif_data.tsv")
 379    data_io.write_motifs_df(motifs_df, out_path_motif_df)
 380
 381    out_path_motif_cwms = os.path.join(out_dir, "motif_cwms.npy")
 382    data_io.write_motif_cwms(cwms, out_path_motif_cwms)
 383
 384    params |= {
 385        "region_width": region_width,
 386        "num_regions": num_regions,
 387        "untrimmed_motif_width": motif_width,
 388        "num_motifs": num_motifs,
 389    }
 390
 391    out_path_params = os.path.join(out_dir, "parameters.json")
 392    data_io.write_params(params, out_path_params)
 393
 394
 395def report(
 396    regions_path: str,
 397    hits_dir: str,
 398    modisco_h5_path: Optional[str],
 399    peaks_path: Optional[str],
 400    motifs_include_path: Optional[str],
 401    motif_names_path: Optional[str],
 402    out_dir: str,
 403    modisco_region_width: int = 400,
 404    cwm_trim_threshold: float = 0.3,
 405    compute_recall: bool = True,
 406    use_seqlets: bool = True,
 407) -> None:
 408    """Generate comprehensive HTML report with statistics and visualizations.
 409
 410    This function creates detailed analysis reports comparing Fi-NeMo hit calling
 411    results with TF-MoDISco seqlets, including performance metrics, distribution
 412    plots, and motif visualization. The report provides insights into hit calling
 413    quality and motif discovery accuracy.
 414
 415    Parameters
 416    ----------
 417    regions_path : str
 418        Path to NPZ file containing the same regions used for hit calling.
 419    hits_dir : str
 420        Path to directory containing Fi-NeMo hit calling outputs.
 421    modisco_h5_path : str, optional
 422        Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
 423    peaks_path : str, optional
 424        DEPRECATED. Peak coordinates should be included in regions file.
 425    motifs_include_path : str, optional
 426        DEPRECATED. This information is inferred from hit calling outputs.
 427    motif_names_path : str, optional
 428        DEPRECATED. This information is inferred from hit calling outputs.
 429    out_dir : str
 430        Output directory for report files.
 431    modisco_region_width : int, default 400
 432        Width of regions used by TF-MoDISco (needed for coordinate conversion).
 433    cwm_trim_threshold : float, default 0.3
 434        DEPRECATED. This information is inferred from hit calling outputs.
 435    compute_recall : bool, default True
 436        Whether to compute recall metrics against TF-MoDISco seqlets.
 437    use_seqlets : bool, default True
 438        Whether to include seqlet-based comparisons in the report.
 439
 440    Notes
 441    -----
 442    The generated report includes:
 443    - Hit vs seqlet count comparisons
 444    - Motif CWM visualizations
 445    - Hit statistic distributions
 446    - Co-occurrence heatmaps
 447    - Confusion matrices for overlapping motifs
 448    """
 449    from . import evaluation, visualization
 450
 451    sequences, contribs, peaks_df, _ = data_io.load_regions_npz(regions_path)
 452    if len(contribs.shape) == 3:
 453        regions = contribs * sequences
 454    elif len(contribs.shape) == 2:
 455        regions = contribs[:, None, :] * sequences
 456    else:
 457        raise ValueError(f"Unexpected contribs shape: {contribs.shape}")
 458
 459    half_width = regions.shape[2] // 2
 460    modisco_half_width = modisco_region_width // 2
 461
 462    if peaks_path is not None:
 463        warnings.warn(
 464            "Providing a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
 465        )
 466        peaks_df = data_io.load_peaks(peaks_path, None, half_width)
 467
 468    if hits_dir.endswith(".tsv"):
 469        warnings.warn(
 470            "Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead."
 471        )
 472
 473        hits_path = hits_dir
 474
 475        hits_df = data_io.load_hits(hits_path, lazy=True)
 476
 477        if motifs_include_path is not None:
 478            motifs_include = data_io.load_txt(motifs_include_path)
 479        else:
 480            motifs_include = None
 481
 482        if motif_names_path is not None:
 483            motif_name_map = data_io.load_mapping(motif_names_path, str)
 484        else:
 485            motif_name_map = None
 486
 487        if modisco_h5_path is not None:
 488            motifs_df, cwms_modisco, _, motif_names = data_io.load_modisco_motifs(
 489                modisco_h5_path,
 490                None,
 491                None,
 492                cwm_trim_threshold,
 493                "cwm",
 494                motifs_include,
 495                motif_name_map,
 496                None,
 497                1.0,
 498                True,
 499            )
 500        else:
 501            # When no modisco_h5_path is provided in legacy TSV mode, we can't compute motifs
 502            # This will cause an error later, but that's expected behavior
 503            raise ValueError(
 504                "modisco_h5_path is required when providing a hits.tsv file directly"
 505            )
 506
 507    else:
 508        hits_df_path = os.path.join(hits_dir, "hits.tsv")
 509        hits_df = data_io.load_hits(hits_df_path, lazy=True)
 510
 511        motifs_df_path = os.path.join(hits_dir, "motif_data.tsv")
 512        motifs_df, motif_names = data_io.load_motifs_df(motifs_df_path)
 513
 514        cwms_modisco_path = os.path.join(hits_dir, "motif_cwms.npy")
 515        cwms_modisco = data_io.load_motif_cwms(cwms_modisco_path)
 516
 517        params_path = os.path.join(hits_dir, "parameters.json")
 518        params = data_io.load_params(params_path)
 519        cwm_trim_threshold = params["cwm_trim_threshold_default"]
 520
 521    if not use_seqlets:
 522        warnings.warn(
 523            "Usage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead."
 524        )
 525        seqlets_df = None
 526    elif modisco_h5_path is None:
 527        compute_recall = False
 528        seqlets_df = None
 529    else:
 530        seqlets_df = data_io.load_modisco_seqlets(
 531            modisco_h5_path,
 532            peaks_df,
 533            motifs_df,
 534            half_width,
 535            modisco_half_width,
 536            lazy=True,
 537        )
 538
 539    motif_width = cwms_modisco.shape[2]
 540
 541    # Convert to LazyFrame if needed and ensure motif_names is a list
 542    if isinstance(hits_df, pl.LazyFrame):
 543        hits_df_lazy: pl.LazyFrame = hits_df
 544    else:
 545        hits_df_lazy: pl.LazyFrame = hits_df.lazy()
 546
 547    motif_names_list: List[str] = list(motif_names)
 548
 549    occ_df, coooc = evaluation.get_motif_occurences(hits_df_lazy, motif_names_list)
 550
 551    report_data, report_df, cwms, trim_bounds = evaluation.tfmodisco_comparison(
 552        regions,
 553        hits_df,
 554        peaks_df,
 555        seqlets_df,
 556        motifs_df,
 557        cwms_modisco,
 558        motif_names_list,
 559        modisco_half_width,
 560        motif_width,
 561        compute_recall,
 562    )
 563
 564    if seqlets_df is not None:
 565        confusion_df, confusion_mat = evaluation.seqlet_confusion(
 566            hits_df, seqlets_df, peaks_df, motif_names_list, motif_width
 567        )
 568    else:
 569        confusion_df, confusion_mat = None, None
 570
 571    os.makedirs(out_dir, exist_ok=True)
 572
 573    occ_path = os.path.join(out_dir, "motif_occurrences.tsv")
 574    data_io.write_occ_df(occ_df, occ_path)
 575
 576    data_io.write_report_data(report_df, cwms, out_dir)
 577
 578    visualization.plot_hit_stat_distributions(hits_df_lazy, motif_names_list, out_dir)
 579    visualization.plot_hit_peak_distributions(occ_df, motif_names_list, out_dir)
 580    visualization.plot_peak_motif_indicator_heatmap(coooc, motif_names_list, out_dir)
 581
 582    plot_dir = os.path.join(out_dir, "CWMs")
 583    visualization.plot_cwms(cwms, trim_bounds, plot_dir)
 584
 585    if seqlets_df is not None:
 586        seqlets_collected = (
 587            seqlets_df.collect() if isinstance(seqlets_df, pl.LazyFrame) else seqlets_df
 588        )
 589        seqlets_path = os.path.join(out_dir, "seqlets.tsv")
 590        data_io.write_modisco_seqlets(seqlets_collected, seqlets_path)
 591
 592        if confusion_df is not None and confusion_mat is not None:
 593            seqlet_confusion_path = os.path.join(out_dir, "seqlet_confusion.tsv")
 594            data_io.write_seqlet_confusion_df(confusion_df, seqlet_confusion_path)
 595
 596            visualization.plot_hit_vs_seqlet_counts(report_data, out_dir)
 597            visualization.plot_seqlet_confusion_heatmap(
 598                confusion_mat, motif_names_list, out_dir
 599            )
 600
 601    report_path = os.path.join(out_dir, "report.html")
 602    visualization.write_report(
 603        report_df, motif_names_list, report_path, compute_recall, seqlets_df is not None
 604    )
 605
 606
 607def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None:
 608    """Collapse overlapping hits by selecting the best hit per overlapping group.
 609
 610    This function processes a set of motif hits and identifies overlapping hits,
 611    keeping only the hit with the highest similarity score within each overlapping
 612    group. This reduces redundancy in hit calls while preserving the most confident
 613    predictions.
 614
 615    Parameters
 616    ----------
 617    hits_path : str
 618        Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
 619    out_path : str
 620        Path to output TSV file with additional 'is_primary' column.
 621    overlap_frac : float, default 0.2
 622        Minimum fractional overlap for considering hits as overlapping.
 623        For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
 624
 625    Notes
 626    -----
 627    The algorithm uses a sweep line approach with a heap data structure to
 628    efficiently identify overlapping intervals and select the best hit based
 629    on similarity scores.
 630    """
 631    from . import postprocessing
 632
 633    hits_df = data_io.load_hits(hits_path, lazy=False)
 634    hits_collapsed_df = postprocessing.collapse_hits(hits_df, overlap_frac)
 635
 636    data_io.write_hits_processed(
 637        hits_collapsed_df, out_path, schema=data_io.HITS_COLLAPSED_DTYPES
 638    )
 639
 640
 641def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None:
 642    """Find intersection of hits across multiple Fi-NeMo runs.
 643
 644    This function identifies motif hits that are consistently called across
 645    multiple independent runs, providing a way to assess reproducibility and
 646    identify high-confidence hits.
 647
 648    Parameters
 649    ----------
 650    hits_paths : List[str]
 651        List of paths to input TSV files from different runs.
 652    out_path : str
 653        Path to output TSV file containing intersection results.
 654        Duplicate columns are suffixed with run index.
 655    relaxed : bool, default False
 656        If True, uses relaxed intersection criteria based only on motif names
 657        and untrimmed coordinates. If False, assumes consistent region definitions
 658        and motif trimming across runs.
 659
 660    Notes
 661    -----
 662    The strict intersection mode requires consistent input regions and motif
 663    processing parameters across all runs. The relaxed mode is more permissive
 664    but may not be suitable when genomic coordinates are unavailable.
 665    """
 666    from . import postprocessing
 667
 668    hits_dfs = [data_io.load_hits(hits_path, lazy=False) for hits_path in hits_paths]
 669    hits_df = postprocessing.intersect_hits(hits_dfs, relaxed)
 670
 671    data_io.write_hits_processed(hits_df, out_path, schema=None)
 672
 673
 674def cli() -> None:
 675    """Command-line interface for the Fi-NeMo motif instance calling pipeline.
 676
 677    This function provides the main entry point for all Fi-NeMo operations including:
 678    - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
 679    - Motif hit calling using the Fi-NeMo algorithm
 680    - Report generation and visualization
 681    - Post-processing operations (hit collapsing, intersection)
 682    """
 683    parser = argparse.ArgumentParser()
 684    subparsers = parser.add_subparsers(required=True, dest="cmd")
 685
 686    extract_regions_bw_parser = subparsers.add_parser(
 687        "extract-regions-bw",
 688        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 689        help="Extract sequences and contributions from FASTA and bigwig files.",
 690    )
 691
 692    extract_regions_bw_parser.add_argument(
 693        "-p",
 694        "--peaks",
 695        type=str,
 696        required=True,
 697        help="A peak regions file in ENCODE NarrowPeak format.",
 698    )
 699    extract_regions_bw_parser.add_argument(
 700        "-C",
 701        "--chrom-order",
 702        type=str,
 703        default=None,
 704        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 705    )
 706    extract_regions_bw_parser.add_argument(
 707        "-f",
 708        "--fasta",
 709        type=str,
 710        required=True,
 711        help="A genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.",
 712    )
 713    extract_regions_bw_parser.add_argument(
 714        "-b",
 715        "--bigwigs",
 716        type=str,
 717        required=True,
 718        nargs="+",
 719        help="One or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 720    )
 721
 722    extract_regions_bw_parser.add_argument(
 723        "-o",
 724        "--out-path",
 725        type=str,
 726        required=True,
 727        help="The path to the output .npz file.",
 728    )
 729
 730    extract_regions_bw_parser.add_argument(
 731        "-w",
 732        "--region-width",
 733        type=int,
 734        default=inspect.signature(extract_regions_bw)
 735        .parameters["region_width"]
 736        .default,
 737        help="The width of the input region centered around each peak summit.",
 738    )
 739
 740    extract_chrombpnet_regions_h5_parser = subparsers.add_parser(
 741        "extract-regions-chrombpnet-h5",
 742        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 743        help="Extract sequences and contributions from ChromBPNet contributions H5 files.",
 744    )
 745
 746    extract_chrombpnet_regions_h5_parser.add_argument(
 747        "-p",
 748        "--peaks",
 749        type=str,
 750        default=None,
 751        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 752    )
 753    extract_chrombpnet_regions_h5_parser.add_argument(
 754        "-C",
 755        "--chrom-order",
 756        type=str,
 757        default=None,
 758        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 759    )
 760
 761    extract_chrombpnet_regions_h5_parser.add_argument(
 762        "-c",
 763        "--h5s",
 764        type=str,
 765        required=True,
 766        nargs="+",
 767        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 768    )
 769
 770    extract_chrombpnet_regions_h5_parser.add_argument(
 771        "-o",
 772        "--out-path",
 773        type=str,
 774        required=True,
 775        help="The path to the output .npz file.",
 776    )
 777
 778    extract_chrombpnet_regions_h5_parser.add_argument(
 779        "-w",
 780        "--region-width",
 781        type=int,
 782        default=inspect.signature(extract_regions_chrombpnet_h5)
 783        .parameters["region_width"]
 784        .default,
 785        help="The width of the input region centered around each peak summit.",
 786    )
 787
 788    extract_regions_h5_parser = subparsers.add_parser(
 789        "extract-regions-h5",
 790        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 791        help="Extract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.",
 792    )
 793
 794    extract_regions_h5_parser.add_argument(
 795        "-p",
 796        "--peaks",
 797        type=str,
 798        default=None,
 799        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 800    )
 801    extract_regions_h5_parser.add_argument(
 802        "-C",
 803        "--chrom-order",
 804        type=str,
 805        default=None,
 806        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 807    )
 808
 809    extract_regions_h5_parser.add_argument(
 810        "-c",
 811        "--h5s",
 812        type=str,
 813        required=True,
 814        nargs="+",
 815        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 816    )
 817
 818    extract_regions_h5_parser.add_argument(
 819        "-o",
 820        "--out-path",
 821        type=str,
 822        required=True,
 823        help="The path to the output .npz file.",
 824    )
 825
 826    extract_regions_h5_parser.add_argument(
 827        "-w",
 828        "--region-width",
 829        type=int,
 830        default=inspect.signature(extract_regions_chrombpnet_h5)
 831        .parameters["region_width"]
 832        .default,
 833        help="The width of the input region centered around each peak summit.",
 834    )
 835
 836    extract_bpnet_regions_h5_parser = subparsers.add_parser(
 837        "extract-regions-bpnet-h5",
 838        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 839        help="Extract sequences and contributions from BPNet contributions H5 files.",
 840    )
 841
 842    extract_bpnet_regions_h5_parser.add_argument(
 843        "-p",
 844        "--peaks",
 845        type=str,
 846        default=None,
 847        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 848    )
 849    extract_bpnet_regions_h5_parser.add_argument(
 850        "-C",
 851        "--chrom-order",
 852        type=str,
 853        default=None,
 854        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 855    )
 856
 857    extract_bpnet_regions_h5_parser.add_argument(
 858        "-c",
 859        "--h5s",
 860        type=str,
 861        required=True,
 862        nargs="+",
 863        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 864    )
 865
 866    extract_bpnet_regions_h5_parser.add_argument(
 867        "-o",
 868        "--out-path",
 869        type=str,
 870        required=True,
 871        help="The path to the output .npz file.",
 872    )
 873
 874    extract_bpnet_regions_h5_parser.add_argument(
 875        "-w",
 876        "--region-width",
 877        type=int,
 878        default=inspect.signature(extract_regions_bpnet_h5)
 879        .parameters["region_width"]
 880        .default,
 881        help="The width of the input region centered around each peak summit.",
 882    )
 883
 884    extract_regions_modisco_fmt_parser = subparsers.add_parser(
 885        "extract-regions-modisco-fmt",
 886        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 887        help="Extract sequences and contributions from tfmodisco-lite input files.",
 888    )
 889
 890    extract_regions_modisco_fmt_parser.add_argument(
 891        "-p",
 892        "--peaks",
 893        type=str,
 894        default=None,
 895        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 896    )
 897    extract_regions_modisco_fmt_parser.add_argument(
 898        "-C",
 899        "--chrom-order",
 900        type=str,
 901        default=None,
 902        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 903    )
 904
 905    extract_regions_modisco_fmt_parser.add_argument(
 906        "-s",
 907        "--sequences",
 908        type=str,
 909        required=True,
 910        help="A .npy or .npz file containing one-hot encoded sequences.",
 911    )
 912
 913    extract_regions_modisco_fmt_parser.add_argument(
 914        "-a",
 915        "--attributions",
 916        type=str,
 917        required=True,
 918        nargs="+",
 919        help="One or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 920    )
 921
 922    extract_regions_modisco_fmt_parser.add_argument(
 923        "-o",
 924        "--out-path",
 925        type=str,
 926        required=True,
 927        help="The path to the output .npz file.",
 928    )
 929
 930    extract_regions_modisco_fmt_parser.add_argument(
 931        "-w",
 932        "--region-width",
 933        type=int,
 934        default=inspect.signature(extract_regions_modisco_fmt)
 935        .parameters["region_width"]
 936        .default,
 937        help="The width of the input region centered around each peak summit.",
 938    )
 939
 940    call_hits_parser = subparsers.add_parser(
 941        "call-hits",
 942        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 943        help="Call hits on provided sequences, contributions, and motif CWM's.",
 944    )
 945
 946    call_hits_parser.add_argument(
 947        "-M",
 948        "--mode",
 949        type=str,
 950        default=inspect.signature(call_hits).parameters["mode"].default,
 951        choices={"pp", "ph", "hp", "hh"},
 952        help="The type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.",
 953    )
 954
 955    call_hits_parser.add_argument(
 956        "-r",
 957        "--regions",
 958        type=str,
 959        required=True,
 960        help="A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.",
 961    )
 962    call_hits_parser.add_argument(
 963        "-m",
 964        "--modisco-h5",
 965        type=str,
 966        required=True,
 967        help="A tfmodisco-lite output H5 file of motif patterns.",
 968    )
 969
 970    call_hits_parser.add_argument(
 971        "-p",
 972        "--peaks",
 973        type=str,
 974        default=None,
 975        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 976    )
 977    call_hits_parser.add_argument(
 978        "-C",
 979        "--chrom-order",
 980        type=str,
 981        default=None,
 982        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 983    )
 984
 985    call_hits_parser.add_argument(
 986        "-I",
 987        "--motifs-include",
 988        type=str,
 989        default=None,
 990        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.",
 991    )
 992    call_hits_parser.add_argument(
 993        "-N",
 994        "--motif-names",
 995        type=str,
 996        default=None,
 997        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.",
 998    )
 999
1000    call_hits_parser.add_argument(
1001        "-o",
1002        "--out-dir",
1003        type=str,
1004        required=True,
1005        help="The path to the output directory.",
1006    )
1007
1008    call_hits_parser.add_argument(
1009        "-t",
1010        "--cwm-trim-threshold",
1011        type=float,
1012        default=inspect.signature(call_hits)
1013        .parameters["cwm_trim_threshold_default"]
1014        .default,
1015        help="The default threshold to determine motif start and end positions within the full CWMs.",
1016    )
1017    call_hits_parser.add_argument(
1018        "-T",
1019        "--cwm-trim-thresholds",
1020        type=str,
1021        default=None,
1022        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.",
1023    )
1024    call_hits_parser.add_argument(
1025        "-R",
1026        "--cwm-trim-coords",
1027        type=str,
1028        default=None,
1029        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.",
1030    )
1031
1032    call_hits_parser.add_argument(
1033        "-l",
1034        "--global-lambda",
1035        type=float,
1036        default=inspect.signature(call_hits).parameters["lambda_default"].default,
1037        help="The default L1 regularization weight determining the sparsity of hits.",
1038    )
1039    call_hits_parser.add_argument(
1040        "-L",
1041        "--motif-lambdas",
1042        type=str,
1043        default=None,
1044        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.",
1045    )
1046    call_hits_parser.add_argument(
1047        "-a",
1048        "--alpha",
1049        type=float,
1050        default=None,
1051        help="DEPRECATED: Please use the `--lambda` argument instead.",
1052    )
1053    call_hits_parser.add_argument(
1054        "-A",
1055        "--motif-alphas",
1056        type=str,
1057        default=None,
1058        help="DEPRECATED: Please use the `--motif-lambdas` argument instead.",
1059    )
1060
1061    call_hits_parser.add_argument(
1062        "-f",
1063        "--no-post-filter",
1064        action="store_true",
1065        help="Do not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.",
1066    )
1067    call_hits_parser.add_argument(
1068        "-q",
1069        "--sqrt-transform",
1070        action="store_true",
1071        help="Apply a signed square root transform to the input contributions and CWMs before hit calling.",
1072    )
1073    call_hits_parser.add_argument(
1074        "-s",
1075        "--step-size-max",
1076        type=float,
1077        default=inspect.signature(call_hits).parameters["step_size_max"].default,
1078        help="The maximum optimizer step size.",
1079    )
1080    call_hits_parser.add_argument(
1081        "-i",
1082        "--step-size-min",
1083        type=float,
1084        default=inspect.signature(call_hits).parameters["step_size_min"].default,
1085        help="The minimum optimizer step size.",
1086    )
1087    call_hits_parser.add_argument(
1088        "-j",
1089        "--step-adjust",
1090        type=float,
1091        default=inspect.signature(call_hits).parameters["step_adjust"].default,
1092        help="The optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factor",
1093    )
1094    call_hits_parser.add_argument(
1095        "-c",
1096        "--convergence-tol",
1097        type=float,
1098        default=inspect.signature(call_hits).parameters["convergence_tol"].default,
1099        help="The tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.",
1100    )
1101    call_hits_parser.add_argument(
1102        "-S",
1103        "--max-steps",
1104        type=int,
1105        default=inspect.signature(call_hits).parameters["max_steps"].default,
1106        help="The maximum number of optimization steps.",
1107    )
1108    call_hits_parser.add_argument(
1109        "-b",
1110        "--batch-size",
1111        type=int,
1112        default=inspect.signature(call_hits).parameters["batch_size"].default,
1113        help="The batch size used for optimization.",
1114    )
1115    call_hits_parser.add_argument(
1116        "-d",
1117        "--device",
1118        type=str,
1119        default=None,
1120        help="DEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.",
1121    )
1122    call_hits_parser.add_argument(
1123        "-J",
1124        "--compile",
1125        action="store_true",
1126        help="JIT-compile the optimizer for faster performance. This may not be supported on older GPUs.",
1127    )
1128
1129    report_parser = subparsers.add_parser(
1130        "report",
1131        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1132        help="Generate statistics and visualizations from hits and tfmodisco-lite motif data.",
1133    )
1134
1135    report_parser.add_argument(
1136        "-r",
1137        "--regions",
1138        type=str,
1139        required=True,
1140        help="A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.",
1141    )
1142    report_parser.add_argument(
1143        "-H",
1144        "--hits",
1145        type=str,
1146        required=True,
1147        help="The output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.",
1148    )
1149    report_parser.add_argument(
1150        "-p",
1151        "--peaks",
1152        type=str,
1153        default=None,
1154        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
1155    )
1156    report_parser.add_argument(
1157        "-m",
1158        "--modisco-h5",
1159        type=str,
1160        default=None,
1161        help="The tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.",
1162    )
1163    report_parser.add_argument(
1164        "-I",
1165        "--motifs-include",
1166        type=str,
1167        default=None,
1168        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1169    )
1170    report_parser.add_argument(
1171        "-N",
1172        "--motif-names",
1173        type=str,
1174        default=None,
1175        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1176    )
1177
1178    report_parser.add_argument(
1179        "-o",
1180        "--out-dir",
1181        type=str,
1182        required=True,
1183        help="The path to the report output directory.",
1184    )
1185
1186    report_parser.add_argument(
1187        "-W",
1188        "--modisco-region-width",
1189        type=int,
1190        default=inspect.signature(report).parameters["modisco_region_width"].default,
1191        help="The width of the region around each peak summit used by tfmodisco-lite.",
1192    )
1193    report_parser.add_argument(
1194        "-t",
1195        "--cwm-trim-threshold",
1196        type=float,
1197        default=inspect.signature(report).parameters["cwm_trim_threshold"].default,
1198        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1199    )
1200    report_parser.add_argument(
1201        "-n",
1202        "--no-recall",
1203        action="store_true",
1204        help="Do not compute motif recall metrics.",
1205    )
1206    report_parser.add_argument(
1207        "-s",
1208        "--no-seqlets",
1209        action="store_true",
1210        help="DEPRECATED: Please omit the `--modisco-h5` argument instead.",
1211    )
1212
1213    collapse_hits_parser = subparsers.add_parser(
1214        "collapse-hits",
1215        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1216        help="Identify best hit by motif similarity among sets of overlapping hits.",
1217    )
1218
1219    collapse_hits_parser.add_argument(
1220        "-i",
1221        "--hits",
1222        type=str,
1223        required=True,
1224        help="The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.",
1225    )
1226    collapse_hits_parser.add_argument(
1227        "-o",
1228        "--out-path",
1229        type=str,
1230        required=True,
1231        help='The path to the output .tsv file with an additional "is_primary" column.',
1232    )
1233    collapse_hits_parser.add_argument(
1234        "-O",
1235        "--overlap-frac",
1236        type=float,
1237        default=inspect.signature(collapse_hits).parameters["overlap_frac"].default,
1238        help="The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.",
1239    )
1240
1241    intersect_hits_parser = subparsers.add_parser(
1242        "intersect-hits",
1243        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1244        help="Intersect hits across multiple runs.",
1245    )
1246
1247    intersect_hits_parser.add_argument(
1248        "-i",
1249        "--hits",
1250        type=str,
1251        required=True,
1252        nargs="+",
1253        help="One or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.",
1254    )
1255    intersect_hits_parser.add_argument(
1256        "-o",
1257        "--out-path",
1258        type=str,
1259        required=True,
1260        help="The path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.",
1261    )
1262    intersect_hits_parser.add_argument(
1263        "-r",
1264        "--relaxed",
1265        action="store_true",
1266        help="Use relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.",
1267    )
1268
1269    args = parser.parse_args()
1270
1271    if args.cmd == "extract-regions-bw":
1272        extract_regions_bw(
1273            args.peaks,
1274            args.chrom_order,
1275            args.fasta,
1276            args.bigwigs,
1277            args.out_path,
1278            args.region_width,
1279        )
1280
1281    elif args.cmd == "extract-regions-chrombpnet-h5":
1282        extract_regions_chrombpnet_h5(
1283            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1284        )
1285
1286    elif args.cmd == "extract-regions-h5":
1287        print(
1288            "WARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead."
1289        )
1290        extract_regions_chrombpnet_h5(
1291            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1292        )
1293
1294    elif args.cmd == "extract-regions-bpnet-h5":
1295        extract_regions_bpnet_h5(
1296            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1297        )
1298
1299    elif args.cmd == "extract-regions-modisco-fmt":
1300        extract_regions_modisco_fmt(
1301            args.peaks,
1302            args.chrom_order,
1303            args.attributions,
1304            args.sequences,
1305            args.out_path,
1306            args.region_width,
1307        )
1308
1309    elif args.cmd == "call-hits":
1310        if args.alpha is not None:
1311            warnings.warn(
1312                "The `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead."
1313            )
1314            args.global_lambda = args.alpha
1315        if args.motif_alphas is not None:
1316            warnings.warn(
1317                "The `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead."
1318            )
1319            args.motif_lambdas = args.motif_alphas
1320
1321        call_hits(
1322            args.regions,
1323            args.peaks,
1324            args.modisco_h5,
1325            args.chrom_order,
1326            args.motifs_include,
1327            args.motif_names,
1328            args.motif_lambdas,
1329            args.out_dir,
1330            args.cwm_trim_coords,
1331            args.cwm_trim_thresholds,
1332            args.cwm_trim_threshold,
1333            args.global_lambda,
1334            args.step_size_max,
1335            args.step_size_min,
1336            args.sqrt_transform,
1337            args.convergence_tol,
1338            args.max_steps,
1339            args.batch_size,
1340            args.step_adjust,
1341            args.device,
1342            args.mode,
1343            args.no_post_filter,
1344            args.compile,
1345        )
1346
1347    elif args.cmd == "report":
1348        report(
1349            args.regions,
1350            args.hits,
1351            args.modisco_h5,
1352            args.peaks,
1353            args.motifs_include,
1354            args.motif_names,
1355            args.out_dir,
1356            args.modisco_region_width,
1357            args.cwm_trim_threshold,
1358            not args.no_recall,
1359            not args.no_seqlets,
1360        )
1361
1362    elif args.cmd == "collapse-hits":
1363        collapse_hits(args.hits, args.out_path, args.overlap_frac)
1364
1365    elif args.cmd == "intersect-hits":
1366        intersect_hits(args.hits, args.out_path, args.relaxed)
def extract_regions_bw( peaks_path: str, chrom_order_path: Optional[str], fa_path: str, bw_paths: List[str], out_path: str, region_width: int = 1000) -> None:
25def extract_regions_bw(
26    peaks_path: str,
27    chrom_order_path: Optional[str],
28    fa_path: str,
29    bw_paths: List[str],
30    out_path: str,
31    region_width: int = 1000,
32) -> None:
33    """Extract genomic regions and contribution scores from bigWig and FASTA files.
34
35    Parameters
36    ----------
37    peaks_path : str
38        Path to ENCODE NarrowPeak format file.
39    chrom_order_path : str, optional
40        Path to chromosome ordering file.
41    fa_path : str
42        Path to genome FASTA file.
43    bw_paths : List[str]
44        List of bigWig file paths containing contribution scores.
45    out_path : str
46        Output path for NPZ file.
47    region_width : int, default 1000
48        Width of regions to extract around peak summits.
49
50    Notes
51    -----
52    BigWig files only provide projected contribution scores.
53    """
54    half_width = region_width // 2
55
56    # Load peak regions and extract sequences/contributions
57    peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
58    sequences, contribs = data_io.load_regions_from_bw(
59        peaks_df, fa_path, bw_paths, half_width
60    )
61
62    # Save processed data to NPZ format
63    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from bigWig and FASTA files.

Parameters
  • peaks_path (str): Path to ENCODE NarrowPeak format file.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • fa_path (str): Path to genome FASTA file.
  • bw_paths (List[str]): List of bigWig file paths containing contribution scores.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
Notes

BigWig files only provide projected contribution scores.

def extract_regions_chrombpnet_h5( peaks_path: Optional[str], chrom_order_path: Optional[str], h5_paths: List[str], out_path: str, region_width: int = 1000) -> None:
66def extract_regions_chrombpnet_h5(
67    peaks_path: Optional[str],
68    chrom_order_path: Optional[str],
69    h5_paths: List[str],
70    out_path: str,
71    region_width: int = 1000,
72) -> None:
73    """Extract genomic regions and contribution scores from ChromBPNet HDF5 files.
74
75    Parameters
76    ----------
77    peaks_path : str, optional
78        Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
79    chrom_order_path : str, optional
80        Path to chromosome ordering file.
81    h5_paths : List[str]
82        List of ChromBPNet HDF5 file paths.
83    out_path : str
84        Output path for NPZ file.
85    region_width : int, default 1000
86        Width of regions to extract around peak summits.
87    """
88    half_width = region_width // 2
89
90    if peaks_path is not None:
91        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
92    else:
93        peaks_df = None
94
95    sequences, contribs = data_io.load_regions_from_chrombpnet_h5(h5_paths, half_width)
96
97    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from ChromBPNet HDF5 files.

Parameters
  • peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • h5_paths (List[str]): List of ChromBPNet HDF5 file paths.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
def extract_regions_bpnet_h5( peaks_path: Optional[str], chrom_order_path: Optional[str], h5_paths: List[str], out_path: str, region_width: int = 1000) -> None:
100def extract_regions_bpnet_h5(
101    peaks_path: Optional[str],
102    chrom_order_path: Optional[str],
103    h5_paths: List[str],
104    out_path: str,
105    region_width: int = 1000,
106) -> None:
107    """Extract genomic regions and contribution scores from BPNet HDF5 files.
108
109    Parameters
110    ----------
111    peaks_path : str, optional
112        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
113    chrom_order_path : str, optional
114        Path to chromosome ordering file.
115    h5_paths : List[str]
116        List of BPNet HDF5 file paths.
117    out_path : str
118        Output path for NPZ file.
119    region_width : int, default 1000
120        Width of regions to extract around peak summits.
121    """
122    half_width = region_width // 2
123
124    if peaks_path is not None:
125        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
126    else:
127        peaks_df = None
128
129    sequences, contribs = data_io.load_regions_from_bpnet_h5(h5_paths, half_width)
130
131    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from BPNet HDF5 files.

Parameters
  • peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • h5_paths (List[str]): List of BPNet HDF5 file paths.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
def extract_regions_modisco_fmt( peaks_path: Optional[str], chrom_order_path: Optional[str], shaps_paths: List[str], ohe_path: str, out_path: str, region_width: int = 1000) -> None:
134def extract_regions_modisco_fmt(
135    peaks_path: Optional[str],
136    chrom_order_path: Optional[str],
137    shaps_paths: List[str],
138    ohe_path: str,
139    out_path: str,
140    region_width: int = 1000,
141) -> None:
142    """Extract genomic regions and contribution scores from TF-MoDISco format files.
143
144    Parameters
145    ----------
146    peaks_path : str, optional
147        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
148    chrom_order_path : str, optional
149        Path to chromosome ordering file.
150    shaps_paths : List[str]
151        List of paths to .npy/.npz files containing SHAP/attribution scores.
152    ohe_path : str
153        Path to .npy/.npz file containing one-hot encoded sequences.
154    out_path : str
155        Output path for NPZ file.
156    region_width : int, default 1000
157        Width of regions to extract around peak summits.
158    """
159    half_width = region_width // 2
160
161    if peaks_path is not None:
162        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
163    else:
164        peaks_df = None
165
166    sequences, contribs = data_io.load_regions_from_modisco_fmt(
167        shaps_paths, ohe_path, half_width
168    )
169
170    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from TF-MoDISco format files.

Parameters
  • peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • shaps_paths (List[str]): List of paths to .npy/.npz files containing SHAP/attribution scores.
  • ohe_path (str): Path to .npy/.npz file containing one-hot encoded sequences.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
def call_hits( regions_path: str, peaks_path: Optional[str], modisco_h5_path: str, chrom_order_path: Optional[str], motifs_include_path: Optional[str], motif_names_path: Optional[str], motif_lambdas_path: Optional[str], out_dir: str, cwm_trim_coords_path: Optional[str] = None, cwm_trim_thresholds_path: Optional[str] = None, cwm_trim_threshold_default: float = 0.3, lambda_default: float = 0.7, step_size_max: float = 3.0, step_size_min: float = 0.08, sqrt_transform: bool = False, convergence_tol: float = 0.0005, max_steps: int = 10000, batch_size: int = 2000, step_adjust: float = 0.7, device: Optional[str] = None, mode: str = 'pp', no_post_filter: bool = False, compile_optimizer: bool = False) -> None:
173def call_hits(
174    regions_path: str,
175    peaks_path: Optional[str],
176    modisco_h5_path: str,
177    chrom_order_path: Optional[str],
178    motifs_include_path: Optional[str],
179    motif_names_path: Optional[str],
180    motif_lambdas_path: Optional[str],
181    out_dir: str,
182    cwm_trim_coords_path: Optional[str] = None,
183    cwm_trim_thresholds_path: Optional[str] = None,
184    cwm_trim_threshold_default: float = 0.3,
185    lambda_default: float = 0.7,
186    step_size_max: float = 3.0,
187    step_size_min: float = 0.08,
188    sqrt_transform: bool = False,
189    convergence_tol: float = 0.0005,
190    max_steps: int = 10000,
191    batch_size: int = 2000,
192    step_adjust: float = 0.7,
193    device: Optional[str] = None,
194    mode: str = "pp",
195    no_post_filter: bool = False,
196    compile_optimizer: bool = False,
197) -> None:
198    """Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.
199
200    This function implements the core Fi-NeMo hit calling pipeline, which identifies
201    motif instances by solving a sparse reconstruction problem using proximal gradient
202    descent. The algorithm represents contribution scores as weighted combinations of
203    motif CWMs at specific positions.
204
205    Parameters
206    ----------
207    regions_path : str
208        Path to NPZ file containing preprocessed regions (sequences, contributions,
209        and optional peak coordinates).
210    peaks_path : str, optional
211        DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be
212        included during preprocessing instead.
213    modisco_h5_path : str
214        Path to TF-MoDISco H5 file containing motif CWMs.
215    chrom_order_path : str, optional
216        DEPRECATED. Path to chromosome ordering file.
217    motifs_include_path : str, optional
218        Path to file listing motif names to include in analysis.
219    motif_names_path : str, optional
220        Path to file mapping motif IDs to custom names.
221    motif_lambdas_path : str, optional
222        Path to file specifying per-motif lambda values.
223    out_dir : str
224        Output directory for results.
225    cwm_trim_coords_path : str, optional
226        Path to file specifying custom motif trimming coordinates.
227    cwm_trim_thresholds_path : str, optional
228        Path to file specifying custom motif trimming thresholds.
229    cwm_trim_threshold_default : float, default 0.3
230        Default threshold for motif trimming.
231    lambda_default : float, default 0.7
232        Default L1 regularization weight.
233    step_size_max : float, default 3.0
234        Maximum optimization step size.
235    step_size_min : float, default 0.08
236        Minimum optimization step size.
237    sqrt_transform : bool, default False
238        Whether to apply signed square root transform to contributions.
239    convergence_tol : float, default 0.0005
240        Convergence tolerance for duality gap.
241    max_steps : int, default 10000
242        Maximum number of optimization steps.
243    batch_size : int, default 2000
244        Batch size for GPU processing.
245    step_adjust : float, default 0.7
246        Step size adjustment factor on divergence.
247    device : str, optional
248        DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
249    mode : str, default "pp"
250        Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
251    no_post_filter : bool, default False
252        If True, skip post-hit-calling similarity filtering.
253    compile_optimizer : bool, default False
254        Whether to JIT-compile the optimizer for speed.
255
256    Notes
257    -----
258    The Fi-NeMo algorithm solves the optimization problem:
259    minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁
260    subject to: c ≥ 0
261
262    where c represents motif hit coefficients and reconstruction uses convolution
263    with motif CWMs.
264    """
265
266    params = locals()
267    import torch
268    from . import hitcaller
269
270    if device is not None:
271        warnings.warn(
272            "The `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device."
273        )
274
275    sequences, contribs, peaks_df, has_peaks = data_io.load_regions_npz(regions_path)
276
277    region_width = sequences.shape[2]
278    if region_width % 2 != 0:
279        raise ValueError(f"Region width of {region_width} is not divisible by 2.")
280
281    half_width = region_width // 2
282    num_regions = contribs.shape[0]
283
284    if peaks_path is not None:
285        warnings.warn(
286            "Providing a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
287        )
288        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
289        has_peaks = True
290
291    if not has_peaks:
292        warnings.warn(
293            "No peak region data provided. Output hits will lack absolute genomic coordinates."
294        )
295
296    if mode == "pp":
297        motif_type = "cwm"
298        use_hypothetical_contribs = False
299    elif mode == "ph":
300        motif_type = "cwm"
301        use_hypothetical_contribs = True
302    elif mode == "hp":
303        motif_type = "hcwm"
304        use_hypothetical_contribs = False
305    elif mode == "hh":
306        motif_type = "hcwm"
307        use_hypothetical_contribs = True
308    else:
309        raise ValueError(
310            f"Invalid mode: {mode}. Must be one of 'pp', 'ph', 'hp', 'hh'."
311        )
312
313    if motifs_include_path is not None:
314        motifs_include = data_io.load_txt(motifs_include_path)
315    else:
316        motifs_include = None
317
318    if motif_names_path is not None:
319        motif_name_map = data_io.load_mapping(motif_names_path, str)
320    else:
321        motif_name_map = None
322
323    if motif_lambdas_path is not None:
324        motif_lambdas = data_io.load_mapping(motif_lambdas_path, float)
325    else:
326        motif_lambdas = None
327
328    if cwm_trim_coords_path is not None:
329        trim_coords = data_io.load_mapping_tuple(cwm_trim_coords_path, int)
330    else:
331        trim_coords = None
332
333    if cwm_trim_thresholds_path is not None:
334        trim_thresholds = data_io.load_mapping(cwm_trim_thresholds_path, float)
335    else:
336        trim_thresholds = None
337
338    motifs_df, cwms, trim_masks, _ = data_io.load_modisco_motifs(
339        modisco_h5_path,
340        trim_coords,
341        trim_thresholds,
342        cwm_trim_threshold_default,
343        motif_type,
344        motifs_include,
345        motif_name_map,
346        motif_lambdas,
347        lambda_default,
348        True,
349    )
350    num_motifs = cwms.shape[0]
351    motif_width = cwms.shape[2]
352    lambdas = motifs_df.get_column("lambda").to_numpy(writable=True)
353
354    device_obj = torch.device(device) if device is not None else None
355    hits_df, qc_df = hitcaller.fit_contribs(
356        cwms,
357        contribs,
358        sequences,
359        trim_masks,
360        use_hypothetical_contribs,
361        lambdas,
362        step_size_max,
363        step_size_min,
364        sqrt_transform,
365        convergence_tol,
366        max_steps,
367        batch_size,
368        step_adjust,
369        not no_post_filter,
370        device_obj,
371        compile_optimizer,
372    )
373
374    os.makedirs(out_dir, exist_ok=True)
375    out_path_qc = os.path.join(out_dir, "peaks_qc.tsv")
376    data_io.write_hits(hits_df, peaks_df, motifs_df, qc_df, out_dir, motif_width)
377    data_io.write_qc(qc_df, peaks_df, out_path_qc)
378
379    out_path_motif_df = os.path.join(out_dir, "motif_data.tsv")
380    data_io.write_motifs_df(motifs_df, out_path_motif_df)
381
382    out_path_motif_cwms = os.path.join(out_dir, "motif_cwms.npy")
383    data_io.write_motif_cwms(cwms, out_path_motif_cwms)
384
385    params |= {
386        "region_width": region_width,
387        "num_regions": num_regions,
388        "untrimmed_motif_width": motif_width,
389        "num_motifs": num_motifs,
390    }
391
392    out_path_params = os.path.join(out_dir, "parameters.json")
393    data_io.write_params(params, out_path_params)

Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.

This function implements the core Fi-NeMo hit calling pipeline, which identifies motif instances by solving a sparse reconstruction problem using proximal gradient descent. The algorithm represents contribution scores as weighted combinations of motif CWMs at specific positions.

Parameters
  • regions_path (str): Path to NPZ file containing preprocessed regions (sequences, contributions, and optional peak coordinates).
  • peaks_path (str, optional): DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be included during preprocessing instead.
  • modisco_h5_path (str): Path to TF-MoDISco H5 file containing motif CWMs.
  • chrom_order_path (str, optional): DEPRECATED. Path to chromosome ordering file.
  • motifs_include_path (str, optional): Path to file listing motif names to include in analysis.
  • motif_names_path (str, optional): Path to file mapping motif IDs to custom names.
  • motif_lambdas_path (str, optional): Path to file specifying per-motif lambda values.
  • out_dir (str): Output directory for results.
  • cwm_trim_coords_path (str, optional): Path to file specifying custom motif trimming coordinates.
  • cwm_trim_thresholds_path (str, optional): Path to file specifying custom motif trimming thresholds.
  • cwm_trim_threshold_default (float, default 0.3): Default threshold for motif trimming.
  • lambda_default (float, default 0.7): Default L1 regularization weight.
  • step_size_max (float, default 3.0): Maximum optimization step size.
  • step_size_min (float, default 0.08): Minimum optimization step size.
  • sqrt_transform (bool, default False): Whether to apply signed square root transform to contributions.
  • convergence_tol (float, default 0.0005): Convergence tolerance for duality gap.
  • max_steps (int, default 10000): Maximum number of optimization steps.
  • batch_size (int, default 2000): Batch size for GPU processing.
  • step_adjust (float, default 0.7): Step size adjustment factor on divergence.
  • device (str, optional): DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
  • mode (str, default "pp"): Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
  • no_post_filter (bool, default False): If True, skip post-hit-calling similarity filtering.
  • compile_optimizer (bool, default False): Whether to JIT-compile the optimizer for speed.
Notes

The Fi-NeMo algorithm solves the optimization problem: minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁ subject to: c ≥ 0

where c represents motif hit coefficients and reconstruction uses convolution with motif CWMs.

def report( regions_path: str, hits_dir: str, modisco_h5_path: Optional[str], peaks_path: Optional[str], motifs_include_path: Optional[str], motif_names_path: Optional[str], out_dir: str, modisco_region_width: int = 400, cwm_trim_threshold: float = 0.3, compute_recall: bool = True, use_seqlets: bool = True) -> None:
396def report(
397    regions_path: str,
398    hits_dir: str,
399    modisco_h5_path: Optional[str],
400    peaks_path: Optional[str],
401    motifs_include_path: Optional[str],
402    motif_names_path: Optional[str],
403    out_dir: str,
404    modisco_region_width: int = 400,
405    cwm_trim_threshold: float = 0.3,
406    compute_recall: bool = True,
407    use_seqlets: bool = True,
408) -> None:
409    """Generate comprehensive HTML report with statistics and visualizations.
410
411    This function creates detailed analysis reports comparing Fi-NeMo hit calling
412    results with TF-MoDISco seqlets, including performance metrics, distribution
413    plots, and motif visualization. The report provides insights into hit calling
414    quality and motif discovery accuracy.
415
416    Parameters
417    ----------
418    regions_path : str
419        Path to NPZ file containing the same regions used for hit calling.
420    hits_dir : str
421        Path to directory containing Fi-NeMo hit calling outputs.
422    modisco_h5_path : str, optional
423        Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
424    peaks_path : str, optional
425        DEPRECATED. Peak coordinates should be included in regions file.
426    motifs_include_path : str, optional
427        DEPRECATED. This information is inferred from hit calling outputs.
428    motif_names_path : str, optional
429        DEPRECATED. This information is inferred from hit calling outputs.
430    out_dir : str
431        Output directory for report files.
432    modisco_region_width : int, default 400
433        Width of regions used by TF-MoDISco (needed for coordinate conversion).
434    cwm_trim_threshold : float, default 0.3
435        DEPRECATED. This information is inferred from hit calling outputs.
436    compute_recall : bool, default True
437        Whether to compute recall metrics against TF-MoDISco seqlets.
438    use_seqlets : bool, default True
439        Whether to include seqlet-based comparisons in the report.
440
441    Notes
442    -----
443    The generated report includes:
444    - Hit vs seqlet count comparisons
445    - Motif CWM visualizations
446    - Hit statistic distributions
447    - Co-occurrence heatmaps
448    - Confusion matrices for overlapping motifs
449    """
450    from . import evaluation, visualization
451
452    sequences, contribs, peaks_df, _ = data_io.load_regions_npz(regions_path)
453    if len(contribs.shape) == 3:
454        regions = contribs * sequences
455    elif len(contribs.shape) == 2:
456        regions = contribs[:, None, :] * sequences
457    else:
458        raise ValueError(f"Unexpected contribs shape: {contribs.shape}")
459
460    half_width = regions.shape[2] // 2
461    modisco_half_width = modisco_region_width // 2
462
463    if peaks_path is not None:
464        warnings.warn(
465            "Providing a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
466        )
467        peaks_df = data_io.load_peaks(peaks_path, None, half_width)
468
469    if hits_dir.endswith(".tsv"):
470        warnings.warn(
471            "Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead."
472        )
473
474        hits_path = hits_dir
475
476        hits_df = data_io.load_hits(hits_path, lazy=True)
477
478        if motifs_include_path is not None:
479            motifs_include = data_io.load_txt(motifs_include_path)
480        else:
481            motifs_include = None
482
483        if motif_names_path is not None:
484            motif_name_map = data_io.load_mapping(motif_names_path, str)
485        else:
486            motif_name_map = None
487
488        if modisco_h5_path is not None:
489            motifs_df, cwms_modisco, _, motif_names = data_io.load_modisco_motifs(
490                modisco_h5_path,
491                None,
492                None,
493                cwm_trim_threshold,
494                "cwm",
495                motifs_include,
496                motif_name_map,
497                None,
498                1.0,
499                True,
500            )
501        else:
502            # When no modisco_h5_path is provided in legacy TSV mode, we can't compute motifs
503            # This will cause an error later, but that's expected behavior
504            raise ValueError(
505                "modisco_h5_path is required when providing a hits.tsv file directly"
506            )
507
508    else:
509        hits_df_path = os.path.join(hits_dir, "hits.tsv")
510        hits_df = data_io.load_hits(hits_df_path, lazy=True)
511
512        motifs_df_path = os.path.join(hits_dir, "motif_data.tsv")
513        motifs_df, motif_names = data_io.load_motifs_df(motifs_df_path)
514
515        cwms_modisco_path = os.path.join(hits_dir, "motif_cwms.npy")
516        cwms_modisco = data_io.load_motif_cwms(cwms_modisco_path)
517
518        params_path = os.path.join(hits_dir, "parameters.json")
519        params = data_io.load_params(params_path)
520        cwm_trim_threshold = params["cwm_trim_threshold_default"]
521
522    if not use_seqlets:
523        warnings.warn(
524            "Usage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead."
525        )
526        seqlets_df = None
527    elif modisco_h5_path is None:
528        compute_recall = False
529        seqlets_df = None
530    else:
531        seqlets_df = data_io.load_modisco_seqlets(
532            modisco_h5_path,
533            peaks_df,
534            motifs_df,
535            half_width,
536            modisco_half_width,
537            lazy=True,
538        )
539
540    motif_width = cwms_modisco.shape[2]
541
542    # Convert to LazyFrame if needed and ensure motif_names is a list
543    if isinstance(hits_df, pl.LazyFrame):
544        hits_df_lazy: pl.LazyFrame = hits_df
545    else:
546        hits_df_lazy: pl.LazyFrame = hits_df.lazy()
547
548    motif_names_list: List[str] = list(motif_names)
549
550    occ_df, coooc = evaluation.get_motif_occurences(hits_df_lazy, motif_names_list)
551
552    report_data, report_df, cwms, trim_bounds = evaluation.tfmodisco_comparison(
553        regions,
554        hits_df,
555        peaks_df,
556        seqlets_df,
557        motifs_df,
558        cwms_modisco,
559        motif_names_list,
560        modisco_half_width,
561        motif_width,
562        compute_recall,
563    )
564
565    if seqlets_df is not None:
566        confusion_df, confusion_mat = evaluation.seqlet_confusion(
567            hits_df, seqlets_df, peaks_df, motif_names_list, motif_width
568        )
569    else:
570        confusion_df, confusion_mat = None, None
571
572    os.makedirs(out_dir, exist_ok=True)
573
574    occ_path = os.path.join(out_dir, "motif_occurrences.tsv")
575    data_io.write_occ_df(occ_df, occ_path)
576
577    data_io.write_report_data(report_df, cwms, out_dir)
578
579    visualization.plot_hit_stat_distributions(hits_df_lazy, motif_names_list, out_dir)
580    visualization.plot_hit_peak_distributions(occ_df, motif_names_list, out_dir)
581    visualization.plot_peak_motif_indicator_heatmap(coooc, motif_names_list, out_dir)
582
583    plot_dir = os.path.join(out_dir, "CWMs")
584    visualization.plot_cwms(cwms, trim_bounds, plot_dir)
585
586    if seqlets_df is not None:
587        seqlets_collected = (
588            seqlets_df.collect() if isinstance(seqlets_df, pl.LazyFrame) else seqlets_df
589        )
590        seqlets_path = os.path.join(out_dir, "seqlets.tsv")
591        data_io.write_modisco_seqlets(seqlets_collected, seqlets_path)
592
593        if confusion_df is not None and confusion_mat is not None:
594            seqlet_confusion_path = os.path.join(out_dir, "seqlet_confusion.tsv")
595            data_io.write_seqlet_confusion_df(confusion_df, seqlet_confusion_path)
596
597            visualization.plot_hit_vs_seqlet_counts(report_data, out_dir)
598            visualization.plot_seqlet_confusion_heatmap(
599                confusion_mat, motif_names_list, out_dir
600            )
601
602    report_path = os.path.join(out_dir, "report.html")
603    visualization.write_report(
604        report_df, motif_names_list, report_path, compute_recall, seqlets_df is not None
605    )

Generate comprehensive HTML report with statistics and visualizations.

This function creates detailed analysis reports comparing Fi-NeMo hit calling results with TF-MoDISco seqlets, including performance metrics, distribution plots, and motif visualization. The report provides insights into hit calling quality and motif discovery accuracy.

Parameters
  • regions_path (str): Path to NPZ file containing the same regions used for hit calling.
  • hits_dir (str): Path to directory containing Fi-NeMo hit calling outputs.
  • modisco_h5_path (str, optional): Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
  • peaks_path (str, optional): DEPRECATED. Peak coordinates should be included in regions file.
  • motifs_include_path (str, optional): DEPRECATED. This information is inferred from hit calling outputs.
  • motif_names_path (str, optional): DEPRECATED. This information is inferred from hit calling outputs.
  • out_dir (str): Output directory for report files.
  • modisco_region_width (int, default 400): Width of regions used by TF-MoDISco (needed for coordinate conversion).
  • cwm_trim_threshold (float, default 0.3): DEPRECATED. This information is inferred from hit calling outputs.
  • compute_recall (bool, default True): Whether to compute recall metrics against TF-MoDISco seqlets.
  • use_seqlets (bool, default True): Whether to include seqlet-based comparisons in the report.
Notes

The generated report includes:

  • Hit vs seqlet count comparisons
  • Motif CWM visualizations
  • Hit statistic distributions
  • Co-occurrence heatmaps
  • Confusion matrices for overlapping motifs
def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None:
608def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None:
609    """Collapse overlapping hits by selecting the best hit per overlapping group.
610
611    This function processes a set of motif hits and identifies overlapping hits,
612    keeping only the hit with the highest similarity score within each overlapping
613    group. This reduces redundancy in hit calls while preserving the most confident
614    predictions.
615
616    Parameters
617    ----------
618    hits_path : str
619        Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
620    out_path : str
621        Path to output TSV file with additional 'is_primary' column.
622    overlap_frac : float, default 0.2
623        Minimum fractional overlap for considering hits as overlapping.
624        For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
625
626    Notes
627    -----
628    The algorithm uses a sweep line approach with a heap data structure to
629    efficiently identify overlapping intervals and select the best hit based
630    on similarity scores.
631    """
632    from . import postprocessing
633
634    hits_df = data_io.load_hits(hits_path, lazy=False)
635    hits_collapsed_df = postprocessing.collapse_hits(hits_df, overlap_frac)
636
637    data_io.write_hits_processed(
638        hits_collapsed_df, out_path, schema=data_io.HITS_COLLAPSED_DTYPES
639    )

Collapse overlapping hits by selecting the best hit per overlapping group.

This function processes a set of motif hits and identifies overlapping hits, keeping only the hit with the highest similarity score within each overlapping group. This reduces redundancy in hit calls while preserving the most confident predictions.

Parameters
  • hits_path (str): Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
  • out_path (str): Path to output TSV file with additional 'is_primary' column.
  • overlap_frac (float, default 0.2): Minimum fractional overlap for considering hits as overlapping. For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
Notes

The algorithm uses a sweep line approach with a heap data structure to efficiently identify overlapping intervals and select the best hit based on similarity scores.

def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None:
642def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None:
643    """Find intersection of hits across multiple Fi-NeMo runs.
644
645    This function identifies motif hits that are consistently called across
646    multiple independent runs, providing a way to assess reproducibility and
647    identify high-confidence hits.
648
649    Parameters
650    ----------
651    hits_paths : List[str]
652        List of paths to input TSV files from different runs.
653    out_path : str
654        Path to output TSV file containing intersection results.
655        Duplicate columns are suffixed with run index.
656    relaxed : bool, default False
657        If True, uses relaxed intersection criteria based only on motif names
658        and untrimmed coordinates. If False, assumes consistent region definitions
659        and motif trimming across runs.
660
661    Notes
662    -----
663    The strict intersection mode requires consistent input regions and motif
664    processing parameters across all runs. The relaxed mode is more permissive
665    but may not be suitable when genomic coordinates are unavailable.
666    """
667    from . import postprocessing
668
669    hits_dfs = [data_io.load_hits(hits_path, lazy=False) for hits_path in hits_paths]
670    hits_df = postprocessing.intersect_hits(hits_dfs, relaxed)
671
672    data_io.write_hits_processed(hits_df, out_path, schema=None)

Find intersection of hits across multiple Fi-NeMo runs.

This function identifies motif hits that are consistently called across multiple independent runs, providing a way to assess reproducibility and identify high-confidence hits.

Parameters
  • hits_paths (List[str]): List of paths to input TSV files from different runs.
  • out_path (str): Path to output TSV file containing intersection results. Duplicate columns are suffixed with run index.
  • relaxed (bool, default False): If True, uses relaxed intersection criteria based only on motif names and untrimmed coordinates. If False, assumes consistent region definitions and motif trimming across runs.
Notes

The strict intersection mode requires consistent input regions and motif processing parameters across all runs. The relaxed mode is more permissive but may not be suitable when genomic coordinates are unavailable.

def cli() -> None:
 675def cli() -> None:
 676    """Command-line interface for the Fi-NeMo motif instance calling pipeline.
 677
 678    This function provides the main entry point for all Fi-NeMo operations including:
 679    - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
 680    - Motif hit calling using the Fi-NeMo algorithm
 681    - Report generation and visualization
 682    - Post-processing operations (hit collapsing, intersection)
 683    """
 684    parser = argparse.ArgumentParser()
 685    subparsers = parser.add_subparsers(required=True, dest="cmd")
 686
 687    extract_regions_bw_parser = subparsers.add_parser(
 688        "extract-regions-bw",
 689        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 690        help="Extract sequences and contributions from FASTA and bigwig files.",
 691    )
 692
 693    extract_regions_bw_parser.add_argument(
 694        "-p",
 695        "--peaks",
 696        type=str,
 697        required=True,
 698        help="A peak regions file in ENCODE NarrowPeak format.",
 699    )
 700    extract_regions_bw_parser.add_argument(
 701        "-C",
 702        "--chrom-order",
 703        type=str,
 704        default=None,
 705        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 706    )
 707    extract_regions_bw_parser.add_argument(
 708        "-f",
 709        "--fasta",
 710        type=str,
 711        required=True,
 712        help="A genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.",
 713    )
 714    extract_regions_bw_parser.add_argument(
 715        "-b",
 716        "--bigwigs",
 717        type=str,
 718        required=True,
 719        nargs="+",
 720        help="One or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 721    )
 722
 723    extract_regions_bw_parser.add_argument(
 724        "-o",
 725        "--out-path",
 726        type=str,
 727        required=True,
 728        help="The path to the output .npz file.",
 729    )
 730
 731    extract_regions_bw_parser.add_argument(
 732        "-w",
 733        "--region-width",
 734        type=int,
 735        default=inspect.signature(extract_regions_bw)
 736        .parameters["region_width"]
 737        .default,
 738        help="The width of the input region centered around each peak summit.",
 739    )
 740
 741    extract_chrombpnet_regions_h5_parser = subparsers.add_parser(
 742        "extract-regions-chrombpnet-h5",
 743        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 744        help="Extract sequences and contributions from ChromBPNet contributions H5 files.",
 745    )
 746
 747    extract_chrombpnet_regions_h5_parser.add_argument(
 748        "-p",
 749        "--peaks",
 750        type=str,
 751        default=None,
 752        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 753    )
 754    extract_chrombpnet_regions_h5_parser.add_argument(
 755        "-C",
 756        "--chrom-order",
 757        type=str,
 758        default=None,
 759        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 760    )
 761
 762    extract_chrombpnet_regions_h5_parser.add_argument(
 763        "-c",
 764        "--h5s",
 765        type=str,
 766        required=True,
 767        nargs="+",
 768        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 769    )
 770
 771    extract_chrombpnet_regions_h5_parser.add_argument(
 772        "-o",
 773        "--out-path",
 774        type=str,
 775        required=True,
 776        help="The path to the output .npz file.",
 777    )
 778
 779    extract_chrombpnet_regions_h5_parser.add_argument(
 780        "-w",
 781        "--region-width",
 782        type=int,
 783        default=inspect.signature(extract_regions_chrombpnet_h5)
 784        .parameters["region_width"]
 785        .default,
 786        help="The width of the input region centered around each peak summit.",
 787    )
 788
 789    extract_regions_h5_parser = subparsers.add_parser(
 790        "extract-regions-h5",
 791        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 792        help="Extract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.",
 793    )
 794
 795    extract_regions_h5_parser.add_argument(
 796        "-p",
 797        "--peaks",
 798        type=str,
 799        default=None,
 800        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 801    )
 802    extract_regions_h5_parser.add_argument(
 803        "-C",
 804        "--chrom-order",
 805        type=str,
 806        default=None,
 807        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 808    )
 809
 810    extract_regions_h5_parser.add_argument(
 811        "-c",
 812        "--h5s",
 813        type=str,
 814        required=True,
 815        nargs="+",
 816        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 817    )
 818
 819    extract_regions_h5_parser.add_argument(
 820        "-o",
 821        "--out-path",
 822        type=str,
 823        required=True,
 824        help="The path to the output .npz file.",
 825    )
 826
 827    extract_regions_h5_parser.add_argument(
 828        "-w",
 829        "--region-width",
 830        type=int,
 831        default=inspect.signature(extract_regions_chrombpnet_h5)
 832        .parameters["region_width"]
 833        .default,
 834        help="The width of the input region centered around each peak summit.",
 835    )
 836
 837    extract_bpnet_regions_h5_parser = subparsers.add_parser(
 838        "extract-regions-bpnet-h5",
 839        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 840        help="Extract sequences and contributions from BPNet contributions H5 files.",
 841    )
 842
 843    extract_bpnet_regions_h5_parser.add_argument(
 844        "-p",
 845        "--peaks",
 846        type=str,
 847        default=None,
 848        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 849    )
 850    extract_bpnet_regions_h5_parser.add_argument(
 851        "-C",
 852        "--chrom-order",
 853        type=str,
 854        default=None,
 855        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 856    )
 857
 858    extract_bpnet_regions_h5_parser.add_argument(
 859        "-c",
 860        "--h5s",
 861        type=str,
 862        required=True,
 863        nargs="+",
 864        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 865    )
 866
 867    extract_bpnet_regions_h5_parser.add_argument(
 868        "-o",
 869        "--out-path",
 870        type=str,
 871        required=True,
 872        help="The path to the output .npz file.",
 873    )
 874
 875    extract_bpnet_regions_h5_parser.add_argument(
 876        "-w",
 877        "--region-width",
 878        type=int,
 879        default=inspect.signature(extract_regions_bpnet_h5)
 880        .parameters["region_width"]
 881        .default,
 882        help="The width of the input region centered around each peak summit.",
 883    )
 884
 885    extract_regions_modisco_fmt_parser = subparsers.add_parser(
 886        "extract-regions-modisco-fmt",
 887        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 888        help="Extract sequences and contributions from tfmodisco-lite input files.",
 889    )
 890
 891    extract_regions_modisco_fmt_parser.add_argument(
 892        "-p",
 893        "--peaks",
 894        type=str,
 895        default=None,
 896        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 897    )
 898    extract_regions_modisco_fmt_parser.add_argument(
 899        "-C",
 900        "--chrom-order",
 901        type=str,
 902        default=None,
 903        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 904    )
 905
 906    extract_regions_modisco_fmt_parser.add_argument(
 907        "-s",
 908        "--sequences",
 909        type=str,
 910        required=True,
 911        help="A .npy or .npz file containing one-hot encoded sequences.",
 912    )
 913
 914    extract_regions_modisco_fmt_parser.add_argument(
 915        "-a",
 916        "--attributions",
 917        type=str,
 918        required=True,
 919        nargs="+",
 920        help="One or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 921    )
 922
 923    extract_regions_modisco_fmt_parser.add_argument(
 924        "-o",
 925        "--out-path",
 926        type=str,
 927        required=True,
 928        help="The path to the output .npz file.",
 929    )
 930
 931    extract_regions_modisco_fmt_parser.add_argument(
 932        "-w",
 933        "--region-width",
 934        type=int,
 935        default=inspect.signature(extract_regions_modisco_fmt)
 936        .parameters["region_width"]
 937        .default,
 938        help="The width of the input region centered around each peak summit.",
 939    )
 940
 941    call_hits_parser = subparsers.add_parser(
 942        "call-hits",
 943        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 944        help="Call hits on provided sequences, contributions, and motif CWM's.",
 945    )
 946
 947    call_hits_parser.add_argument(
 948        "-M",
 949        "--mode",
 950        type=str,
 951        default=inspect.signature(call_hits).parameters["mode"].default,
 952        choices={"pp", "ph", "hp", "hh"},
 953        help="The type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.",
 954    )
 955
 956    call_hits_parser.add_argument(
 957        "-r",
 958        "--regions",
 959        type=str,
 960        required=True,
 961        help="A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.",
 962    )
 963    call_hits_parser.add_argument(
 964        "-m",
 965        "--modisco-h5",
 966        type=str,
 967        required=True,
 968        help="A tfmodisco-lite output H5 file of motif patterns.",
 969    )
 970
 971    call_hits_parser.add_argument(
 972        "-p",
 973        "--peaks",
 974        type=str,
 975        default=None,
 976        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 977    )
 978    call_hits_parser.add_argument(
 979        "-C",
 980        "--chrom-order",
 981        type=str,
 982        default=None,
 983        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 984    )
 985
 986    call_hits_parser.add_argument(
 987        "-I",
 988        "--motifs-include",
 989        type=str,
 990        default=None,
 991        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.",
 992    )
 993    call_hits_parser.add_argument(
 994        "-N",
 995        "--motif-names",
 996        type=str,
 997        default=None,
 998        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.",
 999    )
1000
1001    call_hits_parser.add_argument(
1002        "-o",
1003        "--out-dir",
1004        type=str,
1005        required=True,
1006        help="The path to the output directory.",
1007    )
1008
1009    call_hits_parser.add_argument(
1010        "-t",
1011        "--cwm-trim-threshold",
1012        type=float,
1013        default=inspect.signature(call_hits)
1014        .parameters["cwm_trim_threshold_default"]
1015        .default,
1016        help="The default threshold to determine motif start and end positions within the full CWMs.",
1017    )
1018    call_hits_parser.add_argument(
1019        "-T",
1020        "--cwm-trim-thresholds",
1021        type=str,
1022        default=None,
1023        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.",
1024    )
1025    call_hits_parser.add_argument(
1026        "-R",
1027        "--cwm-trim-coords",
1028        type=str,
1029        default=None,
1030        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.",
1031    )
1032
1033    call_hits_parser.add_argument(
1034        "-l",
1035        "--global-lambda",
1036        type=float,
1037        default=inspect.signature(call_hits).parameters["lambda_default"].default,
1038        help="The default L1 regularization weight determining the sparsity of hits.",
1039    )
1040    call_hits_parser.add_argument(
1041        "-L",
1042        "--motif-lambdas",
1043        type=str,
1044        default=None,
1045        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.",
1046    )
1047    call_hits_parser.add_argument(
1048        "-a",
1049        "--alpha",
1050        type=float,
1051        default=None,
1052        help="DEPRECATED: Please use the `--lambda` argument instead.",
1053    )
1054    call_hits_parser.add_argument(
1055        "-A",
1056        "--motif-alphas",
1057        type=str,
1058        default=None,
1059        help="DEPRECATED: Please use the `--motif-lambdas` argument instead.",
1060    )
1061
1062    call_hits_parser.add_argument(
1063        "-f",
1064        "--no-post-filter",
1065        action="store_true",
1066        help="Do not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.",
1067    )
1068    call_hits_parser.add_argument(
1069        "-q",
1070        "--sqrt-transform",
1071        action="store_true",
1072        help="Apply a signed square root transform to the input contributions and CWMs before hit calling.",
1073    )
1074    call_hits_parser.add_argument(
1075        "-s",
1076        "--step-size-max",
1077        type=float,
1078        default=inspect.signature(call_hits).parameters["step_size_max"].default,
1079        help="The maximum optimizer step size.",
1080    )
1081    call_hits_parser.add_argument(
1082        "-i",
1083        "--step-size-min",
1084        type=float,
1085        default=inspect.signature(call_hits).parameters["step_size_min"].default,
1086        help="The minimum optimizer step size.",
1087    )
1088    call_hits_parser.add_argument(
1089        "-j",
1090        "--step-adjust",
1091        type=float,
1092        default=inspect.signature(call_hits).parameters["step_adjust"].default,
1093        help="The optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factor",
1094    )
1095    call_hits_parser.add_argument(
1096        "-c",
1097        "--convergence-tol",
1098        type=float,
1099        default=inspect.signature(call_hits).parameters["convergence_tol"].default,
1100        help="The tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.",
1101    )
1102    call_hits_parser.add_argument(
1103        "-S",
1104        "--max-steps",
1105        type=int,
1106        default=inspect.signature(call_hits).parameters["max_steps"].default,
1107        help="The maximum number of optimization steps.",
1108    )
1109    call_hits_parser.add_argument(
1110        "-b",
1111        "--batch-size",
1112        type=int,
1113        default=inspect.signature(call_hits).parameters["batch_size"].default,
1114        help="The batch size used for optimization.",
1115    )
1116    call_hits_parser.add_argument(
1117        "-d",
1118        "--device",
1119        type=str,
1120        default=None,
1121        help="DEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.",
1122    )
1123    call_hits_parser.add_argument(
1124        "-J",
1125        "--compile",
1126        action="store_true",
1127        help="JIT-compile the optimizer for faster performance. This may not be supported on older GPUs.",
1128    )
1129
1130    report_parser = subparsers.add_parser(
1131        "report",
1132        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1133        help="Generate statistics and visualizations from hits and tfmodisco-lite motif data.",
1134    )
1135
1136    report_parser.add_argument(
1137        "-r",
1138        "--regions",
1139        type=str,
1140        required=True,
1141        help="A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.",
1142    )
1143    report_parser.add_argument(
1144        "-H",
1145        "--hits",
1146        type=str,
1147        required=True,
1148        help="The output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.",
1149    )
1150    report_parser.add_argument(
1151        "-p",
1152        "--peaks",
1153        type=str,
1154        default=None,
1155        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
1156    )
1157    report_parser.add_argument(
1158        "-m",
1159        "--modisco-h5",
1160        type=str,
1161        default=None,
1162        help="The tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.",
1163    )
1164    report_parser.add_argument(
1165        "-I",
1166        "--motifs-include",
1167        type=str,
1168        default=None,
1169        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1170    )
1171    report_parser.add_argument(
1172        "-N",
1173        "--motif-names",
1174        type=str,
1175        default=None,
1176        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1177    )
1178
1179    report_parser.add_argument(
1180        "-o",
1181        "--out-dir",
1182        type=str,
1183        required=True,
1184        help="The path to the report output directory.",
1185    )
1186
1187    report_parser.add_argument(
1188        "-W",
1189        "--modisco-region-width",
1190        type=int,
1191        default=inspect.signature(report).parameters["modisco_region_width"].default,
1192        help="The width of the region around each peak summit used by tfmodisco-lite.",
1193    )
1194    report_parser.add_argument(
1195        "-t",
1196        "--cwm-trim-threshold",
1197        type=float,
1198        default=inspect.signature(report).parameters["cwm_trim_threshold"].default,
1199        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1200    )
1201    report_parser.add_argument(
1202        "-n",
1203        "--no-recall",
1204        action="store_true",
1205        help="Do not compute motif recall metrics.",
1206    )
1207    report_parser.add_argument(
1208        "-s",
1209        "--no-seqlets",
1210        action="store_true",
1211        help="DEPRECATED: Please omit the `--modisco-h5` argument instead.",
1212    )
1213
1214    collapse_hits_parser = subparsers.add_parser(
1215        "collapse-hits",
1216        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1217        help="Identify best hit by motif similarity among sets of overlapping hits.",
1218    )
1219
1220    collapse_hits_parser.add_argument(
1221        "-i",
1222        "--hits",
1223        type=str,
1224        required=True,
1225        help="The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.",
1226    )
1227    collapse_hits_parser.add_argument(
1228        "-o",
1229        "--out-path",
1230        type=str,
1231        required=True,
1232        help='The path to the output .tsv file with an additional "is_primary" column.',
1233    )
1234    collapse_hits_parser.add_argument(
1235        "-O",
1236        "--overlap-frac",
1237        type=float,
1238        default=inspect.signature(collapse_hits).parameters["overlap_frac"].default,
1239        help="The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.",
1240    )
1241
1242    intersect_hits_parser = subparsers.add_parser(
1243        "intersect-hits",
1244        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1245        help="Intersect hits across multiple runs.",
1246    )
1247
1248    intersect_hits_parser.add_argument(
1249        "-i",
1250        "--hits",
1251        type=str,
1252        required=True,
1253        nargs="+",
1254        help="One or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.",
1255    )
1256    intersect_hits_parser.add_argument(
1257        "-o",
1258        "--out-path",
1259        type=str,
1260        required=True,
1261        help="The path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.",
1262    )
1263    intersect_hits_parser.add_argument(
1264        "-r",
1265        "--relaxed",
1266        action="store_true",
1267        help="Use relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.",
1268    )
1269
1270    args = parser.parse_args()
1271
1272    if args.cmd == "extract-regions-bw":
1273        extract_regions_bw(
1274            args.peaks,
1275            args.chrom_order,
1276            args.fasta,
1277            args.bigwigs,
1278            args.out_path,
1279            args.region_width,
1280        )
1281
1282    elif args.cmd == "extract-regions-chrombpnet-h5":
1283        extract_regions_chrombpnet_h5(
1284            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1285        )
1286
1287    elif args.cmd == "extract-regions-h5":
1288        print(
1289            "WARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead."
1290        )
1291        extract_regions_chrombpnet_h5(
1292            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1293        )
1294
1295    elif args.cmd == "extract-regions-bpnet-h5":
1296        extract_regions_bpnet_h5(
1297            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1298        )
1299
1300    elif args.cmd == "extract-regions-modisco-fmt":
1301        extract_regions_modisco_fmt(
1302            args.peaks,
1303            args.chrom_order,
1304            args.attributions,
1305            args.sequences,
1306            args.out_path,
1307            args.region_width,
1308        )
1309
1310    elif args.cmd == "call-hits":
1311        if args.alpha is not None:
1312            warnings.warn(
1313                "The `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead."
1314            )
1315            args.global_lambda = args.alpha
1316        if args.motif_alphas is not None:
1317            warnings.warn(
1318                "The `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead."
1319            )
1320            args.motif_lambdas = args.motif_alphas
1321
1322        call_hits(
1323            args.regions,
1324            args.peaks,
1325            args.modisco_h5,
1326            args.chrom_order,
1327            args.motifs_include,
1328            args.motif_names,
1329            args.motif_lambdas,
1330            args.out_dir,
1331            args.cwm_trim_coords,
1332            args.cwm_trim_thresholds,
1333            args.cwm_trim_threshold,
1334            args.global_lambda,
1335            args.step_size_max,
1336            args.step_size_min,
1337            args.sqrt_transform,
1338            args.convergence_tol,
1339            args.max_steps,
1340            args.batch_size,
1341            args.step_adjust,
1342            args.device,
1343            args.mode,
1344            args.no_post_filter,
1345            args.compile,
1346        )
1347
1348    elif args.cmd == "report":
1349        report(
1350            args.regions,
1351            args.hits,
1352            args.modisco_h5,
1353            args.peaks,
1354            args.motifs_include,
1355            args.motif_names,
1356            args.out_dir,
1357            args.modisco_region_width,
1358            args.cwm_trim_threshold,
1359            not args.no_recall,
1360            not args.no_seqlets,
1361        )
1362
1363    elif args.cmd == "collapse-hits":
1364        collapse_hits(args.hits, args.out_path, args.overlap_frac)
1365
1366    elif args.cmd == "intersect-hits":
1367        intersect_hits(args.hits, args.out_path, args.relaxed)

Command-line interface for the Fi-NeMo motif instance calling pipeline.

This function provides the main entry point for all Fi-NeMo operations including:

  • Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
  • Motif hit calling using the Fi-NeMo algorithm
  • Report generation and visualization
  • Post-processing operations (hit collapsing, intersection)