finemo.main

Main CLI module for the Fi-NeMo motif instance calling pipeline.

This module provides the command-line interface for all Fi-NeMo operations:

  • Data preprocessing from various genomic formats
  • Motif hit calling using the Fi-NeMo algorithm
  • Report generation and result visualization
  • Post-processing operations (hit collapsing, intersection)

The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet), and TF-MoDISco format.

   1"""Main CLI module for the Fi-NeMo motif instance calling pipeline.
   2
   3This module provides the command-line interface for all Fi-NeMo operations:
   4- Data preprocessing from various genomic formats
   5- Motif hit calling using the Fi-NeMo algorithm
   6- Report generation and result visualization
   7- Post-processing operations (hit collapsing, intersection)
   8
   9The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet),
  10and TF-MoDISco format.
  11"""
  12
  13from . import data_io
  14
  15import os
  16import argparse
  17import warnings
  18import inspect
  19from typing import Optional, List
  20
  21import polars as pl
  22
  23
  24def extract_regions_bw(
  25    peaks_path: str,
  26    chrom_order_path: Optional[str],
  27    fa_path: str,
  28    bw_paths: List[str],
  29    out_path: str,
  30    region_width: int = 1000,
  31) -> None:
  32    """Extract genomic regions and contribution scores from bigWig and FASTA files.
  33
  34    Parameters
  35    ----------
  36    peaks_path : str
  37        Path to ENCODE NarrowPeak format file.
  38    chrom_order_path : str, optional
  39        Path to chromosome ordering file.
  40    fa_path : str
  41        Path to genome FASTA file.
  42    bw_paths : List[str]
  43        List of bigWig file paths containing contribution scores.
  44    out_path : str
  45        Output path for NPZ file.
  46    region_width : int, default 1000
  47        Width of regions to extract around peak summits.
  48
  49    Notes
  50    -----
  51    BigWig files only provide projected contribution scores.
  52    """
  53    half_width = region_width // 2
  54
  55    # Load peak regions and extract sequences/contributions
  56    peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
  57    sequences, contribs = data_io.load_regions_from_bw(
  58        peaks_df, fa_path, bw_paths, half_width
  59    )
  60
  61    # Save processed data to NPZ format
  62    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
  63
  64
  65def extract_regions_chrombpnet_h5(
  66    peaks_path: Optional[str],
  67    chrom_order_path: Optional[str],
  68    h5_paths: List[str],
  69    out_path: str,
  70    region_width: int = 1000,
  71) -> None:
  72    """Extract genomic regions and contribution scores from ChromBPNet HDF5 files.
  73
  74    Parameters
  75    ----------
  76    peaks_path : str, optional
  77        Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
  78    chrom_order_path : str, optional
  79        Path to chromosome ordering file.
  80    h5_paths : List[str]
  81        List of ChromBPNet HDF5 file paths.
  82    out_path : str
  83        Output path for NPZ file.
  84    region_width : int, default 1000
  85        Width of regions to extract around peak summits.
  86    """
  87    half_width = region_width // 2
  88
  89    if peaks_path is not None:
  90        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
  91    else:
  92        peaks_df = None
  93
  94    sequences, contribs = data_io.load_regions_from_chrombpnet_h5(h5_paths, half_width)
  95
  96    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
  97
  98
  99def extract_regions_bpnet_h5(
 100    peaks_path: Optional[str],
 101    chrom_order_path: Optional[str],
 102    h5_paths: List[str],
 103    out_path: str,
 104    region_width: int = 1000,
 105) -> None:
 106    """Extract genomic regions and contribution scores from BPNet HDF5 files.
 107
 108    Parameters
 109    ----------
 110    peaks_path : str, optional
 111        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
 112    chrom_order_path : str, optional
 113        Path to chromosome ordering file.
 114    h5_paths : List[str]
 115        List of BPNet HDF5 file paths.
 116    out_path : str
 117        Output path for NPZ file.
 118    region_width : int, default 1000
 119        Width of regions to extract around peak summits.
 120    """
 121    half_width = region_width // 2
 122
 123    if peaks_path is not None:
 124        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
 125    else:
 126        peaks_df = None
 127
 128    sequences, contribs = data_io.load_regions_from_bpnet_h5(h5_paths, half_width)
 129
 130    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
 131
 132
 133def extract_regions_modisco_fmt(
 134    peaks_path: Optional[str],
 135    chrom_order_path: Optional[str],
 136    shaps_paths: List[str],
 137    ohe_path: str,
 138    out_path: str,
 139    region_width: int = 1000,
 140) -> None:
 141    """Extract genomic regions and contribution scores from TF-MoDISco format files.
 142
 143    Parameters
 144    ----------
 145    peaks_path : str, optional
 146        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
 147    chrom_order_path : str, optional
 148        Path to chromosome ordering file.
 149    shaps_paths : List[str]
 150        List of paths to .npy/.npz files containing SHAP/attribution scores.
 151    ohe_path : str
 152        Path to .npy/.npz file containing one-hot encoded sequences.
 153    out_path : str
 154        Output path for NPZ file.
 155    region_width : int, default 1000
 156        Width of regions to extract around peak summits.
 157    """
 158    half_width = region_width // 2
 159
 160    if peaks_path is not None:
 161        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
 162    else:
 163        peaks_df = None
 164
 165    sequences, contribs = data_io.load_regions_from_modisco_fmt(
 166        shaps_paths, ohe_path, half_width
 167    )
 168
 169    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)
 170
 171
 172def call_hits(
 173    regions_path: str,
 174    peaks_path: Optional[str],
 175    modisco_h5_path: str,
 176    chrom_order_path: Optional[str],
 177    motifs_include_path: Optional[str],
 178    motif_names_path: Optional[str],
 179    motif_lambdas_path: Optional[str],
 180    out_dir: str,
 181    cwm_trim_coords_path: Optional[str] = None,
 182    cwm_trim_thresholds_path: Optional[str] = None,
 183    cwm_trim_threshold_default: float = 0.3,
 184    lambda_default: float = 0.7,
 185    step_size_max: float = 3.0,
 186    step_size_min: float = 0.08,
 187    sqrt_transform: bool = False,
 188    convergence_tol: float = 0.0005,
 189    max_steps: int = 10000,
 190    batch_size: int = 2000,
 191    step_adjust: float = 0.7,
 192    device: Optional[str] = None,
 193    mode: str = "pp",
 194    no_post_filter: bool = False,
 195    compile_optimizer: bool = False,
 196) -> None:
 197    """Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.
 198
 199    This function implements the core Fi-NeMo hit calling pipeline, which identifies
 200    motif instances by solving a sparse reconstruction problem using proximal gradient
 201    descent. The algorithm represents contribution scores as weighted combinations of
 202    motif CWMs at specific positions.
 203
 204    Parameters
 205    ----------
 206    regions_path : str
 207        Path to NPZ file containing preprocessed regions (sequences, contributions,
 208        and optional peak coordinates).
 209    peaks_path : str, optional
 210        DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be
 211        included during preprocessing instead.
 212    modisco_h5_path : str
 213        Path to TF-MoDISco H5 file containing motif CWMs.
 214    chrom_order_path : str, optional
 215        DEPRECATED. Path to chromosome ordering file.
 216    motifs_include_path : str, optional
 217        Path to file listing motif names to include in analysis.
 218    motif_names_path : str, optional
 219        Path to file mapping motif IDs to custom names.
 220    motif_lambdas_path : str, optional
 221        Path to file specifying per-motif lambda values.
 222    out_dir : str
 223        Output directory for results.
 224    cwm_trim_coords_path : str, optional
 225        Path to file specifying custom motif trimming coordinates.
 226    cwm_trim_thresholds_path : str, optional
 227        Path to file specifying custom motif trimming thresholds.
 228    cwm_trim_threshold_default : float, default 0.3
 229        Default threshold for motif trimming.
 230    lambda_default : float, default 0.7
 231        Default L1 regularization weight.
 232    step_size_max : float, default 3.0
 233        Maximum optimization step size.
 234    step_size_min : float, default 0.08
 235        Minimum optimization step size.
 236    sqrt_transform : bool, default False
 237        Whether to apply signed square root transform to contributions.
 238    convergence_tol : float, default 0.0005
 239        Convergence tolerance for duality gap.
 240    max_steps : int, default 10000
 241        Maximum number of optimization steps.
 242    batch_size : int, default 2000
 243        Batch size for GPU processing.
 244    step_adjust : float, default 0.7
 245        Step size adjustment factor on divergence.
 246    device : str, optional
 247        DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
 248    mode : str, default "pp"
 249        Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
 250    no_post_filter : bool, default False
 251        If True, skip post-hit-calling similarity filtering.
 252    compile_optimizer : bool, default False
 253        Whether to JIT-compile the optimizer for speed.
 254
 255    Notes
 256    -----
 257    The Fi-NeMo algorithm solves the optimization problem:
 258    minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁
 259    subject to: c ≥ 0
 260
 261    where c represents motif hit coefficients and reconstruction uses convolution
 262    with motif CWMs.
 263    """
 264
 265    params = locals()
 266    import torch
 267    from . import hitcaller
 268
 269    if device is not None:
 270        warnings.warn(
 271            "The `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device."
 272        )
 273
 274    sequences, contribs, peaks_df, has_peaks = data_io.load_regions_npz(regions_path)
 275
 276    region_width = sequences.shape[2]
 277    if region_width % 2 != 0:
 278        raise ValueError(f"Region width of {region_width} is not divisible by 2.")
 279
 280    half_width = region_width // 2
 281    num_regions = contribs.shape[0]
 282
 283    if peaks_path is not None:
 284        warnings.warn(
 285            "Providing a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
 286        )
 287        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
 288        has_peaks = True
 289
 290    if not has_peaks:
 291        warnings.warn(
 292            "No peak region data provided. Output hits will lack absolute genomic coordinates."
 293        )
 294
 295    if mode == "pp":
 296        motif_type = "cwm"
 297        use_hypothetical_contribs = False
 298    elif mode == "ph":
 299        motif_type = "cwm"
 300        use_hypothetical_contribs = True
 301    elif mode == "hp":
 302        motif_type = "hcwm"
 303        use_hypothetical_contribs = False
 304    elif mode == "hh":
 305        motif_type = "hcwm"
 306        use_hypothetical_contribs = True
 307    else:
 308        raise ValueError(
 309            f"Invalid mode: {mode}. Must be one of 'pp', 'ph', 'hp', 'hh'."
 310        )
 311
 312    if motifs_include_path is not None:
 313        motifs_include = data_io.load_txt(motifs_include_path)
 314    else:
 315        motifs_include = None
 316
 317    if motif_names_path is not None:
 318        motif_name_map = data_io.load_mapping(motif_names_path, str)
 319    else:
 320        motif_name_map = None
 321
 322    if motif_lambdas_path is not None:
 323        motif_lambdas = data_io.load_mapping(motif_lambdas_path, float)
 324    else:
 325        motif_lambdas = None
 326
 327    if cwm_trim_coords_path is not None:
 328        trim_coords = data_io.load_mapping_tuple(cwm_trim_coords_path, int)
 329    else:
 330        trim_coords = None
 331
 332    if cwm_trim_thresholds_path is not None:
 333        trim_thresholds = data_io.load_mapping(cwm_trim_thresholds_path, float)
 334    else:
 335        trim_thresholds = None
 336
 337    motifs_df, cwms, trim_masks, _ = data_io.load_modisco_motifs(
 338        modisco_h5_path,
 339        trim_coords,
 340        trim_thresholds,
 341        cwm_trim_threshold_default,
 342        motif_type,
 343        motifs_include,
 344        motif_name_map,
 345        motif_lambdas,
 346        lambda_default,
 347        True,
 348    )
 349    num_motifs = cwms.shape[0]
 350    motif_width = cwms.shape[2]
 351    lambdas = motifs_df.get_column("lambda").to_numpy(writable=True)
 352
 353    device_obj = torch.device(device) if device is not None else None
 354    hits_df, qc_df = hitcaller.fit_contribs(
 355        cwms,
 356        contribs,
 357        sequences,
 358        trim_masks,
 359        use_hypothetical_contribs,
 360        lambdas,
 361        step_size_max,
 362        step_size_min,
 363        sqrt_transform,
 364        convergence_tol,
 365        max_steps,
 366        batch_size,
 367        step_adjust,
 368        not no_post_filter,
 369        device_obj,
 370        compile_optimizer,
 371    )
 372
 373    os.makedirs(out_dir, exist_ok=True)
 374    out_path_qc = os.path.join(out_dir, "peaks_qc.tsv")
 375    data_io.write_hits(hits_df, peaks_df, motifs_df, qc_df, out_dir, motif_width)
 376    data_io.write_qc(qc_df, peaks_df, out_path_qc)
 377
 378    out_path_motif_df = os.path.join(out_dir, "motif_data.tsv")
 379    data_io.write_motifs_df(motifs_df, out_path_motif_df)
 380
 381    out_path_motif_cwms = os.path.join(out_dir, "motif_cwms.npy")
 382    data_io.write_motif_cwms(cwms, out_path_motif_cwms)
 383
 384    params |= {
 385        "region_width": region_width,
 386        "num_regions": num_regions,
 387        "untrimmed_motif_width": motif_width,
 388        "num_motifs": num_motifs,
 389    }
 390
 391    out_path_params = os.path.join(out_dir, "parameters.json")
 392    data_io.write_params(params, out_path_params)
 393
 394
 395def report(
 396    regions_path: str,
 397    hits_dir: str,
 398    modisco_h5_path: Optional[str],
 399    peaks_path: Optional[str],
 400    motifs_include_path: Optional[str],
 401    motif_names_path: Optional[str],
 402    out_dir: str,
 403    modisco_region_width: int = 400,
 404    cwm_trim_threshold: float = 0.3,
 405    compute_recall: bool = True,
 406    use_seqlets: bool = True,
 407) -> None:
 408    """Generate comprehensive HTML report with statistics and visualizations.
 409
 410    This function creates detailed analysis reports comparing Fi-NeMo hit calling
 411    results with TF-MoDISco seqlets, including performance metrics, distribution
 412    plots, and motif visualization. The report provides insights into hit calling
 413    quality and motif discovery accuracy.
 414
 415    Parameters
 416    ----------
 417    regions_path : str
 418        Path to NPZ file containing the same regions used for hit calling.
 419    hits_dir : str
 420        Path to directory containing Fi-NeMo hit calling outputs.
 421    modisco_h5_path : str, optional
 422        Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
 423    peaks_path : str, optional
 424        DEPRECATED. Peak coordinates should be included in regions file.
 425    motifs_include_path : str, optional
 426        DEPRECATED. This information is inferred from hit calling outputs.
 427    motif_names_path : str, optional
 428        DEPRECATED. This information is inferred from hit calling outputs.
 429    out_dir : str
 430        Output directory for report files.
 431    modisco_region_width : int, default 400
 432        Width of regions used by TF-MoDISco (needed for coordinate conversion).
 433    cwm_trim_threshold : float, default 0.3
 434        DEPRECATED. This information is inferred from hit calling outputs.
 435    compute_recall : bool, default True
 436        Whether to compute recall metrics against TF-MoDISco seqlets.
 437    use_seqlets : bool, default True
 438        Whether to include seqlet-based comparisons in the report.
 439
 440    Notes
 441    -----
 442    The generated report includes:
 443    - Hit vs seqlet count comparisons
 444    - Motif CWM visualizations
 445    - Hit statistic distributions
 446    - Co-occurrence heatmaps
 447    - Confusion matrices for overlapping motifs
 448    """
 449    from . import evaluation, visualization
 450
 451    sequences, contribs, peaks_df, _ = data_io.load_regions_npz(regions_path)
 452    if len(contribs.shape) == 3:
 453        regions = contribs * sequences
 454    elif len(contribs.shape) == 2:
 455        regions = contribs[:, None, :] * sequences
 456    else:
 457        raise ValueError(f"Unexpected contribs shape: {contribs.shape}")
 458
 459    half_width = regions.shape[2] // 2
 460    modisco_half_width = modisco_region_width // 2
 461
 462    if peaks_path is not None:
 463        warnings.warn(
 464            "Providing a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
 465        )
 466        peaks_df = data_io.load_peaks(peaks_path, None, half_width)
 467
 468    if hits_dir.endswith(".tsv"):
 469        warnings.warn(
 470            "Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead."
 471        )
 472
 473        hits_path = hits_dir
 474
 475        hits_df = data_io.load_hits(hits_path, lazy=True)
 476
 477        if motifs_include_path is not None:
 478            motifs_include = data_io.load_txt(motifs_include_path)
 479        else:
 480            motifs_include = None
 481
 482        if motif_names_path is not None:
 483            motif_name_map = data_io.load_mapping(motif_names_path, str)
 484        else:
 485            motif_name_map = None
 486
 487        if modisco_h5_path is not None:
 488            motifs_df, cwms_modisco, _, motif_names = data_io.load_modisco_motifs(
 489                modisco_h5_path,
 490                None,
 491                None,
 492                cwm_trim_threshold,
 493                "cwm",
 494                motifs_include,
 495                motif_name_map,
 496                None,
 497                1.0,
 498                True,
 499            )
 500        else:
 501            # When no modisco_h5_path is provided in legacy TSV mode, we can't compute motifs
 502            # This will cause an error later, but that's expected behavior
 503            raise ValueError(
 504                "modisco_h5_path is required when providing a hits.tsv file directly"
 505            )
 506
 507    else:
 508        hits_df_path = os.path.join(hits_dir, "hits.tsv")
 509        hits_df = data_io.load_hits(hits_df_path, lazy=True)
 510
 511        motifs_df_path = os.path.join(hits_dir, "motif_data.tsv")
 512        motifs_df, motif_names = data_io.load_motifs_df(motifs_df_path)
 513
 514        cwms_modisco_path = os.path.join(hits_dir, "motif_cwms.npy")
 515        cwms_modisco = data_io.load_motif_cwms(cwms_modisco_path)
 516
 517        params_path = os.path.join(hits_dir, "parameters.json")
 518        params = data_io.load_params(params_path)
 519        cwm_trim_threshold = params["cwm_trim_threshold_default"]
 520
 521    if not use_seqlets:
 522        warnings.warn(
 523            "Usage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead."
 524        )
 525        seqlets_df = None
 526    elif modisco_h5_path is None:
 527        compute_recall = False
 528        seqlets_df = None
 529    else:
 530        seqlets_df = data_io.load_modisco_seqlets(
 531            modisco_h5_path,
 532            peaks_df,
 533            motifs_df,
 534            half_width,
 535            modisco_half_width,
 536            lazy=True,
 537        )
 538
 539    motif_width = cwms_modisco.shape[2]
 540
 541    # Convert to LazyFrame if needed and ensure motif_names is a list
 542    if isinstance(hits_df, pl.LazyFrame):
 543        hits_df_lazy: pl.LazyFrame = hits_df
 544    else:
 545        hits_df_lazy: pl.LazyFrame = hits_df.lazy()
 546
 547    motif_names_list: List[str] = list(motif_names)
 548
 549    occ_df, coooc = evaluation.get_motif_occurences(hits_df_lazy, motif_names_list)
 550
 551    report_data, report_df, motifs, trim_bounds = evaluation.tfmodisco_comparison(
 552        regions,
 553        sequences,
 554        hits_df,
 555        peaks_df,
 556        seqlets_df,
 557        motifs_df,
 558        cwms_modisco,
 559        motif_names_list,
 560        modisco_half_width,
 561        motif_width,
 562        compute_recall,
 563    )
 564
 565    if seqlets_df is not None:
 566        confusion_df, confusion_mat = evaluation.seqlet_confusion(
 567            hits_df, seqlets_df, peaks_df, motif_names_list, motif_width
 568        )
 569    else:
 570        confusion_df, confusion_mat = None, None
 571
 572    os.makedirs(out_dir, exist_ok=True)
 573
 574    occ_path = os.path.join(out_dir, "motif_occurrences.tsv")
 575    data_io.write_occ_df(occ_df, occ_path)
 576
 577    data_io.write_report_data(report_df, motifs, out_dir)
 578
 579    visualization.plot_hit_stat_distributions(hits_df_lazy, motif_names_list, out_dir)
 580    visualization.plot_hit_peak_distributions(occ_df, motif_names_list, out_dir)
 581    visualization.plot_peak_motif_indicator_heatmap(coooc, motif_names_list, out_dir)
 582
 583    plot_dir = os.path.join(out_dir, "motifs")
 584    visualization.plot_motifs(motifs, trim_bounds, plot_dir)
 585
 586    if seqlets_df is not None:
 587        seqlets_collected = (
 588            seqlets_df.collect() if isinstance(seqlets_df, pl.LazyFrame) else seqlets_df
 589        )
 590        seqlets_path = os.path.join(out_dir, "seqlets.tsv")
 591        data_io.write_modisco_seqlets(seqlets_collected, seqlets_path)
 592
 593        if confusion_df is not None and confusion_mat is not None:
 594            seqlet_confusion_path = os.path.join(out_dir, "seqlet_confusion.tsv")
 595            data_io.write_seqlet_confusion_df(confusion_df, seqlet_confusion_path)
 596
 597            visualization.plot_hit_vs_seqlet_counts(report_data, out_dir)
 598            visualization.plot_seqlet_confusion_heatmap(
 599                confusion_mat, motif_names_list, out_dir
 600            )
 601
 602    report_path = os.path.join(out_dir, "report.html")
 603    visualization.write_report(
 604        report_df, motif_names_list, report_path, compute_recall, seqlets_df is not None
 605    )
 606
 607
 608def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None:
 609    """Collapse overlapping hits by selecting the best hit per overlapping group.
 610
 611    This function processes a set of motif hits and identifies overlapping hits,
 612    keeping only the hit with the highest similarity score within each overlapping
 613    group. This reduces redundancy in hit calls while preserving the most confident
 614    predictions.
 615
 616    Parameters
 617    ----------
 618    hits_path : str
 619        Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
 620    out_path : str
 621        Path to output TSV file with additional 'is_primary' column.
 622    overlap_frac : float, default 0.2
 623        Minimum fractional overlap for considering hits as overlapping.
 624        For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
 625
 626    Notes
 627    -----
 628    The algorithm uses a sweep line approach with a heap data structure to
 629    efficiently identify overlapping intervals and select the best hit based
 630    on similarity scores.
 631    """
 632    from . import postprocessing
 633
 634    hits_df = data_io.load_hits(hits_path, lazy=False)
 635    hits_collapsed_df = postprocessing.collapse_hits(hits_df, overlap_frac)
 636
 637    data_io.write_hits_processed(
 638        hits_collapsed_df, out_path, schema=data_io.HITS_COLLAPSED_DTYPES
 639    )
 640
 641
 642def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None:
 643    """Find intersection of hits across multiple Fi-NeMo runs.
 644
 645    This function identifies motif hits that are consistently called across
 646    multiple independent runs, providing a way to assess reproducibility and
 647    identify high-confidence hits.
 648
 649    Parameters
 650    ----------
 651    hits_paths : List[str]
 652        List of paths to input TSV files from different runs.
 653    out_path : str
 654        Path to output TSV file containing intersection results.
 655        Duplicate columns are suffixed with run index.
 656    relaxed : bool, default False
 657        If True, uses relaxed intersection criteria based only on motif names
 658        and untrimmed coordinates. If False, assumes consistent region definitions
 659        and motif trimming across runs.
 660
 661    Notes
 662    -----
 663    The strict intersection mode requires consistent input regions and motif
 664    processing parameters across all runs. The relaxed mode is more permissive
 665    but may not be suitable when genomic coordinates are unavailable.
 666    """
 667    from . import postprocessing
 668
 669    hits_dfs = [data_io.load_hits(hits_path, lazy=False) for hits_path in hits_paths]
 670    hits_df = postprocessing.intersect_hits(hits_dfs, relaxed)
 671
 672    data_io.write_hits_processed(hits_df, out_path, schema=None)
 673
 674
 675def cli() -> None:
 676    """Command-line interface for the Fi-NeMo motif instance calling pipeline.
 677
 678    This function provides the main entry point for all Fi-NeMo operations including:
 679    - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
 680    - Motif hit calling using the Fi-NeMo algorithm
 681    - Report generation and visualization
 682    - Post-processing operations (hit collapsing, intersection)
 683    """
 684    parser = argparse.ArgumentParser()
 685    subparsers = parser.add_subparsers(required=True, dest="cmd")
 686
 687    extract_regions_bw_parser = subparsers.add_parser(
 688        "extract-regions-bw",
 689        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 690        help="Extract sequences and contributions from FASTA and bigwig files.",
 691    )
 692
 693    extract_regions_bw_parser.add_argument(
 694        "-p",
 695        "--peaks",
 696        type=str,
 697        required=True,
 698        help="A peak regions file in ENCODE NarrowPeak format.",
 699    )
 700    extract_regions_bw_parser.add_argument(
 701        "-C",
 702        "--chrom-order",
 703        type=str,
 704        default=None,
 705        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 706    )
 707    extract_regions_bw_parser.add_argument(
 708        "-f",
 709        "--fasta",
 710        type=str,
 711        required=True,
 712        help="A genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.",
 713    )
 714    extract_regions_bw_parser.add_argument(
 715        "-b",
 716        "--bigwigs",
 717        type=str,
 718        required=True,
 719        nargs="+",
 720        help="One or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 721    )
 722
 723    extract_regions_bw_parser.add_argument(
 724        "-o",
 725        "--out-path",
 726        type=str,
 727        required=True,
 728        help="The path to the output .npz file.",
 729    )
 730
 731    extract_regions_bw_parser.add_argument(
 732        "-w",
 733        "--region-width",
 734        type=int,
 735        default=inspect.signature(extract_regions_bw)
 736        .parameters["region_width"]
 737        .default,
 738        help="The width of the input region centered around each peak summit.",
 739    )
 740
 741    extract_chrombpnet_regions_h5_parser = subparsers.add_parser(
 742        "extract-regions-chrombpnet-h5",
 743        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 744        help="Extract sequences and contributions from ChromBPNet contributions H5 files.",
 745    )
 746
 747    extract_chrombpnet_regions_h5_parser.add_argument(
 748        "-p",
 749        "--peaks",
 750        type=str,
 751        default=None,
 752        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 753    )
 754    extract_chrombpnet_regions_h5_parser.add_argument(
 755        "-C",
 756        "--chrom-order",
 757        type=str,
 758        default=None,
 759        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 760    )
 761
 762    extract_chrombpnet_regions_h5_parser.add_argument(
 763        "-c",
 764        "--h5s",
 765        type=str,
 766        required=True,
 767        nargs="+",
 768        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 769    )
 770
 771    extract_chrombpnet_regions_h5_parser.add_argument(
 772        "-o",
 773        "--out-path",
 774        type=str,
 775        required=True,
 776        help="The path to the output .npz file.",
 777    )
 778
 779    extract_chrombpnet_regions_h5_parser.add_argument(
 780        "-w",
 781        "--region-width",
 782        type=int,
 783        default=inspect.signature(extract_regions_chrombpnet_h5)
 784        .parameters["region_width"]
 785        .default,
 786        help="The width of the input region centered around each peak summit.",
 787    )
 788
 789    extract_regions_h5_parser = subparsers.add_parser(
 790        "extract-regions-h5",
 791        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 792        help="Extract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.",
 793    )
 794
 795    extract_regions_h5_parser.add_argument(
 796        "-p",
 797        "--peaks",
 798        type=str,
 799        default=None,
 800        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 801    )
 802    extract_regions_h5_parser.add_argument(
 803        "-C",
 804        "--chrom-order",
 805        type=str,
 806        default=None,
 807        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 808    )
 809
 810    extract_regions_h5_parser.add_argument(
 811        "-c",
 812        "--h5s",
 813        type=str,
 814        required=True,
 815        nargs="+",
 816        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 817    )
 818
 819    extract_regions_h5_parser.add_argument(
 820        "-o",
 821        "--out-path",
 822        type=str,
 823        required=True,
 824        help="The path to the output .npz file.",
 825    )
 826
 827    extract_regions_h5_parser.add_argument(
 828        "-w",
 829        "--region-width",
 830        type=int,
 831        default=inspect.signature(extract_regions_chrombpnet_h5)
 832        .parameters["region_width"]
 833        .default,
 834        help="The width of the input region centered around each peak summit.",
 835    )
 836
 837    extract_bpnet_regions_h5_parser = subparsers.add_parser(
 838        "extract-regions-bpnet-h5",
 839        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 840        help="Extract sequences and contributions from BPNet contributions H5 files.",
 841    )
 842
 843    extract_bpnet_regions_h5_parser.add_argument(
 844        "-p",
 845        "--peaks",
 846        type=str,
 847        default=None,
 848        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 849    )
 850    extract_bpnet_regions_h5_parser.add_argument(
 851        "-C",
 852        "--chrom-order",
 853        type=str,
 854        default=None,
 855        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 856    )
 857
 858    extract_bpnet_regions_h5_parser.add_argument(
 859        "-c",
 860        "--h5s",
 861        type=str,
 862        required=True,
 863        nargs="+",
 864        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 865    )
 866
 867    extract_bpnet_regions_h5_parser.add_argument(
 868        "-o",
 869        "--out-path",
 870        type=str,
 871        required=True,
 872        help="The path to the output .npz file.",
 873    )
 874
 875    extract_bpnet_regions_h5_parser.add_argument(
 876        "-w",
 877        "--region-width",
 878        type=int,
 879        default=inspect.signature(extract_regions_bpnet_h5)
 880        .parameters["region_width"]
 881        .default,
 882        help="The width of the input region centered around each peak summit.",
 883    )
 884
 885    extract_regions_modisco_fmt_parser = subparsers.add_parser(
 886        "extract-regions-modisco-fmt",
 887        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 888        help="Extract sequences and contributions from tfmodisco-lite input files.",
 889    )
 890
 891    extract_regions_modisco_fmt_parser.add_argument(
 892        "-p",
 893        "--peaks",
 894        type=str,
 895        default=None,
 896        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 897    )
 898    extract_regions_modisco_fmt_parser.add_argument(
 899        "-C",
 900        "--chrom-order",
 901        type=str,
 902        default=None,
 903        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 904    )
 905
 906    extract_regions_modisco_fmt_parser.add_argument(
 907        "-s",
 908        "--sequences",
 909        type=str,
 910        required=True,
 911        help="A .npy or .npz file containing one-hot encoded sequences.",
 912    )
 913
 914    extract_regions_modisco_fmt_parser.add_argument(
 915        "-a",
 916        "--attributions",
 917        type=str,
 918        required=True,
 919        nargs="+",
 920        help="One or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 921    )
 922
 923    extract_regions_modisco_fmt_parser.add_argument(
 924        "-o",
 925        "--out-path",
 926        type=str,
 927        required=True,
 928        help="The path to the output .npz file.",
 929    )
 930
 931    extract_regions_modisco_fmt_parser.add_argument(
 932        "-w",
 933        "--region-width",
 934        type=int,
 935        default=inspect.signature(extract_regions_modisco_fmt)
 936        .parameters["region_width"]
 937        .default,
 938        help="The width of the input region centered around each peak summit.",
 939    )
 940
 941    call_hits_parser = subparsers.add_parser(
 942        "call-hits",
 943        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 944        help="Call hits on provided sequences, contributions, and motif CWM's.",
 945    )
 946
 947    call_hits_parser.add_argument(
 948        "-M",
 949        "--mode",
 950        type=str,
 951        default=inspect.signature(call_hits).parameters["mode"].default,
 952        choices={"pp", "ph", "hp", "hh"},
 953        help="The type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.",
 954    )
 955
 956    call_hits_parser.add_argument(
 957        "-r",
 958        "--regions",
 959        type=str,
 960        required=True,
 961        help="A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.",
 962    )
 963    call_hits_parser.add_argument(
 964        "-m",
 965        "--modisco-h5",
 966        type=str,
 967        required=True,
 968        help="A tfmodisco-lite output H5 file of motif patterns.",
 969    )
 970
 971    call_hits_parser.add_argument(
 972        "-p",
 973        "--peaks",
 974        type=str,
 975        default=None,
 976        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 977    )
 978    call_hits_parser.add_argument(
 979        "-C",
 980        "--chrom-order",
 981        type=str,
 982        default=None,
 983        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 984    )
 985
 986    call_hits_parser.add_argument(
 987        "-I",
 988        "--motifs-include",
 989        type=str,
 990        default=None,
 991        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.",
 992    )
 993    call_hits_parser.add_argument(
 994        "-N",
 995        "--motif-names",
 996        type=str,
 997        default=None,
 998        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.",
 999    )
1000
1001    call_hits_parser.add_argument(
1002        "-o",
1003        "--out-dir",
1004        type=str,
1005        required=True,
1006        help="The path to the output directory.",
1007    )
1008
1009    call_hits_parser.add_argument(
1010        "-t",
1011        "--cwm-trim-threshold",
1012        type=float,
1013        default=inspect.signature(call_hits)
1014        .parameters["cwm_trim_threshold_default"]
1015        .default,
1016        help="The default threshold to determine motif start and end positions within the full CWMs.",
1017    )
1018    call_hits_parser.add_argument(
1019        "-T",
1020        "--cwm-trim-thresholds",
1021        type=str,
1022        default=None,
1023        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.",
1024    )
1025    call_hits_parser.add_argument(
1026        "-R",
1027        "--cwm-trim-coords",
1028        type=str,
1029        default=None,
1030        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.",
1031    )
1032
1033    call_hits_parser.add_argument(
1034        "-l",
1035        "--global-lambda",
1036        type=float,
1037        default=inspect.signature(call_hits).parameters["lambda_default"].default,
1038        help="The default L1 regularization weight determining the sparsity of hits.",
1039    )
1040    call_hits_parser.add_argument(
1041        "-L",
1042        "--motif-lambdas",
1043        type=str,
1044        default=None,
1045        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.",
1046    )
1047    call_hits_parser.add_argument(
1048        "-a",
1049        "--alpha",
1050        type=float,
1051        default=None,
1052        help="DEPRECATED: Please use the `--lambda` argument instead.",
1053    )
1054    call_hits_parser.add_argument(
1055        "-A",
1056        "--motif-alphas",
1057        type=str,
1058        default=None,
1059        help="DEPRECATED: Please use the `--motif-lambdas` argument instead.",
1060    )
1061
1062    call_hits_parser.add_argument(
1063        "-f",
1064        "--no-post-filter",
1065        action="store_true",
1066        help="Do not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.",
1067    )
1068    call_hits_parser.add_argument(
1069        "-q",
1070        "--sqrt-transform",
1071        action="store_true",
1072        help="Apply a signed square root transform to the input contributions and CWMs before hit calling.",
1073    )
1074    call_hits_parser.add_argument(
1075        "-s",
1076        "--step-size-max",
1077        type=float,
1078        default=inspect.signature(call_hits).parameters["step_size_max"].default,
1079        help="The maximum optimizer step size.",
1080    )
1081    call_hits_parser.add_argument(
1082        "-i",
1083        "--step-size-min",
1084        type=float,
1085        default=inspect.signature(call_hits).parameters["step_size_min"].default,
1086        help="The minimum optimizer step size.",
1087    )
1088    call_hits_parser.add_argument(
1089        "-j",
1090        "--step-adjust",
1091        type=float,
1092        default=inspect.signature(call_hits).parameters["step_adjust"].default,
1093        help="The optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factor",
1094    )
1095    call_hits_parser.add_argument(
1096        "-c",
1097        "--convergence-tol",
1098        type=float,
1099        default=inspect.signature(call_hits).parameters["convergence_tol"].default,
1100        help="The tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.",
1101    )
1102    call_hits_parser.add_argument(
1103        "-S",
1104        "--max-steps",
1105        type=int,
1106        default=inspect.signature(call_hits).parameters["max_steps"].default,
1107        help="The maximum number of optimization steps.",
1108    )
1109    call_hits_parser.add_argument(
1110        "-b",
1111        "--batch-size",
1112        type=int,
1113        default=inspect.signature(call_hits).parameters["batch_size"].default,
1114        help="The batch size used for optimization.",
1115    )
1116    call_hits_parser.add_argument(
1117        "-d",
1118        "--device",
1119        type=str,
1120        default=None,
1121        help="DEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.",
1122    )
1123    call_hits_parser.add_argument(
1124        "-J",
1125        "--compile",
1126        action="store_true",
1127        help="JIT-compile the optimizer for faster performance. This may not be supported on older GPUs.",
1128    )
1129
1130    report_parser = subparsers.add_parser(
1131        "report",
1132        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1133        help="Generate statistics and visualizations from hits and tfmodisco-lite motif data.",
1134    )
1135
1136    report_parser.add_argument(
1137        "-r",
1138        "--regions",
1139        type=str,
1140        required=True,
1141        help="A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.",
1142    )
1143    report_parser.add_argument(
1144        "-H",
1145        "--hits",
1146        type=str,
1147        required=True,
1148        help="The output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.",
1149    )
1150    report_parser.add_argument(
1151        "-p",
1152        "--peaks",
1153        type=str,
1154        default=None,
1155        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
1156    )
1157    report_parser.add_argument(
1158        "-m",
1159        "--modisco-h5",
1160        type=str,
1161        default=None,
1162        help="The tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.",
1163    )
1164    report_parser.add_argument(
1165        "-I",
1166        "--motifs-include",
1167        type=str,
1168        default=None,
1169        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1170    )
1171    report_parser.add_argument(
1172        "-N",
1173        "--motif-names",
1174        type=str,
1175        default=None,
1176        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1177    )
1178
1179    report_parser.add_argument(
1180        "-o",
1181        "--out-dir",
1182        type=str,
1183        required=True,
1184        help="The path to the report output directory.",
1185    )
1186
1187    report_parser.add_argument(
1188        "-W",
1189        "--modisco-region-width",
1190        type=int,
1191        default=inspect.signature(report).parameters["modisco_region_width"].default,
1192        help="The width of the region around each peak summit used by tfmodisco-lite.",
1193    )
1194    report_parser.add_argument(
1195        "-t",
1196        "--cwm-trim-threshold",
1197        type=float,
1198        default=inspect.signature(report).parameters["cwm_trim_threshold"].default,
1199        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1200    )
1201    report_parser.add_argument(
1202        "-n",
1203        "--no-recall",
1204        action="store_true",
1205        help="Do not compute motif recall metrics.",
1206    )
1207    report_parser.add_argument(
1208        "-s",
1209        "--no-seqlets",
1210        action="store_true",
1211        help="DEPRECATED: Please omit the `--modisco-h5` argument instead.",
1212    )
1213
1214    collapse_hits_parser = subparsers.add_parser(
1215        "collapse-hits",
1216        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1217        help="Identify best hit by motif similarity among sets of overlapping hits.",
1218    )
1219
1220    collapse_hits_parser.add_argument(
1221        "-i",
1222        "--hits",
1223        type=str,
1224        required=True,
1225        help="The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.",
1226    )
1227    collapse_hits_parser.add_argument(
1228        "-o",
1229        "--out-path",
1230        type=str,
1231        required=True,
1232        help='The path to the output .tsv file with an additional "is_primary" column.',
1233    )
1234    collapse_hits_parser.add_argument(
1235        "-O",
1236        "--overlap-frac",
1237        type=float,
1238        default=inspect.signature(collapse_hits).parameters["overlap_frac"].default,
1239        help="The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.",
1240    )
1241
1242    intersect_hits_parser = subparsers.add_parser(
1243        "intersect-hits",
1244        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1245        help="Intersect hits across multiple runs.",
1246    )
1247
1248    intersect_hits_parser.add_argument(
1249        "-i",
1250        "--hits",
1251        type=str,
1252        required=True,
1253        nargs="+",
1254        help="One or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.",
1255    )
1256    intersect_hits_parser.add_argument(
1257        "-o",
1258        "--out-path",
1259        type=str,
1260        required=True,
1261        help="The path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.",
1262    )
1263    intersect_hits_parser.add_argument(
1264        "-r",
1265        "--relaxed",
1266        action="store_true",
1267        help="Use relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.",
1268    )
1269
1270    args = parser.parse_args()
1271
1272    if args.cmd == "extract-regions-bw":
1273        extract_regions_bw(
1274            args.peaks,
1275            args.chrom_order,
1276            args.fasta,
1277            args.bigwigs,
1278            args.out_path,
1279            args.region_width,
1280        )
1281
1282    elif args.cmd == "extract-regions-chrombpnet-h5":
1283        extract_regions_chrombpnet_h5(
1284            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1285        )
1286
1287    elif args.cmd == "extract-regions-h5":
1288        print(
1289            "WARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead."
1290        )
1291        extract_regions_chrombpnet_h5(
1292            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1293        )
1294
1295    elif args.cmd == "extract-regions-bpnet-h5":
1296        extract_regions_bpnet_h5(
1297            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1298        )
1299
1300    elif args.cmd == "extract-regions-modisco-fmt":
1301        extract_regions_modisco_fmt(
1302            args.peaks,
1303            args.chrom_order,
1304            args.attributions,
1305            args.sequences,
1306            args.out_path,
1307            args.region_width,
1308        )
1309
1310    elif args.cmd == "call-hits":
1311        if args.alpha is not None:
1312            warnings.warn(
1313                "The `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead."
1314            )
1315            args.global_lambda = args.alpha
1316        if args.motif_alphas is not None:
1317            warnings.warn(
1318                "The `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead."
1319            )
1320            args.motif_lambdas = args.motif_alphas
1321
1322        call_hits(
1323            args.regions,
1324            args.peaks,
1325            args.modisco_h5,
1326            args.chrom_order,
1327            args.motifs_include,
1328            args.motif_names,
1329            args.motif_lambdas,
1330            args.out_dir,
1331            args.cwm_trim_coords,
1332            args.cwm_trim_thresholds,
1333            args.cwm_trim_threshold,
1334            args.global_lambda,
1335            args.step_size_max,
1336            args.step_size_min,
1337            args.sqrt_transform,
1338            args.convergence_tol,
1339            args.max_steps,
1340            args.batch_size,
1341            args.step_adjust,
1342            args.device,
1343            args.mode,
1344            args.no_post_filter,
1345            args.compile,
1346        )
1347
1348    elif args.cmd == "report":
1349        report(
1350            args.regions,
1351            args.hits,
1352            args.modisco_h5,
1353            args.peaks,
1354            args.motifs_include,
1355            args.motif_names,
1356            args.out_dir,
1357            args.modisco_region_width,
1358            args.cwm_trim_threshold,
1359            not args.no_recall,
1360            not args.no_seqlets,
1361        )
1362
1363    elif args.cmd == "collapse-hits":
1364        collapse_hits(args.hits, args.out_path, args.overlap_frac)
1365
1366    elif args.cmd == "intersect-hits":
1367        intersect_hits(args.hits, args.out_path, args.relaxed)
def extract_regions_bw( peaks_path: str, chrom_order_path: Optional[str], fa_path: str, bw_paths: List[str], out_path: str, region_width: int = 1000) -> None:
25def extract_regions_bw(
26    peaks_path: str,
27    chrom_order_path: Optional[str],
28    fa_path: str,
29    bw_paths: List[str],
30    out_path: str,
31    region_width: int = 1000,
32) -> None:
33    """Extract genomic regions and contribution scores from bigWig and FASTA files.
34
35    Parameters
36    ----------
37    peaks_path : str
38        Path to ENCODE NarrowPeak format file.
39    chrom_order_path : str, optional
40        Path to chromosome ordering file.
41    fa_path : str
42        Path to genome FASTA file.
43    bw_paths : List[str]
44        List of bigWig file paths containing contribution scores.
45    out_path : str
46        Output path for NPZ file.
47    region_width : int, default 1000
48        Width of regions to extract around peak summits.
49
50    Notes
51    -----
52    BigWig files only provide projected contribution scores.
53    """
54    half_width = region_width // 2
55
56    # Load peak regions and extract sequences/contributions
57    peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
58    sequences, contribs = data_io.load_regions_from_bw(
59        peaks_df, fa_path, bw_paths, half_width
60    )
61
62    # Save processed data to NPZ format
63    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from bigWig and FASTA files.

Parameters
  • peaks_path (str): Path to ENCODE NarrowPeak format file.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • fa_path (str): Path to genome FASTA file.
  • bw_paths (List[str]): List of bigWig file paths containing contribution scores.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
Notes

BigWig files only provide projected contribution scores.

def extract_regions_chrombpnet_h5( peaks_path: Optional[str], chrom_order_path: Optional[str], h5_paths: List[str], out_path: str, region_width: int = 1000) -> None:
66def extract_regions_chrombpnet_h5(
67    peaks_path: Optional[str],
68    chrom_order_path: Optional[str],
69    h5_paths: List[str],
70    out_path: str,
71    region_width: int = 1000,
72) -> None:
73    """Extract genomic regions and contribution scores from ChromBPNet HDF5 files.
74
75    Parameters
76    ----------
77    peaks_path : str, optional
78        Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
79    chrom_order_path : str, optional
80        Path to chromosome ordering file.
81    h5_paths : List[str]
82        List of ChromBPNet HDF5 file paths.
83    out_path : str
84        Output path for NPZ file.
85    region_width : int, default 1000
86        Width of regions to extract around peak summits.
87    """
88    half_width = region_width // 2
89
90    if peaks_path is not None:
91        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
92    else:
93        peaks_df = None
94
95    sequences, contribs = data_io.load_regions_from_chrombpnet_h5(h5_paths, half_width)
96
97    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from ChromBPNet HDF5 files.

Parameters
  • peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • h5_paths (List[str]): List of ChromBPNet HDF5 file paths.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
def extract_regions_bpnet_h5( peaks_path: Optional[str], chrom_order_path: Optional[str], h5_paths: List[str], out_path: str, region_width: int = 1000) -> None:
100def extract_regions_bpnet_h5(
101    peaks_path: Optional[str],
102    chrom_order_path: Optional[str],
103    h5_paths: List[str],
104    out_path: str,
105    region_width: int = 1000,
106) -> None:
107    """Extract genomic regions and contribution scores from BPNet HDF5 files.
108
109    Parameters
110    ----------
111    peaks_path : str, optional
112        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
113    chrom_order_path : str, optional
114        Path to chromosome ordering file.
115    h5_paths : List[str]
116        List of BPNet HDF5 file paths.
117    out_path : str
118        Output path for NPZ file.
119    region_width : int, default 1000
120        Width of regions to extract around peak summits.
121    """
122    half_width = region_width // 2
123
124    if peaks_path is not None:
125        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
126    else:
127        peaks_df = None
128
129    sequences, contribs = data_io.load_regions_from_bpnet_h5(h5_paths, half_width)
130
131    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from BPNet HDF5 files.

Parameters
  • peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • h5_paths (List[str]): List of BPNet HDF5 file paths.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
def extract_regions_modisco_fmt( peaks_path: Optional[str], chrom_order_path: Optional[str], shaps_paths: List[str], ohe_path: str, out_path: str, region_width: int = 1000) -> None:
134def extract_regions_modisco_fmt(
135    peaks_path: Optional[str],
136    chrom_order_path: Optional[str],
137    shaps_paths: List[str],
138    ohe_path: str,
139    out_path: str,
140    region_width: int = 1000,
141) -> None:
142    """Extract genomic regions and contribution scores from TF-MoDISco format files.
143
144    Parameters
145    ----------
146    peaks_path : str, optional
147        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
148    chrom_order_path : str, optional
149        Path to chromosome ordering file.
150    shaps_paths : List[str]
151        List of paths to .npy/.npz files containing SHAP/attribution scores.
152    ohe_path : str
153        Path to .npy/.npz file containing one-hot encoded sequences.
154    out_path : str
155        Output path for NPZ file.
156    region_width : int, default 1000
157        Width of regions to extract around peak summits.
158    """
159    half_width = region_width // 2
160
161    if peaks_path is not None:
162        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
163    else:
164        peaks_df = None
165
166    sequences, contribs = data_io.load_regions_from_modisco_fmt(
167        shaps_paths, ohe_path, half_width
168    )
169
170    data_io.write_regions_npz(sequences, contribs, out_path, peaks_df=peaks_df)

Extract genomic regions and contribution scores from TF-MoDISco format files.

Parameters
  • peaks_path (str, optional): Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
  • chrom_order_path (str, optional): Path to chromosome ordering file.
  • shaps_paths (List[str]): List of paths to .npy/.npz files containing SHAP/attribution scores.
  • ohe_path (str): Path to .npy/.npz file containing one-hot encoded sequences.
  • out_path (str): Output path for NPZ file.
  • region_width (int, default 1000): Width of regions to extract around peak summits.
def call_hits( regions_path: str, peaks_path: Optional[str], modisco_h5_path: str, chrom_order_path: Optional[str], motifs_include_path: Optional[str], motif_names_path: Optional[str], motif_lambdas_path: Optional[str], out_dir: str, cwm_trim_coords_path: Optional[str] = None, cwm_trim_thresholds_path: Optional[str] = None, cwm_trim_threshold_default: float = 0.3, lambda_default: float = 0.7, step_size_max: float = 3.0, step_size_min: float = 0.08, sqrt_transform: bool = False, convergence_tol: float = 0.0005, max_steps: int = 10000, batch_size: int = 2000, step_adjust: float = 0.7, device: Optional[str] = None, mode: str = 'pp', no_post_filter: bool = False, compile_optimizer: bool = False) -> None:
173def call_hits(
174    regions_path: str,
175    peaks_path: Optional[str],
176    modisco_h5_path: str,
177    chrom_order_path: Optional[str],
178    motifs_include_path: Optional[str],
179    motif_names_path: Optional[str],
180    motif_lambdas_path: Optional[str],
181    out_dir: str,
182    cwm_trim_coords_path: Optional[str] = None,
183    cwm_trim_thresholds_path: Optional[str] = None,
184    cwm_trim_threshold_default: float = 0.3,
185    lambda_default: float = 0.7,
186    step_size_max: float = 3.0,
187    step_size_min: float = 0.08,
188    sqrt_transform: bool = False,
189    convergence_tol: float = 0.0005,
190    max_steps: int = 10000,
191    batch_size: int = 2000,
192    step_adjust: float = 0.7,
193    device: Optional[str] = None,
194    mode: str = "pp",
195    no_post_filter: bool = False,
196    compile_optimizer: bool = False,
197) -> None:
198    """Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.
199
200    This function implements the core Fi-NeMo hit calling pipeline, which identifies
201    motif instances by solving a sparse reconstruction problem using proximal gradient
202    descent. The algorithm represents contribution scores as weighted combinations of
203    motif CWMs at specific positions.
204
205    Parameters
206    ----------
207    regions_path : str
208        Path to NPZ file containing preprocessed regions (sequences, contributions,
209        and optional peak coordinates).
210    peaks_path : str, optional
211        DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be
212        included during preprocessing instead.
213    modisco_h5_path : str
214        Path to TF-MoDISco H5 file containing motif CWMs.
215    chrom_order_path : str, optional
216        DEPRECATED. Path to chromosome ordering file.
217    motifs_include_path : str, optional
218        Path to file listing motif names to include in analysis.
219    motif_names_path : str, optional
220        Path to file mapping motif IDs to custom names.
221    motif_lambdas_path : str, optional
222        Path to file specifying per-motif lambda values.
223    out_dir : str
224        Output directory for results.
225    cwm_trim_coords_path : str, optional
226        Path to file specifying custom motif trimming coordinates.
227    cwm_trim_thresholds_path : str, optional
228        Path to file specifying custom motif trimming thresholds.
229    cwm_trim_threshold_default : float, default 0.3
230        Default threshold for motif trimming.
231    lambda_default : float, default 0.7
232        Default L1 regularization weight.
233    step_size_max : float, default 3.0
234        Maximum optimization step size.
235    step_size_min : float, default 0.08
236        Minimum optimization step size.
237    sqrt_transform : bool, default False
238        Whether to apply signed square root transform to contributions.
239    convergence_tol : float, default 0.0005
240        Convergence tolerance for duality gap.
241    max_steps : int, default 10000
242        Maximum number of optimization steps.
243    batch_size : int, default 2000
244        Batch size for GPU processing.
245    step_adjust : float, default 0.7
246        Step size adjustment factor on divergence.
247    device : str, optional
248        DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
249    mode : str, default "pp"
250        Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
251    no_post_filter : bool, default False
252        If True, skip post-hit-calling similarity filtering.
253    compile_optimizer : bool, default False
254        Whether to JIT-compile the optimizer for speed.
255
256    Notes
257    -----
258    The Fi-NeMo algorithm solves the optimization problem:
259    minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁
260    subject to: c ≥ 0
261
262    where c represents motif hit coefficients and reconstruction uses convolution
263    with motif CWMs.
264    """
265
266    params = locals()
267    import torch
268    from . import hitcaller
269
270    if device is not None:
271        warnings.warn(
272            "The `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device."
273        )
274
275    sequences, contribs, peaks_df, has_peaks = data_io.load_regions_npz(regions_path)
276
277    region_width = sequences.shape[2]
278    if region_width % 2 != 0:
279        raise ValueError(f"Region width of {region_width} is not divisible by 2.")
280
281    half_width = region_width // 2
282    num_regions = contribs.shape[0]
283
284    if peaks_path is not None:
285        warnings.warn(
286            "Providing a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
287        )
288        peaks_df = data_io.load_peaks(peaks_path, chrom_order_path, half_width)
289        has_peaks = True
290
291    if not has_peaks:
292        warnings.warn(
293            "No peak region data provided. Output hits will lack absolute genomic coordinates."
294        )
295
296    if mode == "pp":
297        motif_type = "cwm"
298        use_hypothetical_contribs = False
299    elif mode == "ph":
300        motif_type = "cwm"
301        use_hypothetical_contribs = True
302    elif mode == "hp":
303        motif_type = "hcwm"
304        use_hypothetical_contribs = False
305    elif mode == "hh":
306        motif_type = "hcwm"
307        use_hypothetical_contribs = True
308    else:
309        raise ValueError(
310            f"Invalid mode: {mode}. Must be one of 'pp', 'ph', 'hp', 'hh'."
311        )
312
313    if motifs_include_path is not None:
314        motifs_include = data_io.load_txt(motifs_include_path)
315    else:
316        motifs_include = None
317
318    if motif_names_path is not None:
319        motif_name_map = data_io.load_mapping(motif_names_path, str)
320    else:
321        motif_name_map = None
322
323    if motif_lambdas_path is not None:
324        motif_lambdas = data_io.load_mapping(motif_lambdas_path, float)
325    else:
326        motif_lambdas = None
327
328    if cwm_trim_coords_path is not None:
329        trim_coords = data_io.load_mapping_tuple(cwm_trim_coords_path, int)
330    else:
331        trim_coords = None
332
333    if cwm_trim_thresholds_path is not None:
334        trim_thresholds = data_io.load_mapping(cwm_trim_thresholds_path, float)
335    else:
336        trim_thresholds = None
337
338    motifs_df, cwms, trim_masks, _ = data_io.load_modisco_motifs(
339        modisco_h5_path,
340        trim_coords,
341        trim_thresholds,
342        cwm_trim_threshold_default,
343        motif_type,
344        motifs_include,
345        motif_name_map,
346        motif_lambdas,
347        lambda_default,
348        True,
349    )
350    num_motifs = cwms.shape[0]
351    motif_width = cwms.shape[2]
352    lambdas = motifs_df.get_column("lambda").to_numpy(writable=True)
353
354    device_obj = torch.device(device) if device is not None else None
355    hits_df, qc_df = hitcaller.fit_contribs(
356        cwms,
357        contribs,
358        sequences,
359        trim_masks,
360        use_hypothetical_contribs,
361        lambdas,
362        step_size_max,
363        step_size_min,
364        sqrt_transform,
365        convergence_tol,
366        max_steps,
367        batch_size,
368        step_adjust,
369        not no_post_filter,
370        device_obj,
371        compile_optimizer,
372    )
373
374    os.makedirs(out_dir, exist_ok=True)
375    out_path_qc = os.path.join(out_dir, "peaks_qc.tsv")
376    data_io.write_hits(hits_df, peaks_df, motifs_df, qc_df, out_dir, motif_width)
377    data_io.write_qc(qc_df, peaks_df, out_path_qc)
378
379    out_path_motif_df = os.path.join(out_dir, "motif_data.tsv")
380    data_io.write_motifs_df(motifs_df, out_path_motif_df)
381
382    out_path_motif_cwms = os.path.join(out_dir, "motif_cwms.npy")
383    data_io.write_motif_cwms(cwms, out_path_motif_cwms)
384
385    params |= {
386        "region_width": region_width,
387        "num_regions": num_regions,
388        "untrimmed_motif_width": motif_width,
389        "num_motifs": num_motifs,
390    }
391
392    out_path_params = os.path.join(out_dir, "parameters.json")
393    data_io.write_params(params, out_path_params)

Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.

This function implements the core Fi-NeMo hit calling pipeline, which identifies motif instances by solving a sparse reconstruction problem using proximal gradient descent. The algorithm represents contribution scores as weighted combinations of motif CWMs at specific positions.

Parameters
  • regions_path (str): Path to NPZ file containing preprocessed regions (sequences, contributions, and optional peak coordinates).
  • peaks_path (str, optional): DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be included during preprocessing instead.
  • modisco_h5_path (str): Path to TF-MoDISco H5 file containing motif CWMs.
  • chrom_order_path (str, optional): DEPRECATED. Path to chromosome ordering file.
  • motifs_include_path (str, optional): Path to file listing motif names to include in analysis.
  • motif_names_path (str, optional): Path to file mapping motif IDs to custom names.
  • motif_lambdas_path (str, optional): Path to file specifying per-motif lambda values.
  • out_dir (str): Output directory for results.
  • cwm_trim_coords_path (str, optional): Path to file specifying custom motif trimming coordinates.
  • cwm_trim_thresholds_path (str, optional): Path to file specifying custom motif trimming thresholds.
  • cwm_trim_threshold_default (float, default 0.3): Default threshold for motif trimming.
  • lambda_default (float, default 0.7): Default L1 regularization weight.
  • step_size_max (float, default 3.0): Maximum optimization step size.
  • step_size_min (float, default 0.08): Minimum optimization step size.
  • sqrt_transform (bool, default False): Whether to apply signed square root transform to contributions.
  • convergence_tol (float, default 0.0005): Convergence tolerance for duality gap.
  • max_steps (int, default 10000): Maximum number of optimization steps.
  • batch_size (int, default 2000): Batch size for GPU processing.
  • step_adjust (float, default 0.7): Step size adjustment factor on divergence.
  • device (str, optional): DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
  • mode (str, default "pp"): Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
  • no_post_filter (bool, default False): If True, skip post-hit-calling similarity filtering.
  • compile_optimizer (bool, default False): Whether to JIT-compile the optimizer for speed.
Notes

The Fi-NeMo algorithm solves the optimization problem: minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁ subject to: c ≥ 0

where c represents motif hit coefficients and reconstruction uses convolution with motif CWMs.

def report( regions_path: str, hits_dir: str, modisco_h5_path: Optional[str], peaks_path: Optional[str], motifs_include_path: Optional[str], motif_names_path: Optional[str], out_dir: str, modisco_region_width: int = 400, cwm_trim_threshold: float = 0.3, compute_recall: bool = True, use_seqlets: bool = True) -> None:
396def report(
397    regions_path: str,
398    hits_dir: str,
399    modisco_h5_path: Optional[str],
400    peaks_path: Optional[str],
401    motifs_include_path: Optional[str],
402    motif_names_path: Optional[str],
403    out_dir: str,
404    modisco_region_width: int = 400,
405    cwm_trim_threshold: float = 0.3,
406    compute_recall: bool = True,
407    use_seqlets: bool = True,
408) -> None:
409    """Generate comprehensive HTML report with statistics and visualizations.
410
411    This function creates detailed analysis reports comparing Fi-NeMo hit calling
412    results with TF-MoDISco seqlets, including performance metrics, distribution
413    plots, and motif visualization. The report provides insights into hit calling
414    quality and motif discovery accuracy.
415
416    Parameters
417    ----------
418    regions_path : str
419        Path to NPZ file containing the same regions used for hit calling.
420    hits_dir : str
421        Path to directory containing Fi-NeMo hit calling outputs.
422    modisco_h5_path : str, optional
423        Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
424    peaks_path : str, optional
425        DEPRECATED. Peak coordinates should be included in regions file.
426    motifs_include_path : str, optional
427        DEPRECATED. This information is inferred from hit calling outputs.
428    motif_names_path : str, optional
429        DEPRECATED. This information is inferred from hit calling outputs.
430    out_dir : str
431        Output directory for report files.
432    modisco_region_width : int, default 400
433        Width of regions used by TF-MoDISco (needed for coordinate conversion).
434    cwm_trim_threshold : float, default 0.3
435        DEPRECATED. This information is inferred from hit calling outputs.
436    compute_recall : bool, default True
437        Whether to compute recall metrics against TF-MoDISco seqlets.
438    use_seqlets : bool, default True
439        Whether to include seqlet-based comparisons in the report.
440
441    Notes
442    -----
443    The generated report includes:
444    - Hit vs seqlet count comparisons
445    - Motif CWM visualizations
446    - Hit statistic distributions
447    - Co-occurrence heatmaps
448    - Confusion matrices for overlapping motifs
449    """
450    from . import evaluation, visualization
451
452    sequences, contribs, peaks_df, _ = data_io.load_regions_npz(regions_path)
453    if len(contribs.shape) == 3:
454        regions = contribs * sequences
455    elif len(contribs.shape) == 2:
456        regions = contribs[:, None, :] * sequences
457    else:
458        raise ValueError(f"Unexpected contribs shape: {contribs.shape}")
459
460    half_width = regions.shape[2] // 2
461    modisco_half_width = modisco_region_width // 2
462
463    if peaks_path is not None:
464        warnings.warn(
465            "Providing a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`."
466        )
467        peaks_df = data_io.load_peaks(peaks_path, None, half_width)
468
469    if hits_dir.endswith(".tsv"):
470        warnings.warn(
471            "Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead."
472        )
473
474        hits_path = hits_dir
475
476        hits_df = data_io.load_hits(hits_path, lazy=True)
477
478        if motifs_include_path is not None:
479            motifs_include = data_io.load_txt(motifs_include_path)
480        else:
481            motifs_include = None
482
483        if motif_names_path is not None:
484            motif_name_map = data_io.load_mapping(motif_names_path, str)
485        else:
486            motif_name_map = None
487
488        if modisco_h5_path is not None:
489            motifs_df, cwms_modisco, _, motif_names = data_io.load_modisco_motifs(
490                modisco_h5_path,
491                None,
492                None,
493                cwm_trim_threshold,
494                "cwm",
495                motifs_include,
496                motif_name_map,
497                None,
498                1.0,
499                True,
500            )
501        else:
502            # When no modisco_h5_path is provided in legacy TSV mode, we can't compute motifs
503            # This will cause an error later, but that's expected behavior
504            raise ValueError(
505                "modisco_h5_path is required when providing a hits.tsv file directly"
506            )
507
508    else:
509        hits_df_path = os.path.join(hits_dir, "hits.tsv")
510        hits_df = data_io.load_hits(hits_df_path, lazy=True)
511
512        motifs_df_path = os.path.join(hits_dir, "motif_data.tsv")
513        motifs_df, motif_names = data_io.load_motifs_df(motifs_df_path)
514
515        cwms_modisco_path = os.path.join(hits_dir, "motif_cwms.npy")
516        cwms_modisco = data_io.load_motif_cwms(cwms_modisco_path)
517
518        params_path = os.path.join(hits_dir, "parameters.json")
519        params = data_io.load_params(params_path)
520        cwm_trim_threshold = params["cwm_trim_threshold_default"]
521
522    if not use_seqlets:
523        warnings.warn(
524            "Usage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead."
525        )
526        seqlets_df = None
527    elif modisco_h5_path is None:
528        compute_recall = False
529        seqlets_df = None
530    else:
531        seqlets_df = data_io.load_modisco_seqlets(
532            modisco_h5_path,
533            peaks_df,
534            motifs_df,
535            half_width,
536            modisco_half_width,
537            lazy=True,
538        )
539
540    motif_width = cwms_modisco.shape[2]
541
542    # Convert to LazyFrame if needed and ensure motif_names is a list
543    if isinstance(hits_df, pl.LazyFrame):
544        hits_df_lazy: pl.LazyFrame = hits_df
545    else:
546        hits_df_lazy: pl.LazyFrame = hits_df.lazy()
547
548    motif_names_list: List[str] = list(motif_names)
549
550    occ_df, coooc = evaluation.get_motif_occurences(hits_df_lazy, motif_names_list)
551
552    report_data, report_df, motifs, trim_bounds = evaluation.tfmodisco_comparison(
553        regions,
554        sequences,
555        hits_df,
556        peaks_df,
557        seqlets_df,
558        motifs_df,
559        cwms_modisco,
560        motif_names_list,
561        modisco_half_width,
562        motif_width,
563        compute_recall,
564    )
565
566    if seqlets_df is not None:
567        confusion_df, confusion_mat = evaluation.seqlet_confusion(
568            hits_df, seqlets_df, peaks_df, motif_names_list, motif_width
569        )
570    else:
571        confusion_df, confusion_mat = None, None
572
573    os.makedirs(out_dir, exist_ok=True)
574
575    occ_path = os.path.join(out_dir, "motif_occurrences.tsv")
576    data_io.write_occ_df(occ_df, occ_path)
577
578    data_io.write_report_data(report_df, motifs, out_dir)
579
580    visualization.plot_hit_stat_distributions(hits_df_lazy, motif_names_list, out_dir)
581    visualization.plot_hit_peak_distributions(occ_df, motif_names_list, out_dir)
582    visualization.plot_peak_motif_indicator_heatmap(coooc, motif_names_list, out_dir)
583
584    plot_dir = os.path.join(out_dir, "motifs")
585    visualization.plot_motifs(motifs, trim_bounds, plot_dir)
586
587    if seqlets_df is not None:
588        seqlets_collected = (
589            seqlets_df.collect() if isinstance(seqlets_df, pl.LazyFrame) else seqlets_df
590        )
591        seqlets_path = os.path.join(out_dir, "seqlets.tsv")
592        data_io.write_modisco_seqlets(seqlets_collected, seqlets_path)
593
594        if confusion_df is not None and confusion_mat is not None:
595            seqlet_confusion_path = os.path.join(out_dir, "seqlet_confusion.tsv")
596            data_io.write_seqlet_confusion_df(confusion_df, seqlet_confusion_path)
597
598            visualization.plot_hit_vs_seqlet_counts(report_data, out_dir)
599            visualization.plot_seqlet_confusion_heatmap(
600                confusion_mat, motif_names_list, out_dir
601            )
602
603    report_path = os.path.join(out_dir, "report.html")
604    visualization.write_report(
605        report_df, motif_names_list, report_path, compute_recall, seqlets_df is not None
606    )

Generate comprehensive HTML report with statistics and visualizations.

This function creates detailed analysis reports comparing Fi-NeMo hit calling results with TF-MoDISco seqlets, including performance metrics, distribution plots, and motif visualization. The report provides insights into hit calling quality and motif discovery accuracy.

Parameters
  • regions_path (str): Path to NPZ file containing the same regions used for hit calling.
  • hits_dir (str): Path to directory containing Fi-NeMo hit calling outputs.
  • modisco_h5_path (str, optional): Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
  • peaks_path (str, optional): DEPRECATED. Peak coordinates should be included in regions file.
  • motifs_include_path (str, optional): DEPRECATED. This information is inferred from hit calling outputs.
  • motif_names_path (str, optional): DEPRECATED. This information is inferred from hit calling outputs.
  • out_dir (str): Output directory for report files.
  • modisco_region_width (int, default 400): Width of regions used by TF-MoDISco (needed for coordinate conversion).
  • cwm_trim_threshold (float, default 0.3): DEPRECATED. This information is inferred from hit calling outputs.
  • compute_recall (bool, default True): Whether to compute recall metrics against TF-MoDISco seqlets.
  • use_seqlets (bool, default True): Whether to include seqlet-based comparisons in the report.
Notes

The generated report includes:

  • Hit vs seqlet count comparisons
  • Motif CWM visualizations
  • Hit statistic distributions
  • Co-occurrence heatmaps
  • Confusion matrices for overlapping motifs
def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None:
609def collapse_hits(hits_path: str, out_path: str, overlap_frac: float = 0.2) -> None:
610    """Collapse overlapping hits by selecting the best hit per overlapping group.
611
612    This function processes a set of motif hits and identifies overlapping hits,
613    keeping only the hit with the highest similarity score within each overlapping
614    group. This reduces redundancy in hit calls while preserving the most confident
615    predictions.
616
617    Parameters
618    ----------
619    hits_path : str
620        Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
621    out_path : str
622        Path to output TSV file with additional 'is_primary' column.
623    overlap_frac : float, default 0.2
624        Minimum fractional overlap for considering hits as overlapping.
625        For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
626
627    Notes
628    -----
629    The algorithm uses a sweep line approach with a heap data structure to
630    efficiently identify overlapping intervals and select the best hit based
631    on similarity scores.
632    """
633    from . import postprocessing
634
635    hits_df = data_io.load_hits(hits_path, lazy=False)
636    hits_collapsed_df = postprocessing.collapse_hits(hits_df, overlap_frac)
637
638    data_io.write_hits_processed(
639        hits_collapsed_df, out_path, schema=data_io.HITS_COLLAPSED_DTYPES
640    )

Collapse overlapping hits by selecting the best hit per overlapping group.

This function processes a set of motif hits and identifies overlapping hits, keeping only the hit with the highest similarity score within each overlapping group. This reduces redundancy in hit calls while preserving the most confident predictions.

Parameters
  • hits_path (str): Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
  • out_path (str): Path to output TSV file with additional 'is_primary' column.
  • overlap_frac (float, default 0.2): Minimum fractional overlap for considering hits as overlapping. For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
Notes

The algorithm uses a sweep line approach with a heap data structure to efficiently identify overlapping intervals and select the best hit based on similarity scores.

def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None:
643def intersect_hits(hits_paths: List[str], out_path: str, relaxed: bool = False) -> None:
644    """Find intersection of hits across multiple Fi-NeMo runs.
645
646    This function identifies motif hits that are consistently called across
647    multiple independent runs, providing a way to assess reproducibility and
648    identify high-confidence hits.
649
650    Parameters
651    ----------
652    hits_paths : List[str]
653        List of paths to input TSV files from different runs.
654    out_path : str
655        Path to output TSV file containing intersection results.
656        Duplicate columns are suffixed with run index.
657    relaxed : bool, default False
658        If True, uses relaxed intersection criteria based only on motif names
659        and untrimmed coordinates. If False, assumes consistent region definitions
660        and motif trimming across runs.
661
662    Notes
663    -----
664    The strict intersection mode requires consistent input regions and motif
665    processing parameters across all runs. The relaxed mode is more permissive
666    but may not be suitable when genomic coordinates are unavailable.
667    """
668    from . import postprocessing
669
670    hits_dfs = [data_io.load_hits(hits_path, lazy=False) for hits_path in hits_paths]
671    hits_df = postprocessing.intersect_hits(hits_dfs, relaxed)
672
673    data_io.write_hits_processed(hits_df, out_path, schema=None)

Find intersection of hits across multiple Fi-NeMo runs.

This function identifies motif hits that are consistently called across multiple independent runs, providing a way to assess reproducibility and identify high-confidence hits.

Parameters
  • hits_paths (List[str]): List of paths to input TSV files from different runs.
  • out_path (str): Path to output TSV file containing intersection results. Duplicate columns are suffixed with run index.
  • relaxed (bool, default False): If True, uses relaxed intersection criteria based only on motif names and untrimmed coordinates. If False, assumes consistent region definitions and motif trimming across runs.
Notes

The strict intersection mode requires consistent input regions and motif processing parameters across all runs. The relaxed mode is more permissive but may not be suitable when genomic coordinates are unavailable.

def cli() -> None:
 676def cli() -> None:
 677    """Command-line interface for the Fi-NeMo motif instance calling pipeline.
 678
 679    This function provides the main entry point for all Fi-NeMo operations including:
 680    - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
 681    - Motif hit calling using the Fi-NeMo algorithm
 682    - Report generation and visualization
 683    - Post-processing operations (hit collapsing, intersection)
 684    """
 685    parser = argparse.ArgumentParser()
 686    subparsers = parser.add_subparsers(required=True, dest="cmd")
 687
 688    extract_regions_bw_parser = subparsers.add_parser(
 689        "extract-regions-bw",
 690        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 691        help="Extract sequences and contributions from FASTA and bigwig files.",
 692    )
 693
 694    extract_regions_bw_parser.add_argument(
 695        "-p",
 696        "--peaks",
 697        type=str,
 698        required=True,
 699        help="A peak regions file in ENCODE NarrowPeak format.",
 700    )
 701    extract_regions_bw_parser.add_argument(
 702        "-C",
 703        "--chrom-order",
 704        type=str,
 705        default=None,
 706        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 707    )
 708    extract_regions_bw_parser.add_argument(
 709        "-f",
 710        "--fasta",
 711        type=str,
 712        required=True,
 713        help="A genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.",
 714    )
 715    extract_regions_bw_parser.add_argument(
 716        "-b",
 717        "--bigwigs",
 718        type=str,
 719        required=True,
 720        nargs="+",
 721        help="One or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 722    )
 723
 724    extract_regions_bw_parser.add_argument(
 725        "-o",
 726        "--out-path",
 727        type=str,
 728        required=True,
 729        help="The path to the output .npz file.",
 730    )
 731
 732    extract_regions_bw_parser.add_argument(
 733        "-w",
 734        "--region-width",
 735        type=int,
 736        default=inspect.signature(extract_regions_bw)
 737        .parameters["region_width"]
 738        .default,
 739        help="The width of the input region centered around each peak summit.",
 740    )
 741
 742    extract_chrombpnet_regions_h5_parser = subparsers.add_parser(
 743        "extract-regions-chrombpnet-h5",
 744        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 745        help="Extract sequences and contributions from ChromBPNet contributions H5 files.",
 746    )
 747
 748    extract_chrombpnet_regions_h5_parser.add_argument(
 749        "-p",
 750        "--peaks",
 751        type=str,
 752        default=None,
 753        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 754    )
 755    extract_chrombpnet_regions_h5_parser.add_argument(
 756        "-C",
 757        "--chrom-order",
 758        type=str,
 759        default=None,
 760        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 761    )
 762
 763    extract_chrombpnet_regions_h5_parser.add_argument(
 764        "-c",
 765        "--h5s",
 766        type=str,
 767        required=True,
 768        nargs="+",
 769        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 770    )
 771
 772    extract_chrombpnet_regions_h5_parser.add_argument(
 773        "-o",
 774        "--out-path",
 775        type=str,
 776        required=True,
 777        help="The path to the output .npz file.",
 778    )
 779
 780    extract_chrombpnet_regions_h5_parser.add_argument(
 781        "-w",
 782        "--region-width",
 783        type=int,
 784        default=inspect.signature(extract_regions_chrombpnet_h5)
 785        .parameters["region_width"]
 786        .default,
 787        help="The width of the input region centered around each peak summit.",
 788    )
 789
 790    extract_regions_h5_parser = subparsers.add_parser(
 791        "extract-regions-h5",
 792        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 793        help="Extract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.",
 794    )
 795
 796    extract_regions_h5_parser.add_argument(
 797        "-p",
 798        "--peaks",
 799        type=str,
 800        default=None,
 801        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 802    )
 803    extract_regions_h5_parser.add_argument(
 804        "-C",
 805        "--chrom-order",
 806        type=str,
 807        default=None,
 808        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 809    )
 810
 811    extract_regions_h5_parser.add_argument(
 812        "-c",
 813        "--h5s",
 814        type=str,
 815        required=True,
 816        nargs="+",
 817        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 818    )
 819
 820    extract_regions_h5_parser.add_argument(
 821        "-o",
 822        "--out-path",
 823        type=str,
 824        required=True,
 825        help="The path to the output .npz file.",
 826    )
 827
 828    extract_regions_h5_parser.add_argument(
 829        "-w",
 830        "--region-width",
 831        type=int,
 832        default=inspect.signature(extract_regions_chrombpnet_h5)
 833        .parameters["region_width"]
 834        .default,
 835        help="The width of the input region centered around each peak summit.",
 836    )
 837
 838    extract_bpnet_regions_h5_parser = subparsers.add_parser(
 839        "extract-regions-bpnet-h5",
 840        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 841        help="Extract sequences and contributions from BPNet contributions H5 files.",
 842    )
 843
 844    extract_bpnet_regions_h5_parser.add_argument(
 845        "-p",
 846        "--peaks",
 847        type=str,
 848        default=None,
 849        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 850    )
 851    extract_bpnet_regions_h5_parser.add_argument(
 852        "-C",
 853        "--chrom-order",
 854        type=str,
 855        default=None,
 856        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 857    )
 858
 859    extract_bpnet_regions_h5_parser.add_argument(
 860        "-c",
 861        "--h5s",
 862        type=str,
 863        required=True,
 864        nargs="+",
 865        help="One or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 866    )
 867
 868    extract_bpnet_regions_h5_parser.add_argument(
 869        "-o",
 870        "--out-path",
 871        type=str,
 872        required=True,
 873        help="The path to the output .npz file.",
 874    )
 875
 876    extract_bpnet_regions_h5_parser.add_argument(
 877        "-w",
 878        "--region-width",
 879        type=int,
 880        default=inspect.signature(extract_regions_bpnet_h5)
 881        .parameters["region_width"]
 882        .default,
 883        help="The width of the input region centered around each peak summit.",
 884    )
 885
 886    extract_regions_modisco_fmt_parser = subparsers.add_parser(
 887        "extract-regions-modisco-fmt",
 888        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 889        help="Extract sequences and contributions from tfmodisco-lite input files.",
 890    )
 891
 892    extract_regions_modisco_fmt_parser.add_argument(
 893        "-p",
 894        "--peaks",
 895        type=str,
 896        default=None,
 897        help="A peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.",
 898    )
 899    extract_regions_modisco_fmt_parser.add_argument(
 900        "-C",
 901        "--chrom-order",
 902        type=str,
 903        default=None,
 904        help="A tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.",
 905    )
 906
 907    extract_regions_modisco_fmt_parser.add_argument(
 908        "-s",
 909        "--sequences",
 910        type=str,
 911        required=True,
 912        help="A .npy or .npz file containing one-hot encoded sequences.",
 913    )
 914
 915    extract_regions_modisco_fmt_parser.add_argument(
 916        "-a",
 917        "--attributions",
 918        type=str,
 919        required=True,
 920        nargs="+",
 921        help="One or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.",
 922    )
 923
 924    extract_regions_modisco_fmt_parser.add_argument(
 925        "-o",
 926        "--out-path",
 927        type=str,
 928        required=True,
 929        help="The path to the output .npz file.",
 930    )
 931
 932    extract_regions_modisco_fmt_parser.add_argument(
 933        "-w",
 934        "--region-width",
 935        type=int,
 936        default=inspect.signature(extract_regions_modisco_fmt)
 937        .parameters["region_width"]
 938        .default,
 939        help="The width of the input region centered around each peak summit.",
 940    )
 941
 942    call_hits_parser = subparsers.add_parser(
 943        "call-hits",
 944        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 945        help="Call hits on provided sequences, contributions, and motif CWM's.",
 946    )
 947
 948    call_hits_parser.add_argument(
 949        "-M",
 950        "--mode",
 951        type=str,
 952        default=inspect.signature(call_hits).parameters["mode"].default,
 953        choices={"pp", "ph", "hp", "hh"},
 954        help="The type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.",
 955    )
 956
 957    call_hits_parser.add_argument(
 958        "-r",
 959        "--regions",
 960        type=str,
 961        required=True,
 962        help="A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.",
 963    )
 964    call_hits_parser.add_argument(
 965        "-m",
 966        "--modisco-h5",
 967        type=str,
 968        required=True,
 969        help="A tfmodisco-lite output H5 file of motif patterns.",
 970    )
 971
 972    call_hits_parser.add_argument(
 973        "-p",
 974        "--peaks",
 975        type=str,
 976        default=None,
 977        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 978    )
 979    call_hits_parser.add_argument(
 980        "-C",
 981        "--chrom-order",
 982        type=str,
 983        default=None,
 984        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
 985    )
 986
 987    call_hits_parser.add_argument(
 988        "-I",
 989        "--motifs-include",
 990        type=str,
 991        default=None,
 992        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.",
 993    )
 994    call_hits_parser.add_argument(
 995        "-N",
 996        "--motif-names",
 997        type=str,
 998        default=None,
 999        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.",
1000    )
1001
1002    call_hits_parser.add_argument(
1003        "-o",
1004        "--out-dir",
1005        type=str,
1006        required=True,
1007        help="The path to the output directory.",
1008    )
1009
1010    call_hits_parser.add_argument(
1011        "-t",
1012        "--cwm-trim-threshold",
1013        type=float,
1014        default=inspect.signature(call_hits)
1015        .parameters["cwm_trim_threshold_default"]
1016        .default,
1017        help="The default threshold to determine motif start and end positions within the full CWMs.",
1018    )
1019    call_hits_parser.add_argument(
1020        "-T",
1021        "--cwm-trim-thresholds",
1022        type=str,
1023        default=None,
1024        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.",
1025    )
1026    call_hits_parser.add_argument(
1027        "-R",
1028        "--cwm-trim-coords",
1029        type=str,
1030        default=None,
1031        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.",
1032    )
1033
1034    call_hits_parser.add_argument(
1035        "-l",
1036        "--global-lambda",
1037        type=float,
1038        default=inspect.signature(call_hits).parameters["lambda_default"].default,
1039        help="The default L1 regularization weight determining the sparsity of hits.",
1040    )
1041    call_hits_parser.add_argument(
1042        "-L",
1043        "--motif-lambdas",
1044        type=str,
1045        default=None,
1046        help="A tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.",
1047    )
1048    call_hits_parser.add_argument(
1049        "-a",
1050        "--alpha",
1051        type=float,
1052        default=None,
1053        help="DEPRECATED: Please use the `--lambda` argument instead.",
1054    )
1055    call_hits_parser.add_argument(
1056        "-A",
1057        "--motif-alphas",
1058        type=str,
1059        default=None,
1060        help="DEPRECATED: Please use the `--motif-lambdas` argument instead.",
1061    )
1062
1063    call_hits_parser.add_argument(
1064        "-f",
1065        "--no-post-filter",
1066        action="store_true",
1067        help="Do not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.",
1068    )
1069    call_hits_parser.add_argument(
1070        "-q",
1071        "--sqrt-transform",
1072        action="store_true",
1073        help="Apply a signed square root transform to the input contributions and CWMs before hit calling.",
1074    )
1075    call_hits_parser.add_argument(
1076        "-s",
1077        "--step-size-max",
1078        type=float,
1079        default=inspect.signature(call_hits).parameters["step_size_max"].default,
1080        help="The maximum optimizer step size.",
1081    )
1082    call_hits_parser.add_argument(
1083        "-i",
1084        "--step-size-min",
1085        type=float,
1086        default=inspect.signature(call_hits).parameters["step_size_min"].default,
1087        help="The minimum optimizer step size.",
1088    )
1089    call_hits_parser.add_argument(
1090        "-j",
1091        "--step-adjust",
1092        type=float,
1093        default=inspect.signature(call_hits).parameters["step_adjust"].default,
1094        help="The optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factor",
1095    )
1096    call_hits_parser.add_argument(
1097        "-c",
1098        "--convergence-tol",
1099        type=float,
1100        default=inspect.signature(call_hits).parameters["convergence_tol"].default,
1101        help="The tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.",
1102    )
1103    call_hits_parser.add_argument(
1104        "-S",
1105        "--max-steps",
1106        type=int,
1107        default=inspect.signature(call_hits).parameters["max_steps"].default,
1108        help="The maximum number of optimization steps.",
1109    )
1110    call_hits_parser.add_argument(
1111        "-b",
1112        "--batch-size",
1113        type=int,
1114        default=inspect.signature(call_hits).parameters["batch_size"].default,
1115        help="The batch size used for optimization.",
1116    )
1117    call_hits_parser.add_argument(
1118        "-d",
1119        "--device",
1120        type=str,
1121        default=None,
1122        help="DEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.",
1123    )
1124    call_hits_parser.add_argument(
1125        "-J",
1126        "--compile",
1127        action="store_true",
1128        help="JIT-compile the optimizer for faster performance. This may not be supported on older GPUs.",
1129    )
1130
1131    report_parser = subparsers.add_parser(
1132        "report",
1133        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1134        help="Generate statistics and visualizations from hits and tfmodisco-lite motif data.",
1135    )
1136
1137    report_parser.add_argument(
1138        "-r",
1139        "--regions",
1140        type=str,
1141        required=True,
1142        help="A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.",
1143    )
1144    report_parser.add_argument(
1145        "-H",
1146        "--hits",
1147        type=str,
1148        required=True,
1149        help="The output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.",
1150    )
1151    report_parser.add_argument(
1152        "-p",
1153        "--peaks",
1154        type=str,
1155        default=None,
1156        help="DEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.",
1157    )
1158    report_parser.add_argument(
1159        "-m",
1160        "--modisco-h5",
1161        type=str,
1162        default=None,
1163        help="The tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.",
1164    )
1165    report_parser.add_argument(
1166        "-I",
1167        "--motifs-include",
1168        type=str,
1169        default=None,
1170        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1171    )
1172    report_parser.add_argument(
1173        "-N",
1174        "--motif-names",
1175        type=str,
1176        default=None,
1177        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1178    )
1179
1180    report_parser.add_argument(
1181        "-o",
1182        "--out-dir",
1183        type=str,
1184        required=True,
1185        help="The path to the report output directory.",
1186    )
1187
1188    report_parser.add_argument(
1189        "-W",
1190        "--modisco-region-width",
1191        type=int,
1192        default=inspect.signature(report).parameters["modisco_region_width"].default,
1193        help="The width of the region around each peak summit used by tfmodisco-lite.",
1194    )
1195    report_parser.add_argument(
1196        "-t",
1197        "--cwm-trim-threshold",
1198        type=float,
1199        default=inspect.signature(report).parameters["cwm_trim_threshold"].default,
1200        help="DEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.",
1201    )
1202    report_parser.add_argument(
1203        "-n",
1204        "--no-recall",
1205        action="store_true",
1206        help="Do not compute motif recall metrics.",
1207    )
1208    report_parser.add_argument(
1209        "-s",
1210        "--no-seqlets",
1211        action="store_true",
1212        help="DEPRECATED: Please omit the `--modisco-h5` argument instead.",
1213    )
1214
1215    collapse_hits_parser = subparsers.add_parser(
1216        "collapse-hits",
1217        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1218        help="Identify best hit by motif similarity among sets of overlapping hits.",
1219    )
1220
1221    collapse_hits_parser.add_argument(
1222        "-i",
1223        "--hits",
1224        type=str,
1225        required=True,
1226        help="The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.",
1227    )
1228    collapse_hits_parser.add_argument(
1229        "-o",
1230        "--out-path",
1231        type=str,
1232        required=True,
1233        help='The path to the output .tsv file with an additional "is_primary" column.',
1234    )
1235    collapse_hits_parser.add_argument(
1236        "-O",
1237        "--overlap-frac",
1238        type=float,
1239        default=inspect.signature(collapse_hits).parameters["overlap_frac"].default,
1240        help="The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.",
1241    )
1242
1243    intersect_hits_parser = subparsers.add_parser(
1244        "intersect-hits",
1245        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1246        help="Intersect hits across multiple runs.",
1247    )
1248
1249    intersect_hits_parser.add_argument(
1250        "-i",
1251        "--hits",
1252        type=str,
1253        required=True,
1254        nargs="+",
1255        help="One or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.",
1256    )
1257    intersect_hits_parser.add_argument(
1258        "-o",
1259        "--out-path",
1260        type=str,
1261        required=True,
1262        help="The path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.",
1263    )
1264    intersect_hits_parser.add_argument(
1265        "-r",
1266        "--relaxed",
1267        action="store_true",
1268        help="Use relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.",
1269    )
1270
1271    args = parser.parse_args()
1272
1273    if args.cmd == "extract-regions-bw":
1274        extract_regions_bw(
1275            args.peaks,
1276            args.chrom_order,
1277            args.fasta,
1278            args.bigwigs,
1279            args.out_path,
1280            args.region_width,
1281        )
1282
1283    elif args.cmd == "extract-regions-chrombpnet-h5":
1284        extract_regions_chrombpnet_h5(
1285            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1286        )
1287
1288    elif args.cmd == "extract-regions-h5":
1289        print(
1290            "WARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead."
1291        )
1292        extract_regions_chrombpnet_h5(
1293            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1294        )
1295
1296    elif args.cmd == "extract-regions-bpnet-h5":
1297        extract_regions_bpnet_h5(
1298            args.peaks, args.chrom_order, args.h5s, args.out_path, args.region_width
1299        )
1300
1301    elif args.cmd == "extract-regions-modisco-fmt":
1302        extract_regions_modisco_fmt(
1303            args.peaks,
1304            args.chrom_order,
1305            args.attributions,
1306            args.sequences,
1307            args.out_path,
1308            args.region_width,
1309        )
1310
1311    elif args.cmd == "call-hits":
1312        if args.alpha is not None:
1313            warnings.warn(
1314                "The `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead."
1315            )
1316            args.global_lambda = args.alpha
1317        if args.motif_alphas is not None:
1318            warnings.warn(
1319                "The `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead."
1320            )
1321            args.motif_lambdas = args.motif_alphas
1322
1323        call_hits(
1324            args.regions,
1325            args.peaks,
1326            args.modisco_h5,
1327            args.chrom_order,
1328            args.motifs_include,
1329            args.motif_names,
1330            args.motif_lambdas,
1331            args.out_dir,
1332            args.cwm_trim_coords,
1333            args.cwm_trim_thresholds,
1334            args.cwm_trim_threshold,
1335            args.global_lambda,
1336            args.step_size_max,
1337            args.step_size_min,
1338            args.sqrt_transform,
1339            args.convergence_tol,
1340            args.max_steps,
1341            args.batch_size,
1342            args.step_adjust,
1343            args.device,
1344            args.mode,
1345            args.no_post_filter,
1346            args.compile,
1347        )
1348
1349    elif args.cmd == "report":
1350        report(
1351            args.regions,
1352            args.hits,
1353            args.modisco_h5,
1354            args.peaks,
1355            args.motifs_include,
1356            args.motif_names,
1357            args.out_dir,
1358            args.modisco_region_width,
1359            args.cwm_trim_threshold,
1360            not args.no_recall,
1361            not args.no_seqlets,
1362        )
1363
1364    elif args.cmd == "collapse-hits":
1365        collapse_hits(args.hits, args.out_path, args.overlap_frac)
1366
1367    elif args.cmd == "intersect-hits":
1368        intersect_hits(args.hits, args.out_path, args.relaxed)

Command-line interface for the Fi-NeMo motif instance calling pipeline.

This function provides the main entry point for all Fi-NeMo operations including:

  • Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
  • Motif hit calling using the Fi-NeMo algorithm
  • Report generation and visualization
  • Post-processing operations (hit collapsing, intersection)