Source code for poolparty.scan_ops.subseq_scan

"""Subsequence scan operation - extract subsequences at scanning positions."""

from numbers import Integral, Real

from ..pool import Pool
from ..types import CardsType, ModeType, Optional, PositionsType, RegionType, Union, beartype
from ..utils import validate_positions


@beartype
def subseq_scan(
    pool: Union[Pool, str],
    subseq_length: Integral,
    positions: PositionsType = None,
    region: RegionType = None,
    prefix: Optional[str] = None,
    mode: ModeType = "random",
    num_states: Optional[Integral] = None,
    iter_order: Optional[Real] = None,
    cards: CardsType = None,
    _factory_name: Optional[str] = "subseq_scan",
) -> Pool:
    """
    Extract fixed-length subsequences at scanning positions.

    A region of ``subseq_length`` is scanned across the pool (or across a
    sub-region of it) and the content of that region is returned, giving one
    subsequence per allowed start position.

    Parameters
    ----------
    pool : Union[Pool, str]
        Parent pool, or a sequence string that is first wrapped with
        ``from_seq``.
    subseq_length : Integral
        Length of the subsequence extracted at each position.
    positions : PositionsType, default=None
        Candidate start positions for extraction (0-based). If None, all
        valid positions are used.
    region : RegionType, default=None
        Region that constrains the scan. Either a marker name or a
        ``[start, stop]`` interval. When given, positions are relative to
        the region start.
    prefix : Optional[str], default=None
        Prefix for sequence names in the resulting Pool.
    mode : ModeType, default='random'
        Selection mode: 'random' or 'sequential'.
    num_states : Optional[Integral], default=None
        Number of states. In sequential mode, overrides the computed count
        (cycling if greater, clipping if less). In random mode, if None
        defaults to 1 (pure random sampling).
    iter_order : Optional[Real], default=None
        Iteration order priority for the Operation.
    cards : CardsType, default=None
        Design card keys to include. Available keys: ``'position_index'``,
        ``'start'``, ``'end'``, ``'name'``, ``'region_seq'``.

    Returns
    -------
    Pool
        A Pool yielding subsequences extracted at each allowed position.

    Notes
    -----
    Region handling happens here; the scan itself is delegated to
    ``_subseq_scan_impl``. A string ``region`` is extracted directly with
    ``extract_region``. Any other non-None ``region`` is read as
    ``region[0]`` / ``region[1]``, converted with ``int``, tagged under the
    temporary name ``"_subseq_scan_region"`` via ``insert_tags`` and then
    extracted.
    """
    # Imported at call time rather than at module import time.
    from ..fixed_ops.from_seq import from_seq
    from ..region_ops import extract_region, insert_tags

    # Accept a raw sequence string by wrapping it in a pool first.
    if isinstance(pool, str):
        pool = from_seq(pool, _factory_name=f"{_factory_name}(from_seq)")

    # Work out which pool the scan actually runs over.
    if region is None:
        # No region: scan the whole pool.
        scan_source = pool
    elif isinstance(region, str):
        # Named region: scan only that region's content.
        scan_source = extract_region(pool, region)
    else:
        # [start, stop] interval: tag it temporarily, then extract the
        # tagged content and scan that.
        temp_region = "_subseq_scan_region"
        tagged_pool = insert_tags(
            pool,
            region_name=temp_region,
            start=int(region[0]),
            stop=int(region[1]),
        )
        scan_source = extract_region(tagged_pool, temp_region)

    return _subseq_scan_impl(
        pool=scan_source,
        subseq_length=subseq_length,
        positions=positions,
        prefix=prefix,
        mode=mode,
        num_states=num_states,
        iter_order=iter_order,
        cards=cards,
        _factory_name=_factory_name,
    )


def _subseq_scan_impl(
    pool: Pool,
    subseq_length: Integral,
    positions: PositionsType,
    prefix: Optional[str],
    mode: ModeType,
    num_states: Optional[Integral],
    iter_order: Optional[Real] = None,
    cards: CardsType = None,
    _factory_name: Optional[str] = "subseq_scan",
) -> Pool:
    """
    Run the subsequence scan on a pool, with no region handling.

    A tag named ``"_subseq"`` of length ``subseq_length`` is scanned across
    ``pool`` with ``region_scan``, and the tagged content is then pulled out
    with ``extract_region``.

    Parameters
    ----------
    pool : Pool
        Pool to scan; its ``seq_length`` must not be None.
    subseq_length : Integral
        Length of the subsequence to extract; must satisfy
        ``0 < subseq_length <= pool.seq_length``.
    positions : PositionsType
        Candidate start positions, passed through ``validate_positions``
        with ``pool.seq_length - subseq_length`` as the maximum and 0 as the
        minimum.
    prefix : Optional[str]
        Forwarded to ``region_scan``.
    mode : ModeType
        Forwarded to ``region_scan``.
    num_states : Optional[Integral]
        Forwarded to ``region_scan``.
    iter_order : Optional[Real], default=None
        Forwarded to both ``region_scan`` and ``extract_region``.
    cards : CardsType, default=None
        Forwarded to ``region_scan``.
    _factory_name : Optional[str], default="subseq_scan"
        Base name used to build the ``region_scan`` factory name.

    Returns
    -------
    Pool
        The result of extracting the scanned ``"_subseq"`` region.

    Raises
    ------
    ValueError
        If ``pool.seq_length`` is None, if ``subseq_length`` is not
        positive, or if ``subseq_length`` exceeds ``pool.seq_length``.
    """
    # Imported at call time rather than at module import time.
    from ..region_ops import extract_region, region_scan

    # The scan window is sized against a known, fixed sequence length.
    total_length = pool.seq_length
    if total_length is None:
        raise ValueError("pool must have a defined seq_length")

    # The requested window must be non-empty and fit inside the pool.
    if subseq_length <= 0:
        raise ValueError(f"subseq_length must be > 0, got {subseq_length}")
    if subseq_length > total_length:
        raise ValueError(f"subseq_length ({subseq_length}) must be <= pool.seq_length ({total_length})")

    scan_tag = "_subseq"
    window = int(subseq_length)
    # Largest start index at which a full-length window still fits.
    last_start = total_length - subseq_length

    allowed_starts = validate_positions(positions, last_start, min_position=0)

    # Step 1: place the scan tag at each allowed start position.
    tagged = region_scan(
        pool,
        tag_name=scan_tag,
        region_length=window,
        positions=allowed_starts,
        prefix=prefix,
        mode=mode,
        num_states=num_states,
        iter_order=iter_order,
        cards=cards,
        _factory_name=f"{_factory_name}(region_scan)",
    )

    # Step 2: the tagged content is the subsequence we want to return.
    return extract_region(
        tagged,
        scan_tag,
        iter_order=iter_order,
    )