algo-string-01

1.000

14/14 tests· algorithms

Challenge · difficulty 4/5

# Longest k-repeated substring

Implement a file **`solution.py`** containing a function `longest_k_repeated`:

```python
def longest_k_repeated(s: str, k: int) -> int:
    """Return the length of the longest substring of `s` that occurs
    at least `k` times."""
```

Given a string `s` and an integer `k >= 1`, return the **length of the longest
non-empty substring** of `s` that occurs **at least `k` times** in `s`. If no
non-empty substring occurs at least `k` times, return `0`.

## What counts as an occurrence

- An occurrence is a **distinct starting position** in `s`. A substring of
  length `L` occurs at position `i` iff `s[i:i+L]` equals it.
- **Overlaps are allowed.** For example, in `"aaaa"` the substring `"aaa"`
  occurs at positions `0` and `1`, so it occurs **2** times.
- The string is compared **exactly**, character by character; matching is
  **case-sensitive** and works over arbitrary Unicode characters.

## Precise definition

Return the largest `L >= 1` such that there exists a string `w` of length `L`
for which the number of indices `i` with `s[i:i+L] == w` is `>= k`. If no such
`L` exists, return `0`.

## Edge cases

- `k == 1`: every non-empty substring occurs at least once, so for a non-empty
  `s` the answer is `len(s)` (the whole string occurs once). For the empty
  string the answer is `0`.
- The empty string returns `0` for **every** `k`.
- If `k` exceeds the length of the longest single-character run and the string
  has no repeats at all (e.g. all-distinct characters), smaller substrings may
  still fail the threshold — return `0` when nothing qualifies.

## Worked examples

```python
assert longest_k_repeated("banana", 2) == 3    # "ana" occurs at 1 and 3 (overlap)
assert longest_k_repeated("banana", 3) == 1    # only single chars occur >= 3 times
assert longest_k_repeated("banana", 4) == 0
assert longest_k_repeated("aaa", 2) == 2       # "aa" at 0 and 1
assert longest_k_repeated("aaa", 3) == 1       # "a" x3
assert longest_k_repeated("abcabc", 1) == 6    # whole string, k==1
assert longest_k_repeated("abcdef", 2) == 0    # all distinct, nothing repeats
assert longest_k_repeated("", 5) == 0
```

## Efficiency

Inputs can be **large**: `len(s)` up to about `100000`. A naive approach that
enumerates every substring is `O(n^2)` in time and memory and will time out.
Aim for roughly `O(n)` or `O(n log n)`. (A suffix automaton, or a suffix array
with LCP, or binary-search-plus-hashing all work.)

## Constraints

- `1 <= k`
- `0 <= len(s) <= 100000`
- `s` consists of arbitrary characters (tests use printable ASCII).

tests/test_longest_k_repeated.py

import random

from solution import longest_k_repeated


def brute(s, k):
    """O(n^2) reference oracle for small strings."""
    n = len(s)
    if n == 0 or k < 1:
        return 0
    for L in range(n, 0, -1):
        seen = {}
        for i in range(n - L + 1):
            sub = s[i:i + L]
            c = seen.get(sub, 0) + 1
            seen[sub] = c
            if c >= k:
                return L
    return 0


def test_empty_string():
    assert longest_k_repeated("", 1) == 0
    assert longest_k_repeated("", 2) == 0
    assert longest_k_repeated("", 5) == 0


def test_k_one_is_whole_string():
    assert longest_k_repeated("a", 1) == 1
    assert longest_k_repeated("abc", 1) == 3
    assert longest_k_repeated("abcabc", 1) == 6


def test_no_repeat_returns_zero():
    # All distinct characters: nothing occurs twice.
    assert longest_k_repeated("abcdef", 2) == 0
    assert longest_k_repeated("a", 2) == 0
    assert longest_k_repeated("xyz", 3) == 0


def test_single_char_runs():
    # "aaa": 'a' x3, 'aa' x2, 'aaa' x1
    assert longest_k_repeated("aaa", 1) == 3
    assert longest_k_repeated("aaa", 2) == 2
    assert longest_k_repeated("aaa", 3) == 1
    assert longest_k_repeated("aaa", 4) == 0


def test_banana():
    # classic: "ana" occurs at positions 1 and 3 (overlapping)
    assert longest_k_repeated("banana", 2) == 3
    # "a" occurs 3 times, "an"/"na" twice, "ana" twice
    assert longest_k_repeated("banana", 3) == 1
    assert longest_k_repeated("banana", 4) == 0


def test_overlapping_counts():
    # "aaaa": "aaa" occurs at 0 and 1 -> length 3 for k=2
    assert longest_k_repeated("aaaa", 2) == 3
    assert longest_k_repeated("aaaa", 3) == 2
    assert longest_k_repeated("aaaa", 4) == 1


def test_disjoint_repeat():
    # "abcXabc": "abc" occurs twice, no overlap
    assert longest_k_repeated("abcXabc", 2) == 3
    assert longest_k_repeated("abcXabc", 3) == 0


def test_mixed_case_sensitive():
    # 'A' and 'a' are different characters.
    # "AaAa": "Aa" occurs at 0 and 2 (len 2); "AaA" occurs once, "aAa" once.
    assert longest_k_repeated("AaAa", 2) == 2


def test_period_two_medium():
    s = "ab" * 50
    n = len(s)
    # periodic with period 2: s[0..n-3] == s[2..n-1]
    assert longest_k_repeated(s, 2) == n - 2


def test_matches_brute_small_random():
    rng = random.Random(1234)
    for _ in range(400):
        n = rng.randint(0, 12)
        alpha = "ab" if rng.random() < 0.5 else "abc"
        s = "".join(rng.choice(alpha) for _ in range(n))
        for k in range(1, 6):
            assert longest_k_repeated(s, k) == brute(s, k), (s, k)


def test_matches_brute_larger_alphabet():
    rng = random.Random(99)
    for _ in range(150):
        n = rng.randint(0, 40)
        s = "".join(rng.choice("abcde") for _ in range(n))
        for k in range(1, 4):
            assert longest_k_repeated(s, k) == brute(s, k), (s, k)


def test_large_all_same():
    s = "a" * 100000
    assert longest_k_repeated(s, 2) == 99999
    assert longest_k_repeated(s, 100000) == 1
    assert longest_k_repeated(s, 100001) == 0


def test_large_period_two():
    s = "ab" * 50000
    n = len(s)  # 100000
    assert longest_k_repeated(s, 2) == n - 2


def test_large_no_long_repeat():
    # Random over a 10-char alphabet: the longest repeat must be short.
    rng = random.Random(7)
    s = "".join(rng.choice("abcdefghij") for _ in range(60000))
    ans = longest_k_repeated(s, 2)
    # There must be some repeat (pigeonhole on length-1 substrings), and it
    # should be far shorter than the whole string.
    assert 1 <= ans < 1000

System prompt

You are an expert programmer. Solve the task exactly as specified. Output your solution as fenced code blocks using the required file name(s) and the exact function/type signatures requested. Prefer correctness; do not include prose outside code unless asked.

Proposed solution

Looking at this problem, I need to find the longest substring that appears at least `k` times in the string.

Given the constraints (string length up to 100,000), I need an efficient approach. The key insight is to use binary search on the answer length combined with a rolling hash technique to efficiently count substring occurrences.

Here's my approach:
1. Binary search on the possible lengths L (from 1 to len(s))
2. For each length L, check if any substring of length L appears at least k times
3. Use rolling hash (Rabin-Karp) to efficiently count occurrences of all substrings of a given length

The binary search will give us O(log n) iterations, and each iteration takes O(n) time with rolling hash, giving us O(n log n) overall.

```python
def longest_k_repeated(s: str, k: int) -> int:
    if not s or k <= 0:
        return 0
    
    n = len(s)
    
    # For k=1, the whole string occurs once, so answer is n
    if k == 1:
        return n
    
    # Prime base and modulus for rolling hash
    BASE = 31
    MOD = 10**18 + 9  # Large prime modulus
    
    def has_k_repeated_substring(length: int) -> bool:
        """Check if any substring of given length appears at least k times."""
        if length == 0:
            return False
        
        # Dictionary to count hash occurrences
        hash_count = {}
        
        # Calculate hash for first substring of this length
        current_hash = 0
        base_power = 1
        
        # Calculate BASE^(length-1) mod MOD
        for _ in range(length - 1):
            base_power = (base_power * BASE) % MOD
        
        # Calculate initial hash
        for i in range(length):
            current_hash = (current_hash * BASE + ord(s[i])) % MOD
        
        hash_count[current_hash] = 1
        
        # Rolling hash for remaining substrings
        for i in range(length, n):
            # Remove leftmost character and add new rightmost character
            left_char_val = ord(s[i - length])
            new_char_val = ord(s[i])
            
            # Remove contribution of leftmost char: subtract left_char * BASE^(length-1)
            current_hash = (current_hash - left_char_val * base_power) % MOD
            
            # Multiply by BASE and add new character
            current_hash = (current_hash * BASE + new_char_val) % MOD
            
            hash_count[current_hash] = hash_count.get(current_hash, 0) + 1
            
            if hash_count[current_hash] >= k:
                return True
        
        return False
    
    # Binary search for the maximum length
    left, right = 1, n
    result = 0
    
    while left <= right:
        mid = (left + right) // 2
        if has_k_repeated_substring(mid):
            result = mid
            left = mid + 1
        else:
            right = mid - 1
    
    return result
```

Test output (stdout)

..............                                                           [100%]
14 passed in 1.06s