algo-string-01
1.000
Challenge · difficulty 4/5
# Longest k-repeated substring
Implement a file **`solution.py`** containing a function `longest_k_repeated`:
```python
def longest_k_repeated(s: str, k: int) -> int:
"""Return the length of the longest substring of `s` that occurs
at least `k` times."""
```
Given a string `s` and an integer `k >= 1`, return the **length of the longest
non-empty substring** of `s` that occurs **at least `k` times** in `s`. If no
non-empty substring occurs at least `k` times, return `0`.
## What counts as an occurrence
- An occurrence is a **distinct starting position** in `s`. A substring of
length `L` occurs at position `i` iff `s[i:i+L]` equals it.
- **Overlaps are allowed.** For example, in `"aaaa"` the substring `"aaa"`
occurs at positions `0` and `1`, so it occurs **2** times.
- The string is compared **exactly**, character by character; matching is
**case-sensitive** and works over arbitrary Unicode characters.
## Precise definition
Return the largest `L >= 1` such that there exists a string `w` of length `L`
for which the number of indices `i` with `s[i:i+L] == w` is `>= k`. If no such
`L` exists, return `0`.
## Edge cases
- `k == 1`: every non-empty substring occurs at least once, so for a non-empty
`s` the answer is `len(s)` (the whole string occurs once). For the empty
string the answer is `0`.
- The empty string returns `0` for **every** `k`.
- If `k` exceeds the length of the longest single-character run and the string
has no repeats at all (e.g. all-distinct characters), smaller substrings may
still fail the threshold — return `0` when nothing qualifies.
## Worked examples
```python
assert longest_k_repeated("banana", 2) == 3 # "ana" occurs at 1 and 3 (overlap)
assert longest_k_repeated("banana", 3) == 1 # only single chars occur >= 3 times
assert longest_k_repeated("banana", 4) == 0
assert longest_k_repeated("aaa", 2) == 2 # "aa" at 0 and 1
assert longest_k_repeated("aaa", 3) == 1 # "a" x3
assert longest_k_repeated("abcabc", 1) == 6 # whole string, k==1
assert longest_k_repeated("abcdef", 2) == 0 # all distinct, nothing repeats
assert longest_k_repeated("", 5) == 0
```
## Efficiency
Inputs can be **large**: `len(s)` up to about `100000`. A naive approach that
enumerates every substring is `O(n^2)` in time and memory and will time out.
Aim for roughly `O(n)` or `O(n log n)`. (A suffix automaton, or a suffix array
with LCP, or binary-search-plus-hashing all work.)
## Constraints
- `1 <= k`
- `0 <= len(s) <= 100000`
- `s` consists of arbitrary characters (tests use printable ASCII).tests/test_longest_k_repeated.py
import random
from solution import longest_k_repeated
def brute(s, k):
"""O(n^2) reference oracle for small strings."""
n = len(s)
if n == 0 or k < 1:
return 0
for L in range(n, 0, -1):
seen = {}
for i in range(n - L + 1):
sub = s[i:i + L]
c = seen.get(sub, 0) + 1
seen[sub] = c
if c >= k:
return L
return 0
def test_empty_string():
assert longest_k_repeated("", 1) == 0
assert longest_k_repeated("", 2) == 0
assert longest_k_repeated("", 5) == 0
def test_k_one_is_whole_string():
assert longest_k_repeated("a", 1) == 1
assert longest_k_repeated("abc", 1) == 3
assert longest_k_repeated("abcabc", 1) == 6
def test_no_repeat_returns_zero():
# All distinct characters: nothing occurs twice.
assert longest_k_repeated("abcdef", 2) == 0
assert longest_k_repeated("a", 2) == 0
assert longest_k_repeated("xyz", 3) == 0
def test_single_char_runs():
# "aaa": 'a' x3, 'aa' x2, 'aaa' x1
assert longest_k_repeated("aaa", 1) == 3
assert longest_k_repeated("aaa", 2) == 2
assert longest_k_repeated("aaa", 3) == 1
assert longest_k_repeated("aaa", 4) == 0
def test_banana():
# classic: "ana" occurs at positions 1 and 3 (overlapping)
assert longest_k_repeated("banana", 2) == 3
# "a" occurs 3 times, "an"/"na" twice, "ana" twice
assert longest_k_repeated("banana", 3) == 1
assert longest_k_repeated("banana", 4) == 0
def test_overlapping_counts():
# "aaaa": "aaa" occurs at 0 and 1 -> length 3 for k=2
assert longest_k_repeated("aaaa", 2) == 3
assert longest_k_repeated("aaaa", 3) == 2
assert longest_k_repeated("aaaa", 4) == 1
def test_disjoint_repeat():
# "abcXabc": "abc" occurs twice, no overlap
assert longest_k_repeated("abcXabc", 2) == 3
assert longest_k_repeated("abcXabc", 3) == 0
def test_mixed_case_sensitive():
# 'A' and 'a' are different characters.
# "AaAa": "Aa" occurs at 0 and 2 (len 2); "AaA" occurs once, "aAa" once.
assert longest_k_repeated("AaAa", 2) == 2
def test_period_two_medium():
s = "ab" * 50
n = len(s)
# periodic with period 2: s[0..n-3] == s[2..n-1]
assert longest_k_repeated(s, 2) == n - 2
def test_matches_brute_small_random():
rng = random.Random(1234)
for _ in range(400):
n = rng.randint(0, 12)
alpha = "ab" if rng.random() < 0.5 else "abc"
s = "".join(rng.choice(alpha) for _ in range(n))
for k in range(1, 6):
assert longest_k_repeated(s, k) == brute(s, k), (s, k)
def test_matches_brute_larger_alphabet():
rng = random.Random(99)
for _ in range(150):
n = rng.randint(0, 40)
s = "".join(rng.choice("abcde") for _ in range(n))
for k in range(1, 4):
assert longest_k_repeated(s, k) == brute(s, k), (s, k)
def test_large_all_same():
s = "a" * 100000
assert longest_k_repeated(s, 2) == 99999
assert longest_k_repeated(s, 100000) == 1
assert longest_k_repeated(s, 100001) == 0
def test_large_period_two():
s = "ab" * 50000
n = len(s) # 100000
assert longest_k_repeated(s, 2) == n - 2
def test_large_no_long_repeat():
# Random over a 10-char alphabet: the longest repeat must be short.
rng = random.Random(7)
s = "".join(rng.choice("abcdefghij") for _ in range(60000))
ans = longest_k_repeated(s, 2)
# There must be some repeat (pigeonhole on length-1 substrings), and it
# should be far shorter than the whole string.
assert 1 <= ans < 1000
System prompt
You are an expert programmer. Solve the task exactly as specified. Output your solution as fenced code blocks using the required file name(s) and the exact function/type signatures requested. Prefer correctness; do not include prose outside code unless asked.
Proposed solution
Looking at this problem, I need to find the longest substring that appears at least `k` times in the string.
Given the constraints (string length up to 100,000), I need an efficient approach. The key insight is to use binary search on the answer length combined with a rolling hash technique to efficiently count substring occurrences.
Here's my approach:
1. Binary search on the possible lengths L (from 1 to len(s))
2. For each length L, check if any substring of length L appears at least k times
3. Use rolling hash (Rabin-Karp) to efficiently count occurrences of all substrings of a given length
The binary search will give us O(log n) iterations, and each iteration takes O(n) time with rolling hash, giving us O(n log n) overall.
```python
def longest_k_repeated(s: str, k: int) -> int:
if not s or k <= 0:
return 0
n = len(s)
# For k=1, the whole string occurs once, so answer is n
if k == 1:
return n
# Prime base and modulus for rolling hash
BASE = 31
MOD = 10**18 + 9 # Large prime modulus
def has_k_repeated_substring(length: int) -> bool:
"""Check if any substring of given length appears at least k times."""
if length == 0:
return False
# Dictionary to count hash occurrences
hash_count = {}
# Calculate hash for first substring of this length
current_hash = 0
base_power = 1
# Calculate BASE^(length-1) mod MOD
for _ in range(length - 1):
base_power = (base_power * BASE) % MOD
# Calculate initial hash
for i in range(length):
current_hash = (current_hash * BASE + ord(s[i])) % MOD
hash_count[current_hash] = 1
# Rolling hash for remaining substrings
for i in range(length, n):
# Remove leftmost character and add new rightmost character
left_char_val = ord(s[i - length])
new_char_val = ord(s[i])
# Remove contribution of leftmost char: subtract left_char * BASE^(length-1)
current_hash = (current_hash - left_char_val * base_power) % MOD
# Multiply by BASE and add new character
current_hash = (current_hash * BASE + new_char_val) % MOD
hash_count[current_hash] = hash_count.get(current_hash, 0) + 1
if hash_count[current_hash] >= k:
return True
return False
# Binary search for the maximum length
left, right = 1, n
result = 0
while left <= right:
mid = (left + right) // 2
if has_k_repeated_substring(mid):
result = mid
left = mid + 1
else:
right = mid - 1
return result
```Test output (stdout)
.............. [100%] 14 passed in 1.06s