# imports for the initial attempt
import heapq
import logging
# imports for the two-liner solution mentioned above
from collections import Counter, defaultdict
from typing import Callable, List
Problem Description
Difficulty: Medium
Given an integer array nums
and an integer k
, return the k most frequent elements within the array.
The test cases are generated such that the answer is always unique.
You may return the output in any order.
Example 1:
Input: nums = [1,2,2,3,3,3], k = 2
Output: [2,3]
Example 2:
Input: nums = [7,7], k = 1
Output: [7]
Initial Solution
My first thought on seeing this is that it’s such a common problem that no doubt someone has already written an efficient topK algorithm. Pandas has an nlargest()
function, maybe that would work? (not without some wrangling). Therefore, I decided I’d make a super simple algorithm without using any external libraries or googling for helpful functions.
After completing the exercise, I learned that heapq.nlargest()
is probably that helpful function you’re looking for, e.g., as shown in the ‘two-liner’ solution below. However, the most efficient answer is really nice, with O(n) time. We’ll see later…
# Set up logging config (global scope)
=logging.WARNING, format="%(levelname)s: %(message)s")
logging.basicConfig(level= logging.getLogger(__name__) logger
class InitialSolution:
def __init__(self):
self.logger = logger
def topKFrequent(self, nums: List[int], k: int) -> List[int]:
= defaultdict(int)
count_dict self.logger.debug(f"dict at initilization: {dict(count_dict)}")
for num in nums:
+= 1
count_dict[num] self.logger.debug(f"dict updated: {dict(count_dict)}")
= []
top_k_keys for _ in range(k):
= -1
highest_freq = None # Could use max() to determine this instead.
most_freq_key for key, freq in count_dict.items(): # nested loop results in O(k⋅n) time
if freq > highest_freq:
= key
most_freq_key = freq
highest_freq self.logger.debug(
f"most frequent key: {most_freq_key}, frequency: {highest_freq}"
)
top_k_keys.append(most_freq_key)
count_dict.pop(most_freq_key)self.logger.debug(
f"popped off most frequent and updated dict: {dict(count_dict)}"
)self.logger.debug(f"current top k: {top_k_keys}")
return top_k_keys
# Second attempt added here for testing:
def topKTwoLiner(self, nums: List[int], k: int) -> List[int]:
= Counter(nums)
count return heapq.nlargest(k, count, key=count.get)
# Example for testing purposes. Order of output does not matter.
= [1, 2, 2, 3, 3, 3], 2 # Output: [2,3]
nums, k = InitialSolution()
solution
logger.setLevel(logging.DEBUG) solution.topKFrequent(nums, k)
DEBUG: dict at initilization: {}
DEBUG: dict updated: {1: 1}
DEBUG: dict updated: {1: 1, 2: 1}
DEBUG: dict updated: {1: 1, 2: 2}
DEBUG: dict updated: {1: 1, 2: 2, 3: 1}
DEBUG: dict updated: {1: 1, 2: 2, 3: 2}
DEBUG: dict updated: {1: 1, 2: 2, 3: 3}
DEBUG: most frequent key: 3, frequency: 3
DEBUG: popped off most frequent and updated dict: {1: 1, 2: 2}
DEBUG: current top k: [3]
DEBUG: most frequent key: 2, frequency: 2
DEBUG: popped off most frequent and updated dict: {1: 1}
DEBUG: current top k: [3, 2]
[3, 2]
Tests
def test(fn: Callable[[List[int], int], List[int]]) -> None:
= [
test_cases # [nums], k, [expected]
1, 2, 2, 3, 3, 3], 2, [2, 3]),
([1, 1, 1, 2, 2, 3], 2, [1, 2]),
([-1, -1, -1, 2, 2, 3], 2, [-1, 2]),
([7, 7], 1, [7]),
([4, 4, 4, 6, 6, 2, 2, 2], 2, [4, 2]),
([
]
# Set logging level to suppress debug output during tests
logger.setLevel(logging.WARNING)
for nums, k, expected in test_cases:
= fn(nums, k)
output assert set(output) == set(
expectedf"Test failed for nums={nums}, k={k}. Expected {expected}, but got {output}"
), # set() ensures that the comparison is order independent
print("Tests Passed")
# Set logging level suppress debug output during tests
logger.setLevel(logging.WARNING)
= InitialSolution()
solution
test(solution.topKFrequent) test(solution.topKTwoLiner)
Tests Passed
Tests Passed
Initial Results
My intial solution passed the LeetCode/NeetCode submission. However, the runtime was super slow and beats only 6% of entries, although top 94% for memory.
We could speed it up a little by using Python’s in-built max() function to find the most frequent key, but we’d be better off with a different approach. For example, by sorting the dictionary and slicing off the top k elements of the sorted dictionary.
The solution has the following worst-case complexity:
- Time complexity: \(O(n + k⋅n)\); if \(k = n\) this becomes \(O(n^2)\).
- Space complexity: \(O(n)\) comprising \(O(n)\) for the dictionary plus \(O(k)\) to store the list of top k elements.
NeetCode Solution
class Solution:
def topKFrequent(self, nums: List[int], k: int) -> List[int]:
= {}
count = [[] for i in range(len(nums) + 1)]
freq
for n in nums:
= 1 + count.get(n, 0)
count[n] for n, c in count.items():
freq[c].append(n)
= []
res for i in range(len(freq) - 1, 0, -1):
for n in freq[i]:
res.append(n)if len(res) == k:
return res
= Solution()
solution test(solution.topKFrequent)
Tests Passed
Conclusion
Neetcode’s solution is very nice. The key insight is the use of bucket sort, which allows it to avoid typical O(n⋅log n) sorting and instead achieves linear time complexity. Note that freq has a length of len(nums) + 1, to accommodate frequencies from 0 to the maximum possible frequency (which is at most the length of the input). By using the frequency as an index into the freq list, it creates a natural ordering of elements by their frequencies without explicit sorting.
- Time complexity: \(O(n)\)
- Space complexity: \(O(n)\) comprising \(O(n + n + k)\)