# imports for the initial attempt
import heapq
import logging
# imports for the two-liner solution mentioned above
from collections import Counter, defaultdict
from typing import Callable, ListProblem Description
Difficulty: Medium
Given an integer array nums and an integer k, return the k most frequent elements within the array.
The test cases are generated such that the answer is always unique.
You may return the output in any order.
Example 1:
Input: nums = [1,2,2,3,3,3], k = 2
Output: [2,3]
Example 2:
Input: nums = [7,7], k = 1
Output: [7]
Initial Solution
My first thought on seeing this is that it’s such a common problem that no doubt someone has already written an efficient topK algorithm. Pandas has an nlargest() function, maybe that would work? (not without some wrangling). Therefore, I decided I’d make a super simple algorithm without using any external libraries or googling for helpful functions.
After completing the exercise, I learned that heapq.nlargest() is probably that helpful function you’re looking for, e.g., as shown in the ‘two-liner’ solution below. However, the most efficient answer is really nice, with O(n) time. We’ll see later…
# Set up logging config (global scope)
logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)class InitialSolution:
def __init__(self):
self.logger = logger
def topKFrequent(self, nums: List[int], k: int) -> List[int]:
count_dict = defaultdict(int)
self.logger.debug(f"dict at initilization: {dict(count_dict)}")
for num in nums:
count_dict[num] += 1
self.logger.debug(f"dict updated: {dict(count_dict)}")
top_k_keys = []
for _ in range(k):
highest_freq = -1
most_freq_key = None # Could use max() to determine this instead.
for key, freq in count_dict.items(): # nested loop results in O(k⋅n) time
if freq > highest_freq:
most_freq_key = key
highest_freq = freq
self.logger.debug(
f"most frequent key: {most_freq_key}, frequency: {highest_freq}"
)
top_k_keys.append(most_freq_key)
count_dict.pop(most_freq_key)
self.logger.debug(
f"popped off most frequent and updated dict: {dict(count_dict)}"
)
self.logger.debug(f"current top k: {top_k_keys}")
return top_k_keys
# Second attempt added here for testing:
def topKTwoLiner(self, nums: List[int], k: int) -> List[int]:
count = Counter(nums)
return heapq.nlargest(k, count, key=count.get)# Example for testing purposes. Order of output does not matter.
nums, k = [1, 2, 2, 3, 3, 3], 2 # Output: [2,3]
solution = InitialSolution()
logger.setLevel(logging.DEBUG)
solution.topKFrequent(nums, k)DEBUG: dict at initilization: {}
DEBUG: dict updated: {1: 1}
DEBUG: dict updated: {1: 1, 2: 1}
DEBUG: dict updated: {1: 1, 2: 2}
DEBUG: dict updated: {1: 1, 2: 2, 3: 1}
DEBUG: dict updated: {1: 1, 2: 2, 3: 2}
DEBUG: dict updated: {1: 1, 2: 2, 3: 3}
DEBUG: most frequent key: 3, frequency: 3
DEBUG: popped off most frequent and updated dict: {1: 1, 2: 2}
DEBUG: current top k: [3]
DEBUG: most frequent key: 2, frequency: 2
DEBUG: popped off most frequent and updated dict: {1: 1}
DEBUG: current top k: [3, 2]
[3, 2]
Tests
def test(fn: Callable[[List[int], int], List[int]]) -> None:
test_cases = [
# [nums], k, [expected]
([1, 2, 2, 3, 3, 3], 2, [2, 3]),
([1, 1, 1, 2, 2, 3], 2, [1, 2]),
([-1, -1, -1, 2, 2, 3], 2, [-1, 2]),
([7, 7], 1, [7]),
([4, 4, 4, 6, 6, 2, 2, 2], 2, [4, 2]),
]
# Set logging level to suppress debug output during tests
logger.setLevel(logging.WARNING)
for nums, k, expected in test_cases:
output = fn(nums, k)
assert set(output) == set(
expected
), f"Test failed for nums={nums}, k={k}. Expected {expected}, but got {output}"
# set() ensures that the comparison is order independent
print("Tests Passed")# Set logging level suppress debug output during tests
logger.setLevel(logging.WARNING)
solution = InitialSolution()
test(solution.topKFrequent)
test(solution.topKTwoLiner)Tests Passed
Tests Passed
Initial Results
My intial solution passed the LeetCode/NeetCode submission. However, the runtime was super slow and beats only 6% of entries, although top 94% for memory.
We could speed it up a little by using Python’s in-built max() function to find the most frequent key, but we’d be better off with a different approach. For example, by sorting the dictionary and slicing off the top k elements of the sorted dictionary.
The solution has the following worst-case complexity:
- Time complexity: \(O(n + k⋅n)\); if \(k = n\) this becomes \(O(n^2)\).
- Space complexity: \(O(n)\) comprising \(O(n)\) for the dictionary plus \(O(k)\) to store the list of top k elements.
NeetCode Solution
class Solution:
def topKFrequent(self, nums: List[int], k: int) -> List[int]:
count = {}
freq = [[] for i in range(len(nums) + 1)]
for n in nums:
count[n] = 1 + count.get(n, 0)
for n, c in count.items():
freq[c].append(n)
res = []
for i in range(len(freq) - 1, 0, -1):
for n in freq[i]:
res.append(n)
if len(res) == k:
return ressolution = Solution()
test(solution.topKFrequent)Tests Passed
Conclusion
Neetcode’s solution is very nice. The key insight is the use of bucket sort, which allows it to avoid typical O(n⋅log n) sorting and instead achieves linear time complexity. Note that freq has a length of len(nums) + 1, to accommodate frequencies from 0 to the maximum possible frequency (which is at most the length of the input). By using the frequency as an index into the freq list, it creates a natural ordering of elements by their frequencies without explicit sorting.
- Time complexity: \(O(n)\)
- Space complexity: \(O(n)\) comprising \(O(n + n + k)\)