面试题:在1亿个数据中取前10个最大的数据(Java实现)
处理大规模数据时获取前N个最大/最小元素是一个常见问题,以下是几种Java实现方案:
1. 使用优先队列(最小堆) - 最优方案
import java.util.Objects;
import java.util.PriorityQueue;

/**
 * Finds the largest elements of an int array with a bounded min-heap.
 *
 * <p>The heap never holds more than {@code k} values, so the extra memory is
 * O(k) and the running time is O(n log k). The input array is only read,
 * never modified.
 */
public class Top10Elements {

    /** Number of elements returned by {@link #findTop10(int[])}. */
    private static final int TOP_COUNT = 10;

    /**
     * Returns the 10 largest values of {@code nums} in descending order.
     *
     * @param nums the values to scan; must not be null
     * @return the largest values, biggest first; the array has
     *     {@code min(10, nums.length)} elements
     * @throws NullPointerException if {@code nums} is null
     */
    public static int[] findTop10(int[] nums) {
        return findTopK(nums, TOP_COUNT);
    }

    /**
     * Returns the {@code k} largest values of {@code nums} in descending order.
     *
     * @param nums the values to scan; must not be null
     * @param k how many values to keep; must not be negative
     * @return the largest values, biggest first; the array has
     *     {@code min(k, nums.length)} elements
     * @throws NullPointerException if {@code nums} is null
     * @throws IllegalArgumentException if {@code k} is negative
     */
    public static int[] findTopK(int[] nums, int k) {
        Objects.requireNonNull(nums, "nums");
        if (k < 0) {
            throw new IllegalArgumentException("k must not be negative: " + k);
        }
        if (k == 0 || nums.length == 0) {
            // PriorityQueue rejects an initial capacity below 1, so bail out early.
            return new int[0];
        }

        // Min-heap of the best candidates so far; its head is the weakest of them.
        PriorityQueue<Integer> minHeap = new PriorityQueue<>(Math.min(k, nums.length));
        for (int num : nums) {
            if (minHeap.size() < k) {
                minHeap.offer(num);
            } else if (num > minHeap.peek()) {
                minHeap.poll();      // drop the smallest candidate
                minHeap.offer(num);  // keep the new, larger value
            }
        }

        // The heap yields values smallest-first, so fill the result back to front.
        // Sizing by the heap (not by k) avoids polling null when nums is short.
        int[] result = new int[minHeap.size()];
        for (int i = result.length - 1; i >= 0; i--) {
            result[i] = minHeap.poll();
        }
        return result;
    }
}
时间复杂度:O(n log k),其中n是1亿,k是10。遍历全部数据需要O(n),每次堆操作(插入/删除)需要O(log k),因此总体为O(n log k);由于k=10是常数,实际上接近线性时间。
空间复杂度:O(k),只需要维护大小为10的堆。
2. 使用快速选择算法 - 适合数据已全部在内存数组中、额外空间有限的情况(原地操作,会打乱输入数组的顺序)
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;

/**
 * Finds the largest elements of an int array with an in-place quickselect.
 *
 * <p>The selection step rearranges the caller's array (no copy of the input
 * is made), so after the call the first 10 positions of {@code nums} hold its
 * 10 largest values in unspecified order.
 */
public class Top10Elements {

    /** Number of elements returned by {@link #findTop10(int[])}. */
    private static final int TOP_COUNT = 10;

    /**
     * Returns the 10 largest values of {@code nums} in descending order.
     *
     * <p>Side effect: when {@code nums} has more than 10 elements it is
     * reordered in place so that its first 10 positions contain the result.
     *
     * @param nums the values to select from; must not be null
     * @return the largest values, biggest first; the array has
     *     {@code min(10, nums.length)} elements
     * @throws NullPointerException if {@code nums} is null
     */
    public static int[] findTop10(int[] nums) {
        int count = Math.min(TOP_COUNT, nums.length);
        if (count < nums.length) {
            quickSelect(nums, 0, nums.length - 1, count);
        }

        // Copy only the elements that exist; copying a fixed 10 would zero-pad
        // short inputs and report zeros that were never in the data.
        int[] result = Arrays.copyOf(nums, count);
        Arrays.sort(result);

        // Arrays.sort is ascending; reverse to get largest-first order.
        for (int lo = 0, hi = count - 1; lo < hi; lo++, hi--) {
            swap(result, lo, hi);
        }
        return result;
    }

    /**
     * Rearranges {@code nums[left..right]} so that every index below {@code k}
     * holds a value greater than or equal to every value at index {@code k}
     * or above.
     *
     * <p>Written as a loop rather than recursion so the stack depth stays
     * constant regardless of the data. A random pivot plus a three-way
     * partition keeps sorted or heavily duplicated input from degrading to
     * quadratic time.
     */
    private static void quickSelect(int[] nums, int left, int right, int k) {
        ThreadLocalRandom random = ThreadLocalRandom.current();
        while (left < right) {
            int pivot = nums[random.nextInt(left, right + 1)];

            // Three-way partition in descending order:
            //   [left, greaterEnd)   > pivot
            //   [greaterEnd, cursor) == pivot
            //   (lessStart, right]   < pivot
            int greaterEnd = left;
            int cursor = left;
            int lessStart = right;
            while (cursor <= lessStart) {
                if (nums[cursor] > pivot) {
                    swap(nums, greaterEnd++, cursor++);
                } else if (nums[cursor] < pivot) {
                    // The value swapped in is unexamined, so cursor stays put.
                    swap(nums, cursor, lessStart--);
                } else {
                    cursor++;
                }
            }

            if (k < greaterEnd) {
                // The cut falls inside the "greater" run.
                right = greaterEnd - 1;
            } else if (k > lessStart + 1) {
                // The cut falls inside the "less" run.
                left = lessStart + 1;
            } else {
                // The cut lands on the run of pivot-equal values: done.
                return;
            }
        }
    }

    /** Exchanges the values at positions {@code i} and {@code j}. */
    private static void swap(int[] nums, int i, int j) {
        int temp = nums[i];
        nums[i] = nums[j];
        nums[j] = temp;
    }
}
时间复杂度:平均O(n);最坏O(n^2)(枢轴选取不佳时出现,例如固定取末尾元素作为枢轴而数据已有序或含大量重复值),可通过随机选取枢轴来降低最坏情况出现的概率。
3. 并行处理方案(针对超大数据集)
import java.util.Arrays;
import java.util.PriorityQueue;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

/**
 * Finds the largest elements of an int array by splitting the array into
 * ranges, processing the ranges as fork/join tasks, and merging the partial
 * results. The input array is only read, never modified.
 */
public class ParallelTop10 {

    /** Ranges spanning fewer than this many indices are scanned sequentially. */
    private static final int THRESHOLD = 1_000_000;

    /** Number of elements returned by {@link #findTop10(int[])}. */
    private static final int TOP_COUNT = 10;

    /**
     * Returns the 10 largest values of {@code nums} in descending order.
     *
     * @param nums the values to scan; must not be null
     * @return the largest values, biggest first; the array has
     *     {@code min(10, nums.length)} elements
     * @throws NullPointerException if {@code nums} is null
     */
    public static int[] findTop10(int[] nums) {
        return findTopK(nums, TOP_COUNT);
    }

    /**
     * Returns the {@code k} largest values of {@code nums} in descending order.
     *
     * @param nums the values to scan; must not be null
     * @param k how many values to keep; must not be negative
     * @return the largest values, biggest first; the array has
     *     {@code min(k, nums.length)} elements
     * @throws NullPointerException if {@code nums} is null
     * @throws IllegalArgumentException if {@code k} is negative
     */
    public static int[] findTopK(int[] nums, int k) {
        if (nums == null) {
            throw new NullPointerException("nums");
        }
        if (k < 0) {
            throw new IllegalArgumentException("k must not be negative: " + k);
        }
        if (k == 0 || nums.length == 0) {
            return new int[0];
        }

        ForkJoinPool pool = new ForkJoinPool();
        try {
            return pool.invoke(new TopKTask(nums, 0, nums.length - 1, k));
        } finally {
            // A pool that is never shut down keeps its worker threads around.
            pool.shutdown();
        }
    }

    /** Computes the top {@code k} values of {@code nums[start..end]} (both inclusive). */
    private static class TopKTask extends RecursiveTask<int[]> {
        private static final long serialVersionUID = 1L;

        private final int[] nums;
        private final int start;
        private final int end;
        private final int k;

        TopKTask(int[] nums, int start, int end, int k) {
            this.nums = nums;
            this.start = start;
            this.end = end;
            this.k = k;
        }

        @Override
        protected int[] compute() {
            if (end - start < THRESHOLD) {
                return sequentialTopK(nums, start, end, k);
            }
            int mid = start + (end - start) / 2;
            TopKTask left = new TopKTask(nums, start, mid, k);
            TopKTask right = new TopKTask(nums, mid + 1, end, k);
            // Fork one half and compute the other on the current thread.
            left.fork();
            int[] rightResult = right.compute();
            int[] leftResult = left.join();
            return merge(leftResult, rightResult, k);
        }

        /**
         * Scans {@code nums[start..end]} with a min-heap bounded to {@code k}
         * entries and returns the survivors in descending order.
         */
        private int[] sequentialTopK(int[] nums, int start, int end, int k) {
            PriorityQueue<Integer> minHeap = new PriorityQueue<>(k);
            for (int i = start; i <= end; i++) {
                if (minHeap.size() < k) {
                    minHeap.offer(nums[i]);
                } else if (nums[i] > minHeap.peek()) {
                    minHeap.poll();
                    minHeap.offer(nums[i]);
                }
            }
            // Size by the heap, not by k: a short range yields fewer than k
            // values and polling an empty heap would return null.
            int[] result = new int[minHeap.size()];
            for (int i = result.length - 1; i >= 0; i--) {
                result[i] = minHeap.poll();
            }
            return result;
        }

        /**
         * Merges two descending-sorted partial results into the top {@code k}
         * of their union, also in descending order. Both inputs come from
         * {@code sequentialTopK} or an earlier merge, which produce descending
         * order, so a two-pointer merge is enough.
         */
        private int[] merge(int[] a, int[] b, int k) {
            int[] merged = new int[Math.min(k, a.length + b.length)];
            int ai = 0;
            int bi = 0;
            for (int m = 0; m < merged.length; m++) {
                boolean takeFromA = bi >= b.length || (ai < a.length && a[ai] >= b[bi]);
                merged[m] = takeFromA ? a[ai++] : b[bi++];
            }
            return merged;
        }
    }
}
实际应用建议
- 数据可以流式读取或内存紧张:优先使用最小堆方案,实现简单且效率高,只需要O(k)的额外空间
- 数据已全部在内存数组中、额外空间有限:考虑快速选择算法(原地操作,但会打乱原数组的顺序)
- 超大数据集且有多核CPU:使用并行处理方案
- 数据存储在外部:考虑分批读取+堆处理的方式
对于1亿个数据,最小堆方案通常是最佳选择,因为它只需要O(n log k)的时间(k=10为常数,实际接近O(n))和O(k)的额外空间,并且不需要把全部数据一次性载入内存。