public class ExternalSort
extends java.lang.Object
Goal: offer a generic external-memory sorting program in Java. It must be:
- hackable (easy to adapt)
- scalable to large files
- sensibly efficient.
This software is in the public domain.
Usage:
java org/apache/oak/commons/sort/ExternalSort somefile.txt out.txt
You can change the default maximal number of temporary files with the -t flag:
java org/apache/oak/commons/sort/ExternalSort somefile.txt out.txt -t 3
You can change the default maximum memory available with the -m flag:
java org/apache/oak/commons/sort/ExternalSort somefile.txt out.txt -m 8192
For very large files, you might want to use an appropriate flag to allocate more memory to the Java VM:
java -Xms2G org/apache/oak/commons/sort/ExternalSort somefile.txt out.txt
By (in alphabetical order) Philippe Beaudoin, Eleftherios Chetzakis, Jon Elsas, Christan Grant, Daniel Haran, Daniel Lemire, Sugumaran Harikrishnan, Jerry Yang.
First published: April 2010, originally posted at http://lemire.me/blog/archives/2010/04/01/external-memory-sorting-in-java/
| Modifier and Type | Field and Description |
|---|---|
static java.util.Comparator<java.lang.String> |
defaultcomparator |
| Constructor and Description |
|---|
ExternalSort() |
| Modifier and Type | Method and Description |
|---|---|
static void |
displayUsage() |
static long |
estimateBestSizeOfBlocks(java.io.File filetobesorted,
int maxtmpfiles,
long maxMemory) |
static void |
main(java.lang.String[] args) |
static <T> int |
merge(java.io.BufferedWriter fbw,
java.util.Comparator<T> cmp,
boolean distinct,
java.util.List<org.apache.jackrabbit.oak.commons.sort.BinaryFileBuffer<T>> buffers,
java.util.function.Function<T,java.lang.String> typeToString)
This merges several BinaryFileBuffer to an output writer.
|
static <T> int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.BufferedWriter fbw,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
This merges a bunch of temporary flat files and deletes them on success or error.
|
static <T> int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.BufferedWriter fbw,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
This merges a bunch of temporary flat files and deletes them on success or error.
|
static int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile)
This merges a bunch of temporary flat files
|
static int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp)
This merges a bunch of temporary flat files
|
static int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
boolean distinct)
This merges a bunch of temporary flat files
|
static int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs)
This merges a bunch of temporary flat files
|
static int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
boolean distinct)
This merges a bunch of temporary flat files
|
static <T> int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
boolean usegzip)
This merges a bunch of temporary flat files
|
static <T> int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
Compression algorithm) |
static <T> int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
This merges a bunch of temporary flat files and deletes them on success or error.
|
static <T> int |
mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType) |
static void |
sort(java.io.File input,
java.io.File output) |
static java.io.File |
sortAndSave(java.util.List<java.lang.String> tmplist,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory)
Sort a list and save it to a temporary file
|
static java.io.File |
sortAndSave(java.util.List<java.lang.String> tmplist,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
boolean usegzip)
Sort a list and save it to a temporary file
|
static <T> java.io.File |
sortAndSave(java.util.List<T> tmplist,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString)
Sort a list and save it to a temporary file
|
static <T> java.io.File |
sortAndSave(java.util.List<T> tmplist,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString)
Sort a list and save it to a temporary file
|
static <T> java.util.List<java.io.File> |
sortInBatch(java.io.BufferedReader fbr,
long actualFileSize,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType) |
static <T> java.util.List<java.io.File> |
sortInBatch(java.io.BufferedReader fbr,
long actualFileSize,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType) |
static java.util.List<java.io.File> |
sortInBatch(java.io.File file)
This will simply load the file by blocks of lines, then sort them in-memory, and write the
result to temporary files that have to be merged later.
|
static java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp)
This will simply load the file by blocks of lines, then sort them in-memory, and write the
result to temporary files that have to be merged later.
|
static java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
boolean distinct)
This will simply load the file by blocks of lines, then sort them in-memory, and write the
result to temporary files that have to be merged later.
|
static java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct)
This will simply load the file by blocks of lines, then sort them in-memory, and write the
result to temporary files that have to be merged later.
|
static java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
boolean usegzip)
This will simply load the file by blocks of lines, then sort them in-memory, and write the
result to temporary files that have to be merged later.
|
static java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
Compression algorithm) |
static <T> java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
This will simply load the file by blocks of lines, then sort them in-memory, and write the
result to temporary files that have to be merged later.
|
static <T> java.util.List<java.io.File> |
sortInBatch(java.io.File file,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType) |
public static java.util.Comparator<java.lang.String> defaultcomparator
public static void sort(java.io.File input,
java.io.File output)
throws java.io.IOException
java.io.IOExceptionpublic static long estimateBestSizeOfBlocks(java.io.File filetobesorted,
int maxtmpfiles,
long maxMemory)
public static java.util.List<java.io.File> sortInBatch(java.io.File file)
throws java.io.IOException
file - some flat filejava.io.IOExceptionpublic static java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp)
throws java.io.IOException
file - some flat filecmp - string comparatorjava.io.IOExceptionpublic static java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
boolean distinct)
throws java.io.IOException
file - some flat filecmp - string comparatordistinct - Pass true if duplicate lines should be discarded.java.io.IOExceptionpublic static java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
boolean usegzip)
throws java.io.IOException
file - some flat file
cmp - string comparator
maxtmpfiles - maximal number of temporary files
cs - character set to use (can use Charset.defaultCharset())
tmpdirectory - location of the temporary files (set to null for default location)
distinct - Pass true if duplicate lines should be discarded.
numHeader - number of lines to exclude before sorting starts
usegzip - use gzip compression for the temporary files
java.io.IOException
public static java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
Compression algorithm)
throws java.io.IOException
java.io.IOExceptionpublic static <T> java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
file - some flat file
cmp - string comparator
maxtmpfiles - maximal number of temporary files
cs - character set to use (can use Charset.defaultCharset())
tmpdirectory - location of the temporary files (set to null for default location)
distinct - Pass true if duplicate lines should be discarded.
numHeader - number of lines to exclude before sorting starts
usegzip - use gzip compression for the temporary files
typeToString - function to map custom type to string. Used for storing sorted content back to file
stringToType - function to map string to custom type. Used for converting a line to the custom type for the purpose of sorting
java.io.IOException
public static <T> java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
java.io.IOExceptionpublic static <T> java.util.List<java.io.File> sortInBatch(java.io.BufferedReader fbr,
long actualFileSize,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
java.io.IOExceptionpublic static <T> java.util.List<java.io.File> sortInBatch(java.io.BufferedReader fbr,
long actualFileSize,
java.util.Comparator<T> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
int numHeader,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
java.io.IOExceptionpublic static java.util.List<java.io.File> sortInBatch(java.io.File file,
java.util.Comparator<java.lang.String> cmp,
int maxtmpfiles,
long maxMemory,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct)
throws java.io.IOException
file - some flat filecmp - string comparatormaxtmpfiles - maximal number of temporary filescs - character set to use (can use Charset.defaultCharset())tmpdirectory - location of the temporary files (set to null for default location)distinct - Pass true if duplicate lines should be discarded.java.io.IOExceptionpublic static java.io.File sortAndSave(java.util.List<java.lang.String> tmplist,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory)
throws java.io.IOException
tmplist - data to be sortedcmp - string comparatorcs - charset to use for output (can use Charset.defaultCharset())tmpdirectory - location of the temporary files (set to null for default location)java.io.IOExceptionpublic static java.io.File sortAndSave(java.util.List<java.lang.String> tmplist,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
boolean usegzip)
throws java.io.IOException
tmplist - data to be sortedcmp - string comparatorcs - charset to use for output (can use Charset.defaultCharset())tmpdirectory - location of the temporary files (set to null for default location)distinct - Pass true if duplicate lines should be discarded.java.io.IOExceptionpublic static <T> java.io.File sortAndSave(java.util.List<T> tmplist,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString)
throws java.io.IOException
tmplist - data to be sorted
cmp - string comparator
cs - charset to use for output (can use Charset.defaultCharset())
tmpdirectory - location of the temporary files (set to null for default location)
distinct - Pass true if duplicate lines should be discarded.
usegzip - assumes we used gzip compression for temporary files
typeToString - function to map custom type to string. Used for writing the sorted content to the temporary file
java.io.IOException
public static <T> java.io.File sortAndSave(java.util.List<T> tmplist,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
java.io.File tmpdirectory,
boolean distinct,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString)
throws java.io.IOException
tmplist - data to be sorted
cmp - string comparator
cs - charset to use for output (can use Charset.defaultCharset())
tmpdirectory - location of the temporary files (set to null for default location)
distinct - Pass true if duplicate lines should be discarded.
algorithm - compression algorithm to use for the temporary files
typeToString - function to map custom type to string. Used for writing the sorted content to the temporary file
java.io.IOException
public static int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile)
throws java.io.IOException
files - The List of sorted Files to be merged.
outputfile - The output File to merge the results to.
java.io.IOException
public static int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp)
throws java.io.IOException
files - The List of sorted Files to be merged.
outputfile - The output File to merge the results to.
java.io.IOException
public static int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
boolean distinct)
throws java.io.IOException
files - The List of sorted Files to be merged.
outputfile - The output File to merge the results to.
java.io.IOException
public static <T> int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
boolean usegzip)
throws java.io.IOException
files - The List of sorted Files to be merged.
distinct - Pass true if duplicate lines should be discarded. (elchetz@gmail.com)
outputfile - The output File to merge the results to.
cmp - The Comparator to use to compare Strings.
cs - The Charset to be used for the byte to character conversion.
append - Pass true if the result should be appended to the File instead of overwriting it. Defaults to false for overloaded methods.
usegzip - assumes we used gzip compression for temporary files
java.io.IOException
public static <T> int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
Compression algorithm)
throws java.io.IOException
java.io.IOExceptionpublic static <T> int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
files - The List of sorted Files to be merged.
outputfile - The output File to merge the results to.
cmp - The Comparator to use to compare Strings.
cs - The Charset to be used for the byte to character conversion.
distinct - Pass true if duplicate lines should be discarded. (elchetz@gmail.com)
append - Pass true if the result should be appended to the File instead of overwriting it. Defaults to false for overloaded methods.
usegzip - assumes we used gzip compression for temporary files
typeToString - function to map custom type to string. Used for storing sorted content back to file
stringToType - function to map string to custom type. Used for converting a line to the custom type for the purpose of sorting
java.io.IOException
public static <T> int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean append,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
java.io.IOExceptionpublic static <T> int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.BufferedWriter fbw,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
boolean usegzip,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
files - The List of sorted Files to be merged.
fbw - Buffered writer used to store the sorted content
cmp - The Comparator to use to compare Strings.
cs - The Charset to be used for the byte to character conversion.
distinct - Pass true if duplicate lines should be discarded. (elchetz@gmail.com)
usegzip - assumes we used gzip compression for temporary files
typeToString - function to map custom type to string. Used for storing sorted content back to file
stringToType - function to map string to custom type. Used for converting a line to the custom type for the purpose of sorting
java.io.IOException
public static <T> int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.BufferedWriter fbw,
java.util.Comparator<T> cmp,
java.nio.charset.Charset cs,
boolean distinct,
Compression algorithm,
java.util.function.Function<T,java.lang.String> typeToString,
java.util.function.Function<java.lang.String,T> stringToType)
throws java.io.IOException
files - The List of sorted Files to be merged.
fbw - Buffered writer used to store the sorted content
cmp - The Comparator to use to compare Strings.
cs - The Charset to be used for the byte to character conversion.
distinct - Pass true if duplicate lines should be discarded. (elchetz@gmail.com)
algorithm - compression algorithm used for the temporary files; by default, gzip compression is assumed
typeToString - function to map custom type to string. Used for storing sorted content back to file
stringToType - function to map string to custom type. Used for converting a line to the custom type for the purpose of sorting
java.io.IOException
public static <T> int merge(java.io.BufferedWriter fbw,
java.util.Comparator<T> cmp,
boolean distinct,
java.util.List<org.apache.jackrabbit.oak.commons.sort.BinaryFileBuffer<T>> buffers,
java.util.function.Function<T,java.lang.String> typeToString)
throws java.io.IOException
fbw - A buffer where we write the data.
cmp - A comparator object that tells us how to sort the lines.
distinct - Pass true if duplicate lines should be discarded. (elchetz@gmail.com)
buffers - Where the data should be read.
typeToString - function to map custom type to string. Used for writing the merged content to the output writer
java.io.IOException
public static int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs,
boolean distinct)
throws java.io.IOException
files - The List of sorted Files to be merged.distinct - Pass true if duplicate lines should be discarded. (elchetz@gmail.com)outputfile - The output File to merge the results to.cmp - The Comparator to use to compare Strings.cs - The Charset to be used for the byte to character conversion.java.io.IOExceptionpublic static int mergeSortedFiles(java.util.List<java.io.File> files,
java.io.File outputfile,
java.util.Comparator<java.lang.String> cmp,
java.nio.charset.Charset cs)
throws java.io.IOException
files - The List of sorted Files to be merged.
outputfile - The output File to merge the results to.
cs - character set to use to load the strings
java.io.IOException
public static void displayUsage()
public static void main(java.lang.String[] args)
throws java.io.IOException
java.io.IOExceptionCopyright © 2010 - 2023 Adobe. All Rights Reserved