@Nondeterministic
public class ReservoirSample
extends org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
implements org.apache.pig.Algebraic
This is similar to SimpleRandomSample; however, it is guaranteed to produce
a sample of the given size. This comes at the cost of scalability.
SimpleRandomSample produces a sample of the desired size with a likelihood of 99.99%,
while using less internal storage. ReservoirSample, on the other hand, uses internal storage
whose size equals the desired sample size in order to guarantee the exact sample size.
This algebraic implementation is backed by a heap and retains each tuple's original roll (its random score) in order to compensate for skew.
Modifier and Type | Class and Description |
---|---|
static class |
ReservoirSample.Final |
static class |
ReservoirSample.Initial |
static class |
ReservoirSample.Intermediate |
Modifier and Type | Field and Description |
---|---|
protected java.lang.Integer |
numSamples |
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator |
scoreGen |
Constructor and Description |
---|
ReservoirSample(java.lang.String numSamples) |
Modifier and Type | Method and Description |
---|---|
void |
accumulate(org.apache.pig.data.Tuple input) |
void |
cleanup() |
org.apache.pig.data.DataBag |
exec(org.apache.pig.data.Tuple input) |
java.lang.String |
getFinal() |
java.lang.String |
getInitial() |
java.lang.String |
getIntermed() |
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator |
getScoreGenerator() |
org.apache.pig.data.DataBag |
getValue() |
org.apache.pig.impl.logicalLayer.schema.Schema |
outputSchema(org.apache.pig.impl.logicalLayer.schema.Schema input) |
allowCompileTimeCalculation, finish, getArgToFuncMapping, getCacheFiles, getInputSchema, getLogger, getPigLogger, getReporter, getReturnType, getSchemaName, getSchemaType, getShipFiles, isAsynchronous, progress, setInputSchema, setPigLogger, setReporter, setUDFContextSignature, warn
protected java.lang.Integer numSamples
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator scoreGen
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator getScoreGenerator()
public void accumulate(org.apache.pig.data.Tuple input) throws java.io.IOException
accumulate
in interface org.apache.pig.Accumulator<org.apache.pig.data.DataBag>
accumulate
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
Throws: java.io.IOException
public void cleanup()
cleanup
in interface org.apache.pig.Accumulator<org.apache.pig.data.DataBag>
cleanup
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
public org.apache.pig.data.DataBag getValue()
getValue
in interface org.apache.pig.Accumulator<org.apache.pig.data.DataBag>
getValue
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
public org.apache.pig.data.DataBag exec(org.apache.pig.data.Tuple input) throws java.io.IOException
exec
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
Throws: java.io.IOException
public org.apache.pig.impl.logicalLayer.schema.Schema outputSchema(org.apache.pig.impl.logicalLayer.schema.Schema input)
outputSchema
in class org.apache.pig.EvalFunc<org.apache.pig.data.DataBag>
public java.lang.String getInitial()
getInitial
in interface org.apache.pig.Algebraic
public java.lang.String getIntermed()
getIntermed
in interface org.apache.pig.Algebraic
public java.lang.String getFinal()
getFinal
in interface org.apache.pig.Algebraic