* Copyright © 2014 - 2021 Leipzig University (Database Research Group)
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
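* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.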
* See the License for the specific language governing permissions and
* limitations under the License.
package org.gradoop.flink.model.impl.operators.sampling;
import org.gradoop.common.model.api.entities.Edge;
import org.gradoop.common.model.api.entities.GraphHead;
import org.gradoop.common.model.api.entities.Vertex;
import org.gradoop.flink.algorithms.gelly.pagerank.PageRank;
import org.gradoop.flink.model.api.epgm.BaseGraph;
import org.gradoop.flink.model.api.epgm.BaseGraphCollection;
import org.gradoop.flink.model.impl.operators.aggregation.functions.count.VertexCount;
import org.gradoop.flink.model.impl.operators.aggregation.functions.max.MaxVertexProperty;
import org.gradoop.flink.model.impl.operators.aggregation.functions.min.MinVertexProperty;
import org.gradoop.flink.model.impl.operators.aggregation.functions.sum.SumVertexProperty;
import org.gradoop.flink.model.impl.operators.sampling.common.SamplingConstants;
import org.gradoop.flink.model.impl.operators.sampling.functions.AddPageRankScoresToVertexCrossFunction;
import org.gradoop.flink.model.impl.operators.sampling.functions.PageRankResultVertexFilter;
* Computes a PageRank-Sampling of the graph (new graph head will be generated).
* Uses the Gradoop wrapper of Flink's PageRank algorithm {@link PageRank} with a dampening factor
* and a maximum number of iterations. It computes a per-vertex score which is the sum of the
* PageRank-scores transmitted over all in-edges. The score of each vertex is divided evenly
* among its out-edges.
* The PageRank-algorithm is called with {@code setIncludeZeroDegreeVertices(true)}.
* If the vertices got different PageRank-scores, all scores are scaled to the range between 0 and 1.
* Then it retains all vertices whose PageRank-score is either greater than, or smaller than or
* equal to, a given sampling threshold - depending on the Boolean set in
* {@code sampleGreaterThanThreshold}.
* If ALL vertices got the same PageRank-score, it can be decided whether to sample all vertices
* or none of them - depending on the Boolean set in {@code keepVerticesIfSameScore}.
* Retains all edges whose source and target vertices were both chosen. Some unconnected
* vertices may remain in the sampled graph.
* @param <G> The graph head type.
* @param <V> The vertex type.
* @param <E> The edge type.
* @param <LG> The type of the graph.
* @param <GC> The type of the graph collection.
// Sampling operator that extends SamplingAlgorithm. It is generic over the graph head (G),
// vertex (V), edge (E), graph (LG) and graph collection (GC) types.
public class PageRankSampling<
G extends GraphHead,
V extends Vertex,
E extends Edge,
LG extends BaseGraph<G, V, E, LG, GC>,
GC extends BaseGraphCollection<G, V, E, LG, GC>> extends SamplingAlgorithm<G, V, E, LG, GC> {
// Configuration fields: all are final and assigned exactly once by the constructor.
* Dampening factor used by PageRank-algorithm
private final double dampeningFactor;
* Number of iterations used by PageRank-algorithm
private final int maxIteration;
* Sampling threshold for PageRankScore
private final double threshold;
* Whether to sample vertices with PageRank-score greater (true) or equal/smaller (false)
* than the threshold
private final boolean sampleGreaterThanThreshold;
* Whether to sample all vertices (true) or none of them (false), in case all vertices got the
* same PageRank-score.
private final boolean keepVerticesIfSameScore;
* Creates a new PageRankSampling instance.
* @param dampeningFactor The dampening factor used by PageRank-algorithm, e.g. 0.85
* @param maxIteration The number of iterations used by PageRank-algorithm, e.g. 40
* @param threshold The threshold for the PageRank-score (ranging between 0 and 1 when scaled),
* determining if a vertex is sampled, e.g. 0.5
* @param sampleGreaterThanThreshold Whether to sample vertices with a PageRank-score
* greater than (true) or smaller than or equal to (false) the threshold
* @param keepVerticesIfSameScore Whether to sample all vertices (true) or none of them (false)
* in case all vertices got the same PageRank-score.
public PageRankSampling(double dampeningFactor, int maxIteration, double threshold,
boolean sampleGreaterThanThreshold, boolean keepVerticesIfSameScore) {
// Plain field initialization: each argument is copied into the final field of the same name.
// No range or sanity checks are applied to the arguments here.
this.dampeningFactor = dampeningFactor;
this.threshold = threshold;
this.maxIteration = maxIteration;
// The two switches below select the filter direction and the behavior when all scores are equal.
this.sampleGreaterThanThreshold = sampleGreaterThanThreshold;
this.keepVerticesIfSameScore = keepVerticesIfSameScore;
// Samples the given graph based on PageRank scores and returns the resulting graph.
// NOTE(review): several statements in this method appear truncated in this view (unclosed
// argument lists, a missing operator before `.with`). The comments below describe only what
// is visible - confirm against the complete file.
public LG sample(LG graph) {
// Step 1: run the PageRank operator on the input graph (its constructor arguments are not
// visible here).
LG pageRankGraph = graph.callForGraph(new PageRank<>(
// Step 2: rebuild the working graph from the ORIGINAL graph head combined with the vertices
// and edges of the PageRank result.
graph = graph.getFactory().fromDataSets(
graph.getGraphHead(), pageRankGraph.getVertices(), pageRankGraph.getEdges());
// Step 3: aggregate the minimum, maximum and sum of the PageRank score vertex property, plus
// the vertex count. The aggregated graph replaces the working graph; presumably the results
// end up on the graph head - TODO confirm.
graph = graph
.aggregate(new MinVertexProperty(SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY),
new MaxVertexProperty(SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY),
new SumVertexProperty(SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY),
new VertexCount());
// Step 4: apply AddPageRankScoresToVertexCrossFunction to the vertices, then filter them with
// PageRankResultVertexFilter using the threshold and comparison direction. The operator that
// `.with` belongs to and the remaining filter arguments are not visible in this view.
DataSet<V> scaledVertices = graph.getVertices()
.with(new AddPageRankScoresToVertexCrossFunction<>())
.filter(new PageRankResultVertexFilter<>(threshold, sampleGreaterThanThreshold,
// Step 5: build the result from the filtered vertices and the working graph's edges, then call
// verify() on it (presumably drops edges whose endpoints were filtered out - TODO confirm).
return graph.getFactory().fromDataSets(scaledVertices, graph.getEdges()).verify();