// PageRankSampling.java
/*
* Copyright © 2014 - 2021 Leipzig University (Database Research Group)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gradoop.flink.model.impl.operators.sampling;
import org.apache.flink.api.java.DataSet;
import org.gradoop.common.model.api.entities.Edge;
import org.gradoop.common.model.api.entities.GraphHead;
import org.gradoop.common.model.api.entities.Vertex;
import org.gradoop.flink.algorithms.gelly.pagerank.PageRank;
import org.gradoop.flink.model.api.epgm.BaseGraph;
import org.gradoop.flink.model.api.epgm.BaseGraphCollection;
import org.gradoop.flink.model.impl.operators.aggregation.functions.count.VertexCount;
import org.gradoop.flink.model.impl.operators.aggregation.functions.max.MaxVertexProperty;
import org.gradoop.flink.model.impl.operators.aggregation.functions.min.MinVertexProperty;
import org.gradoop.flink.model.impl.operators.aggregation.functions.sum.SumVertexProperty;
import org.gradoop.flink.model.impl.operators.sampling.common.SamplingConstants;
import org.gradoop.flink.model.impl.operators.sampling.functions.AddPageRankScoresToVertexCrossFunction;
import org.gradoop.flink.model.impl.operators.sampling.functions.PageRankResultVertexFilter;
/**
 * Samples a graph by the PageRank-scores of its vertices. The sampled graph receives a newly
 * generated graph head.
 *
 * The scores are computed with the Gradoop wrapper of Flink's PageRank-algorithm
 * ({@link PageRank}), configured with a dampening factor and a maximum number of iterations.
 * The score of a vertex is the sum of the PageRank-scores transmitted over all of its in-edges,
 * and the score of each vertex is divided evenly among its out-edges.
 * The algorithm is invoked with {@code setIncludeZeroDegreeVertices(true)}.
 *
 * If the vertices got different PageRank-scores, all scores are scaled to a range between
 * 0 and 1. A vertex is then kept if its score is greater than the sampling threshold
 * ({@code sampleGreaterThanThreshold} is {@code true}) or equal to/smaller than it
 * ({@code sampleGreaterThanThreshold} is {@code false}).
 *
 * If ALL vertices got the same PageRank-score, {@code keepVerticesIfSameScore} decides whether
 * all of them ({@code true}) or none of them ({@code false}) are sampled.
 *
 * An edge is kept only if both its source and its target vertex were sampled. The sampled
 * graph may still contain vertices that are not connected to any other vertex.
 *
 * @param <G>  The graph head type.
 * @param <V>  The vertex type.
 * @param <E>  The edge type.
 * @param <LG> The type of the graph.
 * @param <GC> The type of the graph collection.
 */
public class PageRankSampling<
  G extends GraphHead,
  V extends Vertex,
  E extends Edge,
  LG extends BaseGraph<G, V, E, LG, GC>,
  GC extends BaseGraphCollection<G, V, E, LG, GC>> extends SamplingAlgorithm<G, V, E, LG, GC> {

  /**
   * Dampening factor handed to the PageRank-algorithm.
   */
  private final double dampeningFactor;

  /**
   * Iteration count handed to the PageRank-algorithm.
   */
  private final int maxIteration;

  /**
   * PageRank-score threshold that decides whether a vertex is sampled.
   */
  private final double threshold;

  /**
   * {@code true}: sample vertices with a PageRank-score greater than the threshold,
   * {@code false}: sample vertices with a PageRank-score equal to/smaller than the threshold.
   */
  private final boolean sampleGreaterThanThreshold;

  /**
   * Used when all vertices got the same PageRank-score: {@code true} samples all vertices,
   * {@code false} samples none of them.
   */
  private final boolean keepVerticesIfSameScore;

  /**
   * Creates a new PageRankSampling instance.
   *
   * @param dampeningFactor            The dampening factor used by PageRank-algorithm, e.g. 0.85
   * @param maxIteration               The number of iterations used by PageRank-algorithm,
   *                                   e.g. 40
   * @param threshold                  The threshold for the PageRank-score (ranging between
   *                                   0 and 1 when scaled), determining if a vertex is sampled,
   *                                   e.g. 0.5
   * @param sampleGreaterThanThreshold Whether to sample vertices with a PageRank-score
   *                                   greater (true) or equal/smaller (false) the threshold
   * @param keepVerticesIfSameScore    Whether to sample all vertices (true) or none of them
   *                                   (false) in case all vertices got the same PageRank-score.
   */
  public PageRankSampling(double dampeningFactor, int maxIteration, double threshold,
    boolean sampleGreaterThanThreshold, boolean keepVerticesIfSameScore) {
    // PageRank configuration
    this.dampeningFactor = dampeningFactor;
    this.maxIteration = maxIteration;
    // Vertex selection configuration
    this.threshold = threshold;
    this.sampleGreaterThanThreshold = sampleGreaterThanThreshold;
    this.keepVerticesIfSameScore = keepVerticesIfSameScore;
  }

  /**
   * Runs PageRank on the given graph, derives score statistics, and keeps only those vertices
   * accepted by the {@link PageRankResultVertexFilter}.
   *
   * @param graph The graph to sample from.
   * @return The verified graph built from the accepted vertices and the edges of the input.
   */
  @Override
  public LG sample(LG graph) {
    // Let PageRank write its result under the sampling score property key; the trailing
    // 'true' is the flag described in the class comment (zero-degree vertices included).
    LG rankedGraph = graph.callForGraph(new PageRank<>(
      SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY, dampeningFactor, maxIteration, true));

    // Combine the input's graph head with the vertices and edges returned by PageRank.
    LG scoredGraph = graph.getFactory().fromDataSets(
      graph.getGraphHead(), rankedGraph.getVertices(), rankedGraph.getEdges());

    LG statisticsGraph = aggregateScoreStatistics(scoredGraph);

    // Pair every vertex with the (single) aggregated graph head, then filter by score.
    DataSet<V> candidates = statisticsGraph.getVertices();
    DataSet<G> statisticsHead = statisticsGraph.getGraphHead().first(1);
    DataSet<V> sampledVertices = candidates
      .crossWithTiny(statisticsHead)
      .with(new AddPageRankScoresToVertexCrossFunction<>())
      .filter(new PageRankResultVertexFilter<>(
        threshold, sampleGreaterThanThreshold, keepVerticesIfSameScore));

    // No graph head is passed here; all edges are handed over and the result is verified.
    LG sampledGraph = statisticsGraph.getFactory()
      .fromDataSets(sampledVertices, statisticsGraph.getEdges());
    return sampledGraph.verify();
  }

  /**
   * Aggregates the minimum, maximum and sum of the PageRank-score property over all vertices,
   * together with the vertex count.
   *
   * @param scoredGraph The graph resulting from the PageRank run.
   * @return The graph returned by the aggregation.
   */
  private LG aggregateScoreStatistics(LG scoredGraph) {
    return scoredGraph.aggregate(
      new MinVertexProperty(SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY),
      new MaxVertexProperty(SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY),
      new SumVertexProperty(SamplingConstants.PAGE_RANK_SCORE_PROPERTY_KEY),
      new VertexCount());
  }
}