Source code for pgl.sampling.sage

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
    This package implement graph sampling algorithm.
"""
import time
import copy

import numpy as np
from pgl import graph_kernel

from pgl.sampling.custom import subgraph

__all__ = ['graphsage_sample', ]


def traverse(item):
    """traverse the list or numpy"""
    if isinstance(item, list) or isinstance(item, np.ndarray):
        for i in iter(item):
            for j in traverse(i):
                yield j
    else:
        yield item


def flat_node_and_edge(nodes, eids, weights=None):
    """flatten the sub-lists to one list"""
    nodes = list(set(traverse(nodes)))
    eids = list(traverse(eids))
    if weights is not None:
        weights = list(traverse(weights))
    return nodes, eids, weights


def edge_hash(src, dst):
    """edge_hash
    """
    return src * 100000007 + dst


[docs]def graphsage_sample(graph, nodes, samples, ignore_edges=[]): """Implement of graphsage sample. Reference paper: https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf. Args: graph: A pgl graph instance nodes: Sample starting from nodes samples: A list, number of neighbors in each layer ignore_edges: list of edge(src, dst) will be ignored. Return: A list of subgraphs """ assert not graph.is_tensor(), "You must call Graph.numpy() first." node_index = copy.deepcopy(nodes) start = time.time() num_layers = len(samples) start_nodes = nodes nodes = list(start_nodes) eids, edges = [], [] nodes_set = set(nodes) layer_nodes, layer_eids, layer_edges = [], [], [] ignore_edge_set = set([edge_hash(src, dst) for src, dst in ignore_edges]) for layer_idx in reversed(range(num_layers)): if len(start_nodes) == 0: layer_nodes = [nodes] + layer_nodes layer_eids = [eids] + layer_eids layer_edges = [edges] + layer_edges continue batch_pred_nodes, batch_pred_eids = graph.sample_predecessor( start_nodes, samples[layer_idx], return_eids=True) start = time.time() last_nodes_set = nodes_set nodes, eids = copy.copy(nodes), copy.copy(eids) edges = copy.copy(edges) nodes_set, eids_set = set(nodes), set(eids) for srcs, dst, pred_eids in zip(batch_pred_nodes, start_nodes, batch_pred_eids): for src, eid in zip(srcs, pred_eids): if edge_hash(src, dst) in ignore_edge_set: continue if eid not in eids_set: eids.append(eid) edges.append([src, dst]) eids_set.add(eid) if src not in nodes_set: nodes.append(src) nodes_set.add(src) layer_edges = [edges] + layer_edges start_nodes = list(nodes_set - last_nodes_set) layer_nodes = [nodes] + layer_nodes layer_eids = [eids] + layer_eids start = time.time() # Find new nodes from_reindex = {x: i for i, x in enumerate(layer_nodes[0])} node_index = graph_kernel.map_nodes(node_index, from_reindex) sample_index = np.array(layer_nodes[0], dtype="int64") graph_list = [] for i in range(num_layers): sg = subgraph( graph, nodes=layer_nodes[0], eid=layer_eids[i], edges=layer_edges[i]) graph_list.append((sg, sample_index, node_index)) return graph_list