Finding cycles in directed graphs using TSQL

I recently noticed that the Create Data Diagram function of dbscript creates rather distorted diagrams if there are cycles of more than 2 tables in table relations. However, the algorithm handles pig’s ears and reciprocal relations correctly.

So the question was, how do I detect cycles in table relations?

Regarding a directed graph, tables are equivalent to nodes, foreign key relations are equivalent to directed edges, thus the question can be rephrased as:

How do I detect cycles in a directed graph, given the graph is stored in 2 tables with this structure:

CREATE TABLE #Node (
	ID INT PRIMARY KEY,
	Name NVARCHAR(128)
)

CREATE TABLE #Edge (
	ID INT IDENTITY(1,1) PRIMARY KEY,
	NodeIDFrom INT,
	NodeIDTo INT
)

Let’s populate the tables with the following values:

INSERT INTO #Node (ID, Name) VALUES (1, 'A')
INSERT INTO #Node (ID, Name) VALUES (2, 'B1')
INSERT INTO #Node (ID, Name) VALUES (3, 'B2')
INSERT INTO #Node (ID, Name) VALUES (4, 'C1')
INSERT INTO #Node (ID, Name) VALUES (5, 'C2')
INSERT INTO #Node (ID, Name) VALUES (6, 'C3')
INSERT INTO #Node (ID, Name) VALUES (7, 'D')
INSERT INTO #Node (ID, Name) VALUES (8, 'E')

-- Bx loop
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (2, 3)
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (3, 2)
-- Cx loop
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (4, 5)
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (5, 6)
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (6, 4)
-- D, E, C1, B1
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (7, 8 )
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (8, 4)
INSERT INTO #Edge(NodeIDFrom, NodeIDTo) VALUES (4, 2)

i.e. node A is not connected to any other, the Bx nodes form a cycle (length 2), the Cx nodes form a cycle (length 3), and there is a path D-E-C1-B1.

This algorithm performs the following operations:

In table #Graph, create a list of all paths starting from all nodes, until the most recently reached nodes match the starting nodes. List creation ends when no new edges can be added to the list of paths (@@ROWCOUNT = 0):

CREATE TABLE #Graph(
	ID INT IDENTITY(1,1) NOT NULL PRIMARY KEY,
	Seq INT,
	NodeID INT,
	EdgeID INT,
	NodeIDNext INT,
	NodeIDRoot int,
	IsCycle BIT DEFAULT (0)
)

INSERT INTO #Graph (Seq, NodeID, NodeIDRoot, EdgeID, NodeIDNext)
SELECT DISTINCT 1, #Node.ID, #Node.ID, #Edge.ID, #Edge.NodeIDTo
FROM #Node
INNER JOIN #Edge ON #Node.ID = #Edge.NodeIDFrom

PRINT @@ROWCOUNT

DECLARE @Seq INT, @RC INT
SET @Seq = 1
SET @RC = 1

WHILE @RC > 0 BEGIN

	UPDATE 	#Graph
	SET	IsCycle = 1
	WHERE	NodeIDRoot = NodeIDNext
	AND	Seq = @Seq

	INSERT INTO #Graph (Seq, NodeID, NodeIDRoot, EdgeID, NodeIDNext)
	SELECT DISTINCT @Seq + 1, #Edge.NodeIDFrom, #Graph.NodeIDRoot, 
		#Edge.ID, #Edge.NodeIDTo
	FROM 	#Graph
	INNER JOIN #Edge ON #Graph.NodeIDNext = #Edge.NodeIDFrom
	LEFT OUTER JOIN #Graph ex ON ex.NodeIDRoot = #Graph.NodeIDRoot 
		AND ex.EdgeID = #Edge.ID
	WHERE 	#Graph.IsCycle = 0
	AND 	#Graph.Seq = @Seq
	AND 	ex.ID IS NULL

	SET @RC = @@ROWCOUNT

	PRINT CONVERT(VARCHAR, @Seq) + ': ' + CONVERT(VARCHAR, @RC)
	SET @Seq = @Seq + 1
END

SELECT 	#Graph.NodeIDRoot, #Graph.Seq, #Graph.NodeID, 
		#Graph.NodeIDNext, #Graph.EdgeID, #Node.Name, Node2.Name
FROM 	#Graph
INNER JOIN #Node ON #Graph.NodeID = #Node.ID
INNER JOIN #Node Node2 ON #Graph.NodeIDNext = Node2.ID
WHERE 	IsCycle = 1
ORDER BY #Graph.NodeIDRoot, #Graph.Seq

The IsCycle attribute marks paths that reached its starting node (NodeIDRoot = NodeIDNext).

Next, we collect all final steps that led to cycle detection:

CREATE TABLE #Cycles(
	ID INT IDENTITY(1,1) PRIMARY KEY,
	CycleID INT,
	NodeIDRoot INT,
	Seq INT,
	NodeID INT,
	NodeIDNext INT,
	NodeName NVARCHAR(128),
	EdgeID INT
)

INSERT INTO #Cycles (NodeIDRoot, Seq, NodeID, NodeIDNext, NodeName, EdgeID)
SELECT NodeIDRoot, Seq, NodeID, NodeIDNext, #Node.Name, EdgeID
FROM 	#Graph
INNER JOIN #Node ON #Graph.NodeIDNext = #Node.ID
WHERE 	IsCycle = 1
ORDER BY #Graph.NodeIDRoot, #Graph.Seq

UPDATE 	#Cycles SET CycleID = ID

SELECT * FROM #Cycles
ORDER BY CycleID, Seq

SET @RC = @@ROWCOUNT

From this information we reconstruct the path taken from the starting nodes to the end nodes:

WHILE @RC > 0 BEGIN
	INSERT INTO #Cycles (CycleID, NodeIDRoot, Seq, NodeID, NodeIDNext, NodeName, EdgeID)
	SELECT DISTINCT #Cycles.CycleID, #Cycles.NodeIDRoot, #Graph.Seq, 
		#Graph.NodeID, #Graph.NodeIDNext, #Node.Name, #Graph.EdgeID
	FROM #Cycles
	INNER JOIN #Graph ON #Cycles.NodeIDRoot = #Graph.NodeIDRoot
		AND #Cycles.Seq -1 = #Graph.Seq
		AND #Cycles.NodeID = #Graph.NodeIDNext
	INNER JOIN #Node ON #Graph.NodeIDNext = #Node.ID
	LEFT OUTER JOIN #Cycles ex ON #Cycles.CycleID = ex.CycleID
		AND ex.NodeIDRoot = #Cycles.NodeIDRoot
		AND ex.Seq = #Graph.Seq
		AND ex.NodeID = #Graph.NodeID
		AND ex.NodeIDNext = #Graph.NodeIDNext
		AND ex.EdgeID = #Graph.EdgeID
	WHERE ex.ID IS NULL
	ORDER BY #Cycles.NodeIDRoot, #Graph.Seq

	SET @RC = @@ROWCOUNT
END

SELECT * FROM #Cycles
ORDER BY CycleID, Seq

The last query lists the cycles:
Cycle 1: B2 – B1
Cycle 2: B1 – B2
Cycle 3: C2 – C3 – C1
Cycle 4: C3 – C1 – C2
Cycle 5: C1 – C2 – C3

Finally, we clean up the temporary tables

DROP TABLE #Edge
DROP TABLE #Node
DROP TABLE #Graph
DROP TABLE #Cycles

2 thoughts on “Finding cycles in directed graphs using TSQL

  1. Pingback: dbscript New Version 0.99 « devioblog

Leave a comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.