Python: simple list merging based on intersections
My attempt:
def merge(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = True
    while merged:
        merged = False
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = True
                    common |= x
            results.append(common)
        sets = results
    return sets
lst = [[65, 17, 5, 30, 79, 56, 48, 62],
       [6, 97, 32, 93, 55, 14, 70, 32],
       [75, 37, 83, 34, 9, 19, 14, 64],
       [43, 71],
       [],
       [89, 49, 1, 30, 28, 3, 63],
       [35, 21, 68, 94, 57, 94, 9, 3],
       [16],
       [29, 9, 97, 43],
       [17, 63, 24]]

print merge(lst)
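For reference (my own trace of the example, not output from the original post), the call should produce two groups: the singleton {16}, which shares no value with any other list, and one set containing all 37 remaining values. A quick sanity check along those lines:

result = merge(lst)
assert sorted(map(len, result)) == [1, 37]   # one singleton, one big merged group
assert set([16]) in result                   # 16 is disjoint from everything else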
Benchmark:
import random
# adapt parameters to your own usage scenario
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
if False:  # change to True to generate the test data file (takes a while)
    with open("/tmp/test.txt", "w") as f:
        lists = []
        classes = [
            range(class_size * i, class_size * (i + 1)) for i in range(class_count)
        ]
        for c in classes:
            # distribute each class across list_count_per_class lists
            for i in xrange(list_count_per_class):
                lst = []
                if random.random() < large_list_probability:
                    size = random.choice(large_list_sizes)
                else:
                    size = random.choice(small_list_sizes)
                nums = set(c)
                for j in xrange(size):
                    x = random.choice(list(nums))
                    lst.append(x)
                    nums.remove(x)
                random.shuffle(lst)
                lists.append(lst)
        random.shuffle(lists)
        for lst in lists:
            f.write(" ".join(str(x) for x in lst) + "\n")
setup = """
# Niklas'
def merge_niklas(lsts):
sets = [set(lst) for lst in lsts if lst]
merged = 1
while merged:
merged = 0
results = []
while sets:
common, rest = sets[0], sets[1:]
sets = []
for x in rest:
if x.isdisjoint(common):
sets.append(x)
else:
merged = 1
common |= x
results.append(common)
sets = results
return sets
# Rik's
def merge_rik(data):
    sets = (set(e) for e in data if e)
    results = [next(sets)]
    for e_set in sets:
        to_update = []
        for i, res in enumerate(results):
            if not e_set.isdisjoint(res):
                to_update.insert(0, i)
        if not to_update:
            results.append(e_set)
        else:
            last = results[to_update.pop(-1)]
            for i in to_update:
                last |= results[i]
                del results[i]
            last |= e_set
    return results
# katrielalex's
def pairs(lst):
    i = iter(lst)
    first = prev = item = i.next()
    for item in i:
        yield prev, item
        prev = item
    yield item, first

import networkx

def merge_katrielalex(lsts):
    g = networkx.Graph()
    for lst in lsts:
        for edge in pairs(lst):
            g.add_edge(*edge)
    return networkx.connected_components(g)
# agf's (optimized)
from collections import deque

def merge_agf_optimized(lists):
    sets = deque(set(lst) for lst in lists if lst)
    results = []
    disjoint = 0
    current = sets.pop()
    while True:
        merged = False
        newsets = deque()
        for _ in xrange(disjoint, len(sets)):
            this = sets.pop()
            if not current.isdisjoint(this):
                current.update(this)
                merged = True
                disjoint = 0
            else:
                newsets.append(this)
                disjoint += 1
        if sets:
            newsets.extendleft(sets)
        if not merged:
            results.append(current)
            try:
                current = newsets.pop()
            except IndexError:
                break
            disjoint = 0
        sets = newsets
    return results
# agf's (simple)
def merge_agf_simple(lists):
    newsets, sets = [set(lst) for lst in lists if lst], []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets
# alexis'
def merge_alexis(data):
    bins = range(len(data))  # Initialize each bin[n] == n
    nums = dict()
    data = [set(m) for m in data]  # Convert to sets
    for r, row in enumerate(data):
        for num in row:
            if num not in nums:
                # New number: tag it with a pointer to this row's bin
                nums[num] = r
                continue
            else:
                dest = locatebin(bins, nums[num])
                if dest == r:
                    continue  # already in the same bin
                if dest > r:
                    dest, r = r, dest  # always merge into the smallest bin
                data[dest].update(data[r])
                data[r] = None
                # Update our indices to reflect the move
                bins[r] = dest
                r = dest
    # Filter out the empty bins
    have = [m for m in data if m]
    return have

def locatebin(bins, n):
    while bins[n] != n:
        n = bins[n]
    return n
lsts = []
size = 0
num = 0
max = 0
for line in open("/tmp/test.txt", "r"):
    lst = [int(x) for x in line.split()]
    size += len(lst)
    if len(lst) > max:
        max = len(lst)
    num += 1
    lsts.append(lst)
"""
setup += """
print "%i lists, {class_count} equally distributed classes, average size %i, max size %i" % (num, size/num, max)
""".format(class_count=class_count)
import timeit
print "niklas"
print timeit.timeit("merge_niklas(lsts)", setup=setup, number=3)
print "rik"
print timeit.timeit("merge_rik(lsts)", setup=setup, number=3)
print "katrielalex"
print timeit.timeit("merge_katrielalex(lsts)", setup=setup, number=3)
print "agf (1)"
print timeit.timeit("merge_agf_optimized(lsts)", setup=setup, number=3)
print "agf (2)"
print timeit.timeit("merge_agf_simple(lsts)", setup=setup, number=3)
print "alexis"
print timeit.timeit("merge_alexis(lsts)", setup=setup, number=3)
These timings obviously depend on the specific benchmark parameters, like the number of classes, the number of lists, list size, etc. Adapt those parameters to your needs to get more helpful results.
Below are some example outputs on my machine for different parameters. They show that all the algorithms have their strengths and weaknesses, depending on the kind of input they get:
=====================
# many disjoint classes, large lists
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
=====================
niklas
5000 lists, 50 equally distributed classes, average size 298, max size 999
4.80084705353
rik
5000 lists, 50 equally distributed classes, average size 298, max size 999
9.49251699448
katrielalex
5000 lists, 50 equally distributed classes, average size 298, max size 999
21.5317108631
agf (1)
5000 lists, 50 equally distributed classes, average size 298, max size 999
8.61671280861
agf (2)
5000 lists, 50 equally distributed classes, average size 298, max size 999
5.18117713928
=> alexis
=> 5000 lists, 50 equally distributed classes, average size 298, max size 999
=> 3.73504281044
===================
# fewer classes, large lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
===================
niklas
4500 lists, 15 equally distributed classes, average size 296, max size 999
1.79993700981
rik
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.58237695694
katrielalex
4500 lists, 15 equally distributed classes, average size 296, max size 999
19.5465381145
agf (1)
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.75445604324
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 296, max size 999
=> 1.77850699425
alexis
4500 lists, 15 equally distributed classes, average size 296, max size 999
3.23530197144
===================
# fewer classes, smaller lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.1
===================
niklas
4500 lists, 15 equally distributed classes, average size 95, max size 997
0.773697137833
rik
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.0523750782
katrielalex
4500 lists, 15 equally distributed classes, average size 95, max size 997
6.04466891289
agf (1)
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.20285701752
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 95, max size 997
=> 0.714507102966
alexis
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.1286110878
I tried to summarize everything that's been said and done about this topic in this question and in the duplicate one.
I tried to test and time every solution (all the code here).
Testing
This is the TestCase from the testing module:
class MergeTestCase(unittest.TestCase):

    def setUp(self):
        with open('./lists/test_list.txt') as f:
            self.lsts = json.loads(f.read())
        self.merged = self.merge_func(deepcopy(self.lsts))

    def test_disjoint(self):
        """Check disjoint-ness of merged results"""
        from itertools import combinations
        for a, b in combinations(self.merged, 2):
            self.assertTrue(a.isdisjoint(b))

    def test_coverage(self):    # Credit to katrielalex
        """Check coverage of the original data"""
        merged_flat = set()
        for s in self.merged:
            merged_flat |= s
        original_flat = set()
        for lst in self.lsts:
            original_flat |= set(lst)
        self.assertTrue(merged_flat == original_flat)

    def test_subset(self):      # Credit to WolframH
        """Check that every piece of original data is a subset of some result"""
        for lst in self.lsts:
            self.assertTrue(any(set(lst) <= e for e in self.merged))
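The harness auto-loads the merge functions, but purely for illustration (a hypothetical wiring sketch; the actual test runner in the repository builds these bindings automatically and may do it differently), a single function could be plugged into this TestCase like so:

# hypothetical: bind one merge function to the TestCase above
class NiklasMergeTestCase(MergeTestCase):
    merge_func = staticmethod(merge_niklas)   # any merge function taking a list of lists

if __name__ == '__main__':
    unittest.main()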
This test assumes a list of sets as the result, so I couldn't test a couple of solutions that worked with lists.
I couldn't test the following:
katrielalex
steabert
Among the ones I could test, two failed:
-- Going to test: agf (optimized) --
Check disjoint-ness of merged results ... FAIL
-- Going to test: robert king --
Check disjoint-ness of merged results ... FAIL
Timing
Performance is strongly tied to the test data employed.
So far three answers have tried to time their own and the other solutions. Since they used different test data, they got different results.
- Niklas' benchmark is very tweakable: by changing its parameters one can run different tests. I've used the same three sets of parameters he used in his own answer, and put them in three different files:
filename = './lists/timing_1.txt'
class_count = 50,
class_size = 1000,
list_count_per_class = 100,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.5,

filename = './lists/timing_2.txt'
class_count = 15,
class_size = 1000,
list_count_per_class = 300,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.5,

filename = './lists/timing_3.txt'
class_count = 15,
class_size = 1000,
list_count_per_class = 300,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.1,
These are the results I got:
From file: timing_1.txt

Timing with: >> Niklas << Benchmark
Info: 5000 lists, average size 305, max size 999

Timing Results:
10.434  -- alexis
11.476  -- agf
11.555  -- Niklas B.
13.622  -- Rik. Poggi
14.016  -- agf (optimized)
14.057  -- ChessMaster
20.208  -- katrielalex
21.697  -- steabert
25.101  -- robert king
76.870  -- Sven Marnach
133.399 -- hochl
From file: timing_2.txt

Timing with: >> Niklas << Benchmark
Info: 4500 lists, average size 305, max size 999

Timing Results:
8.247   -- Niklas B.
8.286   -- agf
8.637   -- Rik. Poggi
8.967   -- alexis
9.090   -- ChessMaster
9.091   -- agf (optimized)
18.186  -- katrielalex
19.543  -- steabert
22.852  -- robert king
70.486  -- Sven Marnach
104.405 -- hochl
From file: timing_3.txt

Timing with: >> Niklas << Benchmark
Info: 4500 lists, average size 98, max size 999

Timing Results:
2.746  -- agf
2.850  -- Niklas B.
2.887  -- Rik. Poggi
2.972  -- alexis
3.077  -- ChessMaster
3.174  -- agf (optimized)
5.811  -- katrielalex
7.208  -- robert king
9.193  -- steabert
23.536 -- Sven Marnach
37.436 -- hochl
- With Sven's testing data I got the following results:

Timing with: >> Sven << Benchmark
Info: 200 lists, average size 10, max size 10

Timing Results:
2.053 -- alexis
2.199 -- ChessMaster
2.410 -- agf (optimized)
3.394 -- agf
3.398 -- Rik. Poggi
3.640 -- robert king
3.719 -- steabert
3.776 -- Niklas B.
3.888 -- hochl
4.610 -- Sven Marnach
5.018 -- katrielalex
- And finally with agf's benchmark I got:

Timing with: >> Agf << Benchmark
Info: 2000 lists, average size 246, max size 500

Timing Results:
3.446    -- Rik. Poggi
3.500    -- ChessMaster
3.520    -- agf (optimized)
3.527    -- Niklas B.
3.527    -- agf
3.902    -- hochl
5.080    -- alexis
15.997   -- steabert
16.422   -- katrielalex
18.317   -- robert king
1257.152 -- Sven Marnach
As I said at the beginning, all the code is available in this git repository. All the merging functions are in a file called core.py; every function there whose name ends with _merge is auto-loaded during the tests, so it shouldn't be hard to add/test/improve your own solution.
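For example (purely illustrative; the exact signature and return type the harness expects are my assumptions based on the description above), a new entry in core.py could look like:

# hypothetical addition to core.py -- the trailing _merge in the name is what
# gets it picked up by the test runner
def incremental_merge(lsts):
    results = []
    for lst in lsts:
        if not lst:
            continue
        new = set(lst)
        keep = []
        for group in results:
            if group.isdisjoint(new):
                keep.append(group)   # untouched groups survive as-is
            else:
                new |= group         # fold every overlapping group into the new set
        keep.append(new)
        results = keep
    return results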
Also let me know if there's something wrong; it's been a lot of coding and I could use a couple of fresh eyes :)
Using Matrix Manipulations
Let me preface this answer with the following comment:
THIS IS THE WRONG WAY TO DO THIS. IT IS PRONE TO NUMERICAL INSTABILITY AND IS MUCH SLOWER THAN THE OTHER METHODS PRESENTED, USE AT YOUR OWN RISK.
That being said, I couldn't resist solving the problem from a dynamical point of view (and I hope you'll get a fresh perspective on the problem). In theory this should work all the time, but eigenvalue calculations can often fail. The idea is to think of your list as a flow from rows to columns. If two rows share a common value, there is a connecting flow between them. If we were to think of these flows as water, we would see that the flows cluster into little pools when there is a connecting path between them. For simplicity, I'm going to use a smaller set, though it works with your data set as well:
from numpy import where, newaxis
from scipy import linalg, array, zeros
X = [[0,1,3],[2],[3,1]]
We need to convert the data into a flow graph. If row i contains value j, we put a 1 in the matrix at position (j, i). Here we have 3 rows and 4 unique values:
A = zeros((4, len(X)), dtype=float)
for i, row in enumerate(X):
    for val in row:
        A[val, i] = 1
In general, you'll need to change the 4 to the number of unique values you have (a small sketch of how to derive it from the data follows below). If the data is a list of integers starting from 0, as here, this is simply the largest number plus one. We now perform an eigenvalue decomposition -- an SVD to be exact, since our matrix is not square.
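Here is that sketch (my own illustration, not from the original answer); it assumes the values are non-negative integers usable as row indices:

# derive the number of unique values instead of hard-coding 4
n_vals = max(max(row) for row in X if row) + 1
A = zeros((n_vals, len(X)), dtype=float)
for i, row in enumerate(X):
    for val in row:
        A[val, i] = 1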
S = linalg.svd(A)
We want to keep only the 3x3 portion of this answer, since it will represent the flow of the pools. In fact we only want the absolute values of this matrix; we only care if there is a flow in this cluster space.
M = abs(S[2])
We can think of this matrix M as a Markov matrix and make it explicit by row normalizing. Once we have this we compute the (left) eigenvalue decomp. of this matrix.
M /= M.sum(axis=1)[:,newaxis]
U,V = linalg.eig(M,left=True, right=False)
V = abs(V)
Now a disconnected (non-ergodic) Markov matrix has the nice property that, for each non-connected cluster, there is an eigenvalue of unity. The eigenvectors associated with these unity values are the ones we want:
idx = where(U > .999)[0]
C = V.T[idx] > 0
I have to use .999 due to the aforementioned numerical instability. At this point, we are done! Each independent cluster can now pull the corresponding rows out:
for cluster in C:
    print where(A[:, cluster].sum(axis=1))[0]
Which gives, as intended:
[0 1 3]
[2]
Change X to your lst and you'll get: [ 0 1 3 4 5 10 11 16] [2 8].
Addendum
Why might this be useful? I don't know where your underlying data comes from, but what happens when the connections are not absolute? Say row 1 has entry 3 only 80% of the time; how would you generalize the problem? The flow method above would still work just fine, and would be completely parametrized by that .999 value: the further away from unity it is, the looser the association.
Visual Representation
Since a picture is worth a thousand words, here are the plots of the matrices A and V for my example and for your lst, respectively. Notice how V splits into two clusters (after permutation it is a block-diagonal matrix with two blocks), since for each example there were only two unique lists!
Faster Implementation
In hindsight, I realized that you can skip the SVD step and compute only a single decomp:
from numpy import dot

M = dot(A.T, A)
M /= M.sum(axis=1)[:, newaxis]
U, V = linalg.eig(M, left=True, right=False)
The advantage of this method (besides speed) is that M is now symmetric, hence the computation can be faster and more accurate (no imaginary values to worry about).
EDIT: OK, the other question has been closed, so I'm posting here.
Nice question! It's much simpler if you think of it as a connected-components problem in a graph. The following code uses the excellent networkx graph library and the pairs function from this question.
def pairs(lst):
    i = iter(lst)
    first = prev = item = i.next()
    for item in i:
        yield prev, item
        prev = item
    yield item, first
lists = [[1, 2, 3], [3, 5, 6], [8, 9, 10], [11, 12, 13]]

import networkx

g = networkx.Graph()
for sub_list in lists:
    for edge in pairs(sub_list):
        g.add_edge(*edge)
networkx.connected_components(g)
[[1, 2, 3, 5, 6], [8, 9, 10], [11, 12, 13]]
Explanation
We create a new (empty) graph g. For each sub-list in lists, consider its elements as nodes of the graph and add an edge between them. (Since we only care about connectedness, we don't need to add all the edges -- only adjacent ones!) Note that add_edge takes two objects, treats them as nodes (and adds them if they aren't already there), and adds an edge between them.
Then, we just find the connected components of the graph -- a solved problem! -- and output them as our intersecting sets.
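As a side note (my own sketch, not part of the original answer), any spanning pattern of edges works equally well since only connectedness matters; for example, a "star" of edges from each sub-list's first element avoids the wrap-around pairs generator entirely:

import networkx

def merge_star(lists):
    g = networkx.Graph()
    for lst in lists:
        if not lst:
            continue
        g.add_node(lst[0])            # so single-element lists still show up
        for item in lst[1:]:
            g.add_edge(lst[0], item)  # connect everything to the first element
    return networkx.connected_components(g)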
Here's my answer. I haven't checked it against today's batch of answers.
The intersection-based algorithms are O(N^2) since they check each new set against all the existing ones, so I used an approach that indexes each number and runs in close to O(N) (if we accept that dictionary lookups are O(1)). Then I ran the benchmarks and felt like a complete idiot because it ran slower, but on closer inspection it turned out that the test data ends up with only a handful of distinct result sets, so the quadratic algorithms don't have a lot of work to do. Test it with more than 10-15 distinct bins and my algorithm is much faster. Try test data with more than 50 distinct bins, and it is enormously faster.
(Edit: There was also a problem with the way the benchmark is run, but I was wrong in my diagnosis. I altered my code to work with the way the repeated tests are run).
def mergelists5(data):
    """Check each number in our arrays only once, merging when we find
    a number we have seen before.
    """
    bins = range(len(data))  # Initialize each bin[n] == n
    nums = dict()

    data = [set(m) for m in data]  # Convert to sets
    for r, row in enumerate(data):
        for num in row:
            if num not in nums:
                # New number: tag it with a pointer to this row's bin
                nums[num] = r
                continue
            else:
                dest = locatebin(bins, nums[num])
                if dest == r:
                    continue  # already in the same bin

                if dest > r:
                    dest, r = r, dest  # always merge into the smallest bin

                data[dest].update(data[r])
                data[r] = None
                # Update our indices to reflect the move
                bins[r] = dest
                r = dest

    # Filter out the empty bins
    have = [m for m in data if m]
    print len(have), "groups in result"
    return have


def locatebin(bins, n):
    """
    Find the bin where list n has ended up: Follow bin references until
    we find a bin that has not moved.
    """
    while bins[n] != n:
        n = bins[n]
    return n
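As a quick sanity check (my own usage sketch, not part of the original answer):

if __name__ == "__main__":
    groups = mergelists5([[1, 2, 3], [3, 5], [7, 8], [8, 9], [42]])
    # prints "3 groups in result", then something like
    # [set([1, 2, 3, 5]), set([7, 8, 9]), set([42])]
    print groups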