Examples
Install:
$ pip install super-collator
Align two strings with relaxed spelling using N-Grams:
>>> from super_collator.aligner import Aligner
>>> from super_collator.ngrams import NGrams
>>> from super_collator.super_collator import to_table
>>> aligner = Aligner(-0.5, -0.5, -0.5)
>>> a = "Lorem ipsum dollar amat adipiscing elit"
>>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>>
>>> a = [NGrams(s).load(s, 3) for s in a.split()]
>>> b = [NGrams(s).load(s, 3) for s in b.split()]
>>>
>>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-"))
>>> print(
... to_table(list(map(str, a)), list(map(str, b)))
... )
- Lorem ipsum - dollar - amat - adipiscing elit
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
Multiple alignment: We repeatedly align two lists of NGrams against each other.
>>> from super_collator.aligner import Aligner
>>> from super_collator.ngrams import NGrams
>>> from super_collator.super_collator import to_table
>>>
>>> def similarity(aa, bb):
... sim = float("-inf")
... for a in aa:
... for b in bb:
... score = NGrams.similarity(a, b)
... if score > sim:
... sim = score
... return sim
...
>>>
>>> def merge(aa, bb):
... return [a + b for a, b in zip(aa, bb)]
...
>>>
>>> aligner = Aligner(-1.0, -0.5, -0.5)
>>> a = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>> b = "Lorem ipsum adipiscing"
>>> c = "Lorem dollar amat elit"
>>>
>>> a = [[NGrams(s).load(s, 2)] for s in a.split()]
>>> b = [[NGrams(s).load(s, 2)] for s in b.split()]
>>> c = [[NGrams(s).load(s, 2)] for s in c.split()]
>>>
>>> a, b, score = aligner.align(
... a, b, similarity, lambda: [NGrams("-")], lambda: [NGrams("-")]
... )
>>> ab = merge(a, b)
>>> ab, c, score = aligner.align(
... ab, c, similarity, lambda: [NGrams("-")] * 2, lambda: [NGrams("-")]
... )
>>> abc = merge(ab, c)
>>>
>>> print(
... to_table(*zip(*[[t.user_data for t in nn] for nn in abc]))
... )
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
- Lorem ipsum - - - - - adipiscing -
- Lorem - - dollar - amat - - elit
Align two sentences using their part-of-speech tags only:
>>> from super_collator.aligner import Aligner
>>> from super_collator.super_collator import to_table
>>>
>>> class PosToken:
... def __init__(self, s, pos):
... self.s = s
... self.pos = pos
...
... def __str__(self):
... return self.s
...
... @staticmethod
... def similarity(a, b):
... return 1.0 if a.pos == b.pos else 0.0
...
>>>
>>> aligner = Aligner()
>>> a = "it/PRP was/VBD a/DT dark/JJ and/CC stormy/JJ night/NN"
>>> b = "it/PRP is/VBZ a/DT fine/JJ day/NN"
>>>
>>> a = [PosToken(*s.split("/")) for s in a.split()]
>>> b = [PosToken(*s.split("/")) for s in b.split()]
>>>
>>> c, d, score = aligner.align(a, b, PosToken.similarity, lambda: PosToken("-", ""))
>>> print(
... to_table(list(map(str, c)), list(map(str, d)))
... )
it was a dark and stormy night
it is a fine - - day