Word Generator

Contents

Word Generator#

Scope#

This notebook was inspired by the great video by David Louapre, available on his YouTube channel “Science étonnante”.

https://sciencetonnante.wordpress.com/2015/11/06/la-machine-a-inventer-des-mots-version-ikea/

Here the word generator is embedded in a class.

The video#

# Embed the YouTube video with id "YsR7r2378j0" in the notebook output.
IPython.display.YouTubeVideo("YsR7r2378j0")

The class that manages the word generator#

class word_generator:
    """Generate random words that mimic the words of a dictionary file.

    The model counts, for every pair of consecutive characters ``(i, j)``,
    how often each character ``k`` follows them in the dictionary.  A word
    starts in the state ``(0, 0)`` and ends when the newline character
    (code 10) is drawn.

    Attributes:
        dic_file: Path of the dictionary file (one word per line).
        encoding: Encoding used to read the dictionary and write the output.
        outfile: Path of the text file written by ``__call__``.
        dico: List of the words read from the dictionary, in file order.
        count: ``(256, 256, 256)`` int32 array; ``count[i, j, k]`` is the
            number of times character code ``k`` followed codes ``i, j``.
        p2Da: ``(26, 26)`` matrix drawn by ``plot`` (only set after ``plot``).
    """

    def __init__(self, dic_file):
        """Store the file settings and load the dictionary.

        Args:
            dic_file: Path of a text file containing one word per line.  If a
                word is followed by a space, a tab, a slash, a backslash, a
                comma or an opening parenthesis, the rest of the line is
                ignored.
        """
        self.dic_file = dic_file
        self.encoding = "ISO-8859-1"

        # Name of the output text file
        self.outfile = r"output.txt"

        self.read_dic()

    def read_dic(self):
        """Read the dictionary file and fill ``self.dico`` and ``self.count``.

        Each line is stripped of its line terminator and cut at the first
        separator; empty results (blank lines) are skipped.
        """
        self.dico = []  # to store the words of the dictionary

        self.count = np.zeros((256, 256, 256), dtype="int32")
        # Raw string: space, slash, backslash, tab, comma, open parenthesis.
        separators = re.compile(r"[ /\\\t,(]")
        with codecs.open(self.dic_file, "r", self.encoding) as lines:
            for line in lines:
                # Drop the line terminator first, otherwise it would end up
                # inside the stored word and be counted as a transition.
                word = separators.split(line.rstrip("\r\n"), maxsplit=1)[0]
                if not word:
                    continue
                self.dico.append(word)
                # Walk through the word plus its end-of-word marker, starting
                # from the "beginning of word" state (0, 0).
                i, j = 0, 0
                for k in map(ord, word + "\n"):
                    self.count[i, j, k] += 1
                    i, j = j, k

    def plot(self):
        """Draw the letter-to-letter transition matrix for ``a`` to ``z``.

        The matrix is stored in ``self.p2Da`` (rows: current letter, columns:
        next letter) so that it can be reused in other figures.
        """
        count2D = self.count.sum(axis=0)
        row_totals = count2D.sum(axis=1, keepdims=True)
        # Rows that were never observed keep a probability of 0.
        p2D = np.divide(
            count2D,
            row_totals,
            out=np.zeros(count2D.shape, dtype="float"),
            where=row_totals > 0,
        )

        # For better contrast, we plot p^alpha instead of p
        alpha = 0.33
        p2Da = p2D**alpha
        # We display only letters a to z, ie character codes 97 to 122.
        self.p2Da = p2Da[97:123, 97:123]

        plt.figure(figsize=(8, 8))
        gr = plt.imshow(self.p2Da, interpolation="nearest", cmap=mpl.cm.OrRd)
        plt.axis("off")
        plt.colorbar(gr, orientation="vertical")

        # Letter labels along the left and top edges of the image.
        for i in range(97, 123):
            plt.text(
                -1,
                i - 97,
                chr(i),
                horizontalalignment="center",
                verticalalignment="center",
            )
            plt.text(
                i - 97,
                -1,
                chr(i),
                horizontalalignment="center",
                verticalalignment="center",
            )

    def _random_word(self):
        """Draw one random word (without the end-of-word marker)."""
        i, j = 0, 0
        chars = []
        while True:
            followers = self.count[i, j]
            # Normalising only the row that is needed avoids building the
            # full 256x256x256 probability array.
            k = int(np.random.choice(256, p=followers / followers.sum()))
            if k == 10:
                return "".join(chars)
            chars.append(chr(k))
            i, j = j, k

    def __call__(self, smin=4, smax=12, K=5):
        """Generate random words, print them and write them to ``self.outfile``.

        Words that already exist in the dictionary are suffixed with ``*``.

        Args:
            smin: Minimum number of letters of the generated words.
            smax: Maximum number of letters of the generated words.
            K: Number of words generated for each length.  Drawing continues
                until ``K`` words of the requested length have been obtained.

        Raises:
            ValueError: If no word was loaded from the dictionary.
        """
        if not self.count[0, 0].any():
            raise ValueError("the dictionary is empty: no word can be generated")

        # A set makes the "is it a real word?" test O(1) per generated word.
        known_words = set(self.dico)

        with codecs.open(self.outfile, "w", self.encoding) as f:
            for size in range(smin, smax + 1):
                total = 0
                while total < K:
                    word = self._random_word()
                    if len(word) != size:
                        continue
                    x = word + "*" if word in known_words else word
                    total += 1
                    print(x)
                    f.write(x + "\n")

French#

# Build the generator from the French word list, then draw and display
# its a-z transition matrix.
gen_FR = word_generator(r"_DATA/dic/FR_aisi.txt")
gen_FR.plot()
plt.show()

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[4], line 1
----> 1 gen_FR = word_generator(r"_DATA/dic/FR_aisi.txt")
      2 gen_FR.plot()
      3 plt.show()

Cell In[3], line 14, in word_generator.__init__(self, dic_file)
     11 proba_matrix = r"matrix_FR.png"
     12 self.outfile = r"output.txt"
---> 14 self.read_dic()

Cell In[3], line 20, in word_generator.read_dic(self)
     17 self.dico = []  # to store the words of the dictionnary
     19 self.count = np.zeros((256, 256, 256), dtype="int32")
---> 20 with codecs.open(self.dic_file, "r", self.encoding) as lines:
     21     for l in lines:
     22         # Trimming of the line :
     23         # Split on white space, tab, slash backslah or open parenthesis
     24         # and keep the first string, add EOL character
     25         l2 = re.split("[ /\\\t,\(]", l)[0] + "\n"

File /opt/conda/envs/science/lib/python3.10/codecs.py:906, in open(filename, mode, encoding, errors, buffering)
    902 if encoding is not None and \
    903    'b' not in mode:
    904     # Force opening of the file in binary mode
    905     mode = mode + 'b'
--> 906 file = builtins.open(filename, mode, buffering)
    907 if encoding is None:
    908     return file

FileNotFoundError: [Errno 2] No such file or directory: '_DATA/dic/FR_aisi.txt'

# Generate random words from the French statistics; they are printed and
# also written to the generator's output file.
gen_FR()

Swedish#

# Build the generator from the Swedish word list and draw its a-z
# transition matrix.
gen_SE = word_generator(r"_DATA/dic/SE_aisi.txt")
gen_SE.plot()

# Generate random words from the Swedish statistics; they are printed and
# also written to the generator's output file.
gen_SE()

Compare French and Swedish#

# Side-by-side view of the two a-z transition matrices computed by plot().
fig = plt.figure()
letters = [chr(code) for code in range(97, 123)]
panels = ((gen_FR, "French"), (gen_SE, "Swedisch"))

for position, (generator, label) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.imshow(generator.p2Da, interpolation="nearest", cmap=mpl.cm.OrRd)
    ax.axis("off")
    # Letter labels on the left edge and on the top edge of the panel.
    for offset, letter in enumerate(letters):
        plt.text(
            -1,
            offset,
            letter,
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.text(
            offset,
            -1,
            letter,
            horizontalalignment="center",
            verticalalignment="center",
        )
    plt.title(label)

plt.show()