Word Generator#
Scope#
This notebook was inspired by a great video by David Louapre, available on his YouTube channel “Science Étonnante”.
https://sciencetonnante.wordpress.com/2015/11/06/la-machine-a-inventer-des-mots-version-ikea/
Here the word generator is embedded in a class.
Show code cell source
# Setup
%load_ext autoreload
%matplotlib ipympl
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import IPython, io, urllib
import codecs
import re
from numpy.random import choice, seed
# Seed NumPy's global random generator with a fixed value so that the
# random draws made through numpy.random.choice are repeatable between runs.
seed(1)
The video#
# Embed the YouTube video with id "YsR7r2378j0" in the notebook output.
IPython.display.YouTubeVideo("YsR7r2378j0")
The class that manages the word generator#
class word_generator:
    """Random word generator driven by letter-trigram statistics.

    The constructor reads a dictionary file (one word per line) and counts,
    for every pair of preceding character codes ``(i, j)``, how often the
    character code ``k`` follows.  Calling the instance then draws new words
    one character at a time from those counts.

    Attributes:
        dic_file: Path of the dictionary file given to the constructor.
        encoding: Encoding used for the dictionary and for the output file.
        outfile: Path of the text file written by ``__call__``.
        dico: List of the (trimmed) words read from the dictionary.
        count: ``(256, 256, 256)`` int32 array; ``count[i, j, k]`` is the
            number of times code ``k`` followed codes ``i`` then ``j``.
            Code 0 stands for "before the start of the word" and code 10
            (newline) marks the end of a word.
        p2Da: 26x26 contrast-enhanced a-z transition matrix, set by ``plot``.
    """

    # Everything from the first of these characters onwards is dropped from a
    # dictionary line: space, slash, backslash, tab, comma, open parenthesis.
    _SEPARATORS = re.compile(r"[ /\\\t,(]")

    # Character code used as the end-of-word marker (newline).
    _END = 10

    def __init__(self, dic_file):
        """Load the dictionary ``dic_file`` and build the trigram counts.

        Args:
            dic_file: Path of a text file containing one word per line.  Any
                trailing annotation introduced by a separator character (see
                ``_SEPARATORS``) is ignored.

        Raises:
            OSError: If the dictionary file cannot be opened.
        """
        self.dic_file = dic_file
        self.encoding = "ISO-8859-1"
        # Generated words are written to this file by __call__.
        self.outfile = r"output.txt"
        self.read_dic()

    def read_dic(self):
        """Read the dictionary and fill ``self.dico`` and ``self.count``."""
        self.dico = []
        self.count = np.zeros((256, 256, 256), dtype="int32")
        with codecs.open(self.dic_file, "r", self.encoding) as lines:
            for line in lines:
                # Remove the line terminator first: otherwise a line without
                # any separator would keep its "\n" (or "\r\n") inside the
                # stored word and inside the statistics.
                word = self._SEPARATORS.split(line.rstrip("\r\n"), maxsplit=1)[0]
                if not word:
                    # Blank line, or line starting with a separator.
                    continue
                self.dico.append(word)
                # Walk the word plus its end marker, recording each
                # (two previous codes -> next code) transition.
                i, j = 0, 0
                for k in map(ord, word + "\n"):
                    self.count[i, j, k] += 1
                    i, j = j, k

    def plot(self):
        """Display the a-z letter-to-letter transition matrix.

        The trigram counts are summed over the oldest character, normalised
        per row into probabilities, and raised to a power below one for
        contrast.  The 26x26 lowercase block is stored in ``self.p2Da`` and
        drawn in a new matplotlib figure.
        """
        count2D = self.count.sum(axis=0)
        row_totals = count2D.sum(axis=1, keepdims=True)
        # Rows that were never observed stay at probability zero.
        p2D = np.divide(
            count2D,
            row_totals,
            out=np.zeros(count2D.shape, dtype=float),
            where=row_totals > 0,
        )

        # For better contrast, plot p^alpha instead of p.
        alpha = 0.33
        # Keep only the letters a to z, i.e. character codes 97 to 122.
        self.p2Da = (p2D**alpha)[97:123, 97:123]

        plt.figure(figsize=(8, 8))
        gr = plt.imshow(self.p2Da, interpolation="nearest", cmap=mpl.cm.OrRd)
        plt.axis("off")
        plt.colorbar(gr, orientation="vertical")
        # Letter labels along the left edge (rows) and the top edge (columns).
        for i in range(97, 123):
            plt.text(
                -1,
                i - 97,
                chr(i),
                horizontalalignment="center",
                verticalalignment="center",
            )
            plt.text(
                i - 97,
                -1,
                chr(i),
                horizontalalignment="center",
                verticalalignment="center",
            )

    def _draw_word(self):
        """Draw one random word (without its end marker) from the counts.

        Raises:
            ValueError: If the current two-character context was never seen,
                which happens when the dictionary contained no usable word.
        """
        i, j = 0, 0
        letters = []
        while True:
            row = self.count[i, j, :]
            total = row.sum()
            if total == 0:
                raise ValueError(
                    "no transition recorded for this context; "
                    "is the dictionary empty?"
                )
            # Normalise only the row that is needed instead of building a
            # full 256x256x256 probability array.
            k = int(choice(256, p=row / total))
            if k == self._END:
                return "".join(letters)
            letters.append(chr(k))
            i, j = j, k

    def __call__(self, smin=4, smax=12, words_per_size=5):
        """Generate random words, print them and write them to ``outfile``.

        Words are drawn until ``words_per_size`` words of each length from
        ``smin`` to ``smax`` (inclusive) have been produced; draws of any
        other length are discarded.  A word that already exists in the
        dictionary is suffixed with ``*``.

        Note that the loop only ends once every requested length has been
        produced often enough, so a length the model cannot generate makes
        the call run forever.

        Args:
            smin: Minimum number of letters of the generated words.
            smax: Maximum number of letters of the generated words.
            words_per_size: Number of words to generate for each length.

        Raises:
            ValueError: If the dictionary contained no usable word.
        """
        # Build the lookup set once: membership is tested for every word.
        known_words = set(self.dico)
        with codecs.open(self.outfile, "w", self.encoding) as f:
            for size in range(smin, smax + 1):
                total = 0
                while total < words_per_size:
                    word = self._draw_word()
                    if len(word) != size:
                        continue
                    if word in known_words:
                        word += "*"
                    total += 1
                    print(word)
                    f.write(word + "\n")
French#
# Build the French generator from its word list (the constructor reads the
# file immediately), then display the a-z letter-transition matrix.
# NOTE: the path is relative to the working directory; the recorded run below
# failed with FileNotFoundError because the file was not found there.
gen_FR = word_generator(r"_DATA/dic/FR_aisi.txt")
gen_FR.plot()
plt.show()
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[4], line 1
----> 1 gen_FR = word_generator(r"_DATA/dic/FR_aisi.txt")
2 gen_FR.plot()
3 plt.show()
Cell In[3], line 14, in word_generator.__init__(self, dic_file)
11 proba_matrix = r"matrix_FR.png"
12 self.outfile = r"output.txt"
---> 14 self.read_dic()
Cell In[3], line 20, in word_generator.read_dic(self)
17 self.dico = [] # to store the words of the dictionnary
19 self.count = np.zeros((256, 256, 256), dtype="int32")
---> 20 with codecs.open(self.dic_file, "r", self.encoding) as lines:
21 for l in lines:
22 # Trimming of the line :
23 # Split on white space, tab, slash backslah or open parenthesis
24 # and keep the first string, add EOL character
25 l2 = re.split("[ /\\\t,\(]", l)[0] + "\n"
File /opt/conda/envs/science/lib/python3.10/codecs.py:906, in open(filename, mode, encoding, errors, buffering)
902 if encoding is not None and \
903 'b' not in mode:
904 # Force opening of the file in binary mode
905 mode = mode + 'b'
--> 906 file = builtins.open(filename, mode, buffering)
907 if encoding is None:
908 return file
FileNotFoundError: [Errno 2] No such file or directory: '_DATA/dic/FR_aisi.txt'
# Generate French-looking words: each one is printed and also written to
# gen_FR.outfile; a trailing "*" marks a word found in the dictionary.
gen_FR()
Swedish#
# Same steps for Swedish: build the generator from its word list, display the
# a-z letter-transition matrix, then generate words.
# NOTE: gen_SE writes to the same default output file name ("output.txt") as
# gen_FR, so this call overwrites the French output.
gen_SE = word_generator(r"_DATA/dic/SE_aisi.txt")
gen_SE.plot()
gen_SE()
Compare French and Swedish#
# Side-by-side view of the two a-z transition matrices stored by plot().
fig = plt.figure()
panels = ((gen_FR, "French"), (gen_SE, "Swedisch"))
for position, (generator, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.imshow(generator.p2Da, interpolation="nearest", cmap=mpl.cm.OrRd)
    ax.axis("off")
    # Label the rows (left edge) and the columns (top edge) with a..z.
    for offset in range(26):
        letter = chr(97 + offset)
        plt.text(
            -1, offset, letter, horizontalalignment="center", verticalalignment="center"
        )
        plt.text(
            offset, -1, letter, horizontalalignment="center", verticalalignment="center"
        )
    plt.title(title)
plt.show()