From ec8c016767ae57a74a40a1cf3e56566c72d41272 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Sun, 30 Jul 2017 04:59:25 +0530 Subject: [PATCH 01/24] added download and catalogue functions --- gensim/__main__.py | 31 +++++++++++++++++++++++++++++++ gensim/api/__init__.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 gensim/__main__.py create mode 100644 gensim/api/__init__.py diff --git a/gensim/__main__.py b/gensim/__main__.py new file mode 100644 index 0000000000..b07862288b --- /dev/null +++ b/gensim/__main__.py @@ -0,0 +1,31 @@ +from __future__ import print_function +from __future__ import absolute_import +import sys +import argparse +from .api import download +from .api import catalogue +if __name__ == '__main__': + parser = argparse.ArgumentParser(description = "Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument("-d" ,"--download", nargs = 1,help = "To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-l", "--link", nargs = 2,help=" To store a shortcut to a corpus/model : python -m gensim -l source destination") + group.add_argument("-i", "--info",nargs = 1, help = "To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-c","--catalogue", help ="To get the list of all models/corpus stored : python -m gensim -c",action="store_true") + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + + elif sys.argv[1] == "-l" or sys.argv[1] =="--link": + link(sys.argv[2],sys.argv[3]) + + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() + + + + + + diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py new file mode 100644 index 0000000000..7f4ff7ed4b --- /dev/null +++ b/gensim/api/__init__.py @@ -0,0 +1,29 @@ +import subprocess +import json +import sys +import os +try: + from urllib.request import urlopen +except ImportError: + from urllib2 import urlopen +def download(file): + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" + data = catalogue() + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file not in corpuses and file not in models: + print("Incorrect corpus/model name.") + else: + url = url+file+"/"+file+".tar.gz" + print("Downloading {m}".format(m=file)) + subprocess.call([sys.executable, '-m','pip','install','--no-cache-dir',url],env = os.environ.copy()) + +def catalogue(print_list = 0): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + if print_list == 1 : + print(json.loads(data)) + else: + return json.loads(data) + From 636bfffb82233564e7f6fe6febf4ed97d0f713c6 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Mon, 31 Jul 2017 21:17:58 +0530 Subject: [PATCH 02/24] added link and info --- gensim/__main__.py | 2 ++ gensim/api/__init__.py | 51 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index b07862288b..8f82c8089a 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -4,6 +4,8 @@ import argparse from .api import download from .api import catalogue +from .api import link +from .api import info if __name__ == '__main__': parser = argparse.ArgumentParser(description = "Gensim console API") group = parser.add_mutually_exclusive_group() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 7f4ff7ed4b..d0e1e5ea2b 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -2,13 +2,17 @@ import json import sys import os +import pip +import importlib +from pathlib import Path +import os try: from urllib.request import urlopen except ImportError: from urllib2 import urlopen def download(file): url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" - data = catalogue() + data = catalogue(False) corpuses = data['gensim']['corpus'] models = data['gensim']['model'] if file not in corpuses and file not in models: @@ -18,12 +22,51 @@ def download(file): print("Downloading {m}".format(m=file)) subprocess.call([sys.executable, '-m','pip','install','--no-cache-dir',url],env = os.environ.copy()) -def catalogue(print_list = 0): +def catalogue(print_list = True): url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" response = urlopen(url) data = response.read().decode("utf-8") if print_list == 1 : - print(json.loads(data)) + data = json.loads(data) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) else: return json.loads(data) - + +def link(file_name, shortcut_name): + packages = pip.get_installed_distributions() + package_is_installed = 0 + for package in packages: + if package.project_name == file_name: + package_is_installed = 1 + break + if package_is_installed == 0: + print("The model/corpus {f} has not been installed. Please install it using : python -m gensim -d {f}".format(f=file_name)) + package = importlib.import_module(file_name) + package_path = Path(package.__file__).parent.parent + package_path = package_path / file_name / file_name + gensim_path = importlib.import_module("gensim") + gensim_path = Path(gensim_path.__file__).parent + shortcut_path = gensim_path / 'data' / shortcut_name + + try: + os.symlink(str(package_path),str(shortcut_path)) + except: + print("Shortcut creation failed in gensim/data.") + print("Shortcut creation successful. The model/corpus can now be found in gensim/data.") + +def info(file_name): + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) \ No newline at end of file From fffe203303c57966bd725450773a64c922413c51 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 1 Aug 2017 17:08:21 +0530 Subject: [PATCH 03/24] modeified link and info functions --- gensim/__main__.py | 53 ++++++++++-------- gensim/api/__init__.py | 121 +++++++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 77 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 8f82c8089a..48f299b21e 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -7,27 +7,32 @@ from .api import link from .api import info if __name__ == '__main__': - parser = argparse.ArgumentParser(description = "Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument("-d" ,"--download", nargs = 1,help = "To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-l", "--link", nargs = 2,help=" To store a shortcut to a corpus/model : python -m gensim -l source destination") - group.add_argument("-i", "--info",nargs = 1, help = "To get information about a corpus/model : python -m gensim -i model/corpus name") - group.add_argument("-c","--catalogue", help ="To get the list of all models/corpus stored : python -m gensim -c",action="store_true") - args = parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - - elif sys.argv[1] == "-l" or sys.argv[1] =="--link": - link(sys.argv[2],sys.argv[3]) - - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() - - - - - - + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument( + "-d", "--download", nargs=1, + help="To download a corpus/model : python -m gensim -d " + "corpus/model name") + group.add_argument( + "-l", "--link", nargs=2, + help="To store a shortcut to a corpus/model : python -m gensim -l " + "source destination") + group.add_argument( + "-i", "--info", nargs=1, + help="To get information about a corpus/model : python -m gensim -i " + "model/corpus name") + group.add_argument( + "-c", "--catalogue", help="To get the list of all models/corpus stored" + " : python -m gensim -c", action="store_true") + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + + elif sys.argv[1] == "-l" or sys.argv[1] == "--link": + link(sys.argv[2], sys.argv[3]) + + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index d0e1e5ea2b..7c64011d52 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -10,63 +10,78 @@ from urllib.request import urlopen except ImportError: from urllib2 import urlopen + + def download(file): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file not in corpuses and file not in models: - print("Incorrect corpus/model name.") - else: - url = url+file+"/"+file+".tar.gz" - print("Downloading {m}".format(m=file)) - subprocess.call([sys.executable, '-m','pip','install','--no-cache-dir',url],env = os.environ.copy()) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file not in corpuses and file not in models: + print("Incorrect corpus/model name.") + else: + url = url+file+"/"+file+".tar.gz" + print("Downloading {m}".format(m=file)) + subprocess.call( + [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', url], + env=os.environ.copy()) + + +def catalogue(print_list=True): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + if print_list == 1: + data = json.loads(data) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) + else: + return json.loads(data) -def catalogue(print_list = True): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" - response = urlopen(url) - data = response.read().decode("utf-8") - if print_list == 1 : - data = json.loads(data) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - else: - return json.loads(data) def link(file_name, shortcut_name): - packages = pip.get_installed_distributions() - package_is_installed = 0 - for package in packages: - if package.project_name == file_name: - package_is_installed = 1 - break - if package_is_installed == 0: - print("The model/corpus {f} has not been installed. Please install it using : python -m gensim -d {f}".format(f=file_name)) - package = importlib.import_module(file_name) - package_path = Path(package.__file__).parent.parent - package_path = package_path / file_name / file_name - gensim_path = importlib.import_module("gensim") - gensim_path = Path(gensim_path.__file__).parent - shortcut_path = gensim_path / 'data' / shortcut_name + packages = pip.get_installed_distributions() + package_is_installed = 0 + for package in packages: + if package.project_name == file_name: + package_is_installed = 1 + break + if package_is_installed == 0: + print("The model/corpus {f} has not been installed".format(f=file_name)) + print("For installing use: python -m gensim -d {f}".format(f=file_name)) + sys.exit(0) + package = importlib.import_module(file_name) + package_path = Path(package.__file__).parent.parent + package_path = package_path / file_name / file_name + gensim_path = importlib.import_module("gensim") + gensim_path = Path(gensim_path.__file__).parent + shortcut_path = gensim_path / 'data' / shortcut_name + if os.path.exists(str(shortcut_path)): + print("This shortcut link already exists.") + sys.exit(0) + try: + os.symlink(str(package_path), str(shortcut_path)) + except: + print("Shortcut creation failed in gensim/data.") + sys.exit(0) + print("Shortcut creation successful. The model/corpus can now be found" + " in gensim/data.") - try: - os.symlink(str(package_path),str(shortcut_path)) - except: - print("Shortcut creation failed in gensim/data.") - print("Shortcut creation successful. The model/corpus can now be found in gensim/data.") def info(file_name): - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) \ No newline at end of file + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) + else: + print("Incorrect model/corpus name.") From f567dee022be5aff13d8ca595f151e1cc67da464 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Fri, 4 Aug 2017 15:18:12 +0530 Subject: [PATCH 04/24] Updated download function --- gensim/__main__.py | 10 ------ gensim/api/__init__.py | 72 +++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 57 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 48f299b21e..33236f6ccb 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -4,7 +4,6 @@ import argparse from .api import download from .api import catalogue -from .api import link from .api import info if __name__ == '__main__': parser = argparse.ArgumentParser(description="Gensim console API") @@ -13,10 +12,6 @@ "-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d " "corpus/model name") - group.add_argument( - "-l", "--link", nargs=2, - help="To store a shortcut to a corpus/model : python -m gensim -l " - "source destination") group.add_argument( "-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i " @@ -27,12 +22,7 @@ args = parser.parse_args() if sys.argv[1] == "-d" or sys.argv[1] == "--download": download(sys.argv[2]) - - elif sys.argv[1] == "-l" or sys.argv[1] == "--link": - link(sys.argv[2], sys.argv[3]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 7c64011d52..557df6c46e 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -1,11 +1,14 @@ -import subprocess +from __future__ import print_function import json import sys import os -import pip -import importlib -from pathlib import Path -import os +import six +import tarfile +try: + import urllib.request as urllib +except ImportError: + import urllib + try: from urllib.request import urlopen except ImportError: @@ -13,22 +16,26 @@ def download(file): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file not in corpuses and file not in models: - print("Incorrect corpus/model name.") - else: - url = url+file+"/"+file+".tar.gz" - print("Downloading {m}".format(m=file)) - subprocess.call( - [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', url], - env=os.environ.copy()) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/" + url = url+"download/"+file+"/"+file+".tar.gz" + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + extracted_folder_dir = os.path.join(base_dir, file) + if not os.path.exists(base_dir): + os.makedirs(base_dir) + compressed_folder_name = file+".tar.gz" + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + urllib.urlretrieve(url, compressed_folder_dir) + if not os.path.exists(extracted_folder_dir): + os.makedirs(extracted_folder_dir) + tar = tarfile.open(compressed_folder_dir) + tar.extractall(extracted_folder_dir) + tar.close() def catalogue(print_list=True): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/" + url = url + "master/list.json" response = urlopen(url) data = response.read().decode("utf-8") if print_list == 1: @@ -46,35 +53,6 @@ def catalogue(print_list=True): return json.loads(data) -def link(file_name, shortcut_name): - packages = pip.get_installed_distributions() - package_is_installed = 0 - for package in packages: - if package.project_name == file_name: - package_is_installed = 1 - break - if package_is_installed == 0: - print("The model/corpus {f} has not been installed".format(f=file_name)) - print("For installing use: python -m gensim -d {f}".format(f=file_name)) - sys.exit(0) - package = importlib.import_module(file_name) - package_path = Path(package.__file__).parent.parent - package_path = package_path / file_name / file_name - gensim_path = importlib.import_module("gensim") - gensim_path = Path(gensim_path.__file__).parent - shortcut_path = gensim_path / 'data' / shortcut_name - if os.path.exists(str(shortcut_path)): - print("This shortcut link already exists.") - sys.exit(0) - try: - os.symlink(str(package_path), str(shortcut_path)) - except: - print("Shortcut creation failed in gensim/data.") - sys.exit(0) - print("Shortcut creation successful. The model/corpus can now be found" - " in gensim/data.") - - def info(file_name): data = catalogue(False) corpuses = data['gensim']['corpus'] From 61ba3d62b86ed1d1f017214e8a81cdc4df4a98b8 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Mon, 7 Aug 2017 11:25:05 +0530 Subject: [PATCH 05/24] Added logging --- gensim/api/__init__.py | 51 +++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 557df6c46e..099792fa5b 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -1,9 +1,11 @@ from __future__ import print_function + +import logging import json -import sys import os -import six import tarfile +import shutil +from ..utils import SaveLoad try: import urllib.request as urllib except ImportError: @@ -14,23 +16,52 @@ except ImportError: from urllib2 import urlopen +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', filename="api.log", level=logging.INFO) +console = logging.StreamHandler() +console.setLevel(logging.INFO) +logging.getLogger('').addHandler(console) + -def download(file): +def download(file_name): url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/" - url = url+"download/"+file+"/"+file+".tar.gz" + url = url + "download/" + file_name + "/" + file_name + ".tar.gz" user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') - extracted_folder_dir = os.path.join(base_dir, file) + extracted_folder_dir = os.path.join(base_dir, file_name) if not os.path.exists(base_dir): + logging.info("Creating {}".format(base_dir)) os.makedirs(base_dir) - compressed_folder_name = file+".tar.gz" + if os.path.exists(base_dir): + logging.info("Creation successful. Models/corpus can be accessed" + "via {}".format(base_dir)) + else: + logging.error("Not able to create {d}. Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it manually".format(d=base_dir)) + compressed_folder_name = file_name + ".tar.gz" compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - urllib.urlretrieve(url, compressed_folder_dir) if not os.path.exists(extracted_folder_dir): + logging.info("Downloading {}".format(file_name)) + urllib.urlretrieve(url, compressed_folder_dir) + logging.info("{} downloaded".format(file_name)) + logging.info("Creating {}".format(extracted_folder_dir)) os.makedirs(extracted_folder_dir) - tar = tarfile.open(compressed_folder_dir) - tar.extractall(extracted_folder_dir) - tar.close() + if os.path.exists(extracted_folder_dir): + logging.info("Creation of {} successful" + ".".format(extracted_folder_dir)) + tar = tarfile.open(compressed_folder_dir) + logging.info("Extracting files from" + "{}".format(extracted_folder_dir)) + tar.extractall(extracted_folder_dir) + tar.close() + logging.info("{} installed".format(file_name)) + else: + logging.error("Not able to create {d}. Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it " + "manually".format(d=extracted_folder_dir)) + else: + print("{} has already been installed".format(file_name)) def catalogue(print_list=True): From d8257a387237208fa5fe5add7380614e079e78f0 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Fri, 11 Aug 2017 11:19:19 +0530 Subject: [PATCH 06/24] Added load function --- gensim/__main__.py | 35 +++++++++++--------- gensim/api/__init__.py | 74 +++++++++++++++++++++++++++--------------- 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 33236f6ccb..5ea8e43000 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -2,27 +2,32 @@ from __future__ import absolute_import import sys import argparse -from .api import download -from .api import catalogue -from .api import info +import logging +from gensim.api import download +from gensim.api import catalogue +from gensim.api import info if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument( + logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', filename="api.log", level=logging.INFO) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + logging.getLogger('').addHandler(console) + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument( "-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d " "corpus/model name") - group.add_argument( + group.add_argument( "-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i " "model/corpus name") - group.add_argument( + group.add_argument( "-c", "--catalogue", help="To get the list of all models/corpus stored" " : python -m gensim -c", action="store_true") - args = parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 099792fa5b..0a70fc2b39 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -1,11 +1,12 @@ from __future__ import print_function - -import logging +from __future__ import absolute_import import json import os import tarfile import shutil -from ..utils import SaveLoad +import logging +import sys +import importlib try: import urllib.request as urllib except ImportError: @@ -16,10 +17,7 @@ except ImportError: from urllib2 import urlopen -logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', filename="api.log", level=logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -logging.getLogger('').addHandler(console) +logger = logging.getLogger('gensim.api') def download(file_name): @@ -29,37 +27,42 @@ def download(file_name): base_dir = os.path.join(user_dir, 'gensim-data') extracted_folder_dir = os.path.join(base_dir, file_name) if not os.path.exists(base_dir): - logging.info("Creating {}".format(base_dir)) + logger.info("Creating {}".format(base_dir)) os.makedirs(base_dir) if os.path.exists(base_dir): - logging.info("Creation successful. Models/corpus can be accessed" - "via {}".format(base_dir)) + logger.info( + "Creation successful. Models/corpus can be accessed" + "via {}".format(base_dir)) else: - logging.error("Not able to create {d}. Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it manually".format(d=base_dir)) + logger.error( + "Not able to create {d}. Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it manually".format(d=base_dir)) compressed_folder_name = file_name + ".tar.gz" compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) if not os.path.exists(extracted_folder_dir): - logging.info("Downloading {}".format(file_name)) + logger.info("Downloading {}".format(file_name)) urllib.urlretrieve(url, compressed_folder_dir) - logging.info("{} downloaded".format(file_name)) - logging.info("Creating {}".format(extracted_folder_dir)) + logger.info("{} downloaded".format(file_name)) + logger.info("Creating {}".format(extracted_folder_dir)) os.makedirs(extracted_folder_dir) if os.path.exists(extracted_folder_dir): - logging.info("Creation of {} successful" - ".".format(extracted_folder_dir)) + logger.info( + "Creation of {} successful" + ".".format(extracted_folder_dir)) tar = tarfile.open(compressed_folder_dir) - logging.info("Extracting files from" - "{}".format(extracted_folder_dir)) + logger.info( + "Extracting files from" + "{}".format(extracted_folder_dir)) tar.extractall(extracted_folder_dir) tar.close() - logging.info("{} installed".format(file_name)) + logger.info("{} installed".format(file_name)) else: - logging.error("Not able to create {d}. Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it " - "manually".format(d=extracted_folder_dir)) + logger.error( + "Not able to create {d}. Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it " + "manually".format(d=extracted_folder_dir)) else: print("{} has already been installed".format(file_name)) @@ -69,7 +72,7 @@ def catalogue(print_list=True): url = url + "master/list.json" response = urlopen(url) data = response.read().decode("utf-8") - if print_list == 1: + if print_list: data = json.loads(data) corpuses = data['gensim']['corpus'] models = data['gensim']['model'] @@ -94,3 +97,22 @@ def info(file_name): print(data['gensim']['model'][file_name]) else: print("Incorrect model/corpus name.") + + +def load(file_name, return_path=False): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + folder_dir = os.path.join(base_dir, file_name) + if not os.path.exists(folder_dir): + print( + "Incorrect model/corpus name. Use catalogue() to get a list of " + "avalible models/corpus. If the model/corpus name you entered is" + " in the catalogue, then please download the model/corpus by " + "calling download({f}) function".format(f=file_name)) + elif return_path: + return folder_dir + else: + sys.path.insert(0, base_dir) + module = __import__(file_name) + data = module.load_data() + return data From 55714696a233a3aa4f2a025880b7cf60fe18cb67 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Fri, 11 Aug 2017 12:11:10 +0530 Subject: [PATCH 07/24] Removed unused imports --- gensim/__main__.py | 34 ++++++++++++++++++---------------- gensim/api/__init__.py | 5 ++--- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 5ea8e43000..c6db55ab27 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -7,27 +7,29 @@ from gensim.api import catalogue from gensim.api import info if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', filename="api.log", level=logging.INFO) - console = logging.StreamHandler() - console.setLevel(logging.INFO) - logging.getLogger('').addHandler(console) - parser = argparse.ArgumentParser(description="Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument( + logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + filename="api.log", level=logging.INFO) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + logging.getLogger('').addHandler(console) + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument( "-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d " "corpus/model name") - group.add_argument( + group.add_argument( "-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i " "model/corpus name") - group.add_argument( + group.add_argument( "-c", "--catalogue", help="To get the list of all models/corpus stored" " : python -m gensim -c", action="store_true") - args = parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 0a70fc2b39..990e90afcb 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -3,10 +3,9 @@ import json import os import tarfile -import shutil import logging import sys -import importlib + try: import urllib.request as urllib except ImportError: @@ -108,7 +107,7 @@ def load(file_name, return_path=False): "Incorrect model/corpus name. Use catalogue() to get a list of " "avalible models/corpus. If the model/corpus name you entered is" " in the catalogue, then please download the model/corpus by " - "calling download({f}) function".format(f=file_name)) + "calling download('{f}') function".format(f=file_name)) elif return_path: return folder_dir else: From cabf173a3b5f80465ef8ba3dab452fe8d197f45f Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Thu, 24 Aug 2017 00:17:28 +0530 Subject: [PATCH 08/24] added check for installed models --- gensim/__main__.py | 35 ------ gensim/api/__init__.py | 231 +++++++++++++++++++++++---------------- gensim/api/downloader.py | 19 ++++ 3 files changed, 158 insertions(+), 127 deletions(-) delete mode 100644 gensim/__main__.py create mode 100644 gensim/api/downloader.py diff --git a/gensim/__main__.py b/gensim/__main__.py deleted file mode 100644 index c6db55ab27..0000000000 --- a/gensim/__main__.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -import sys -import argparse -import logging -from gensim.api import download -from gensim.api import catalogue -from gensim.api import info -if __name__ == '__main__': - logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - filename="api.log", level=logging.INFO) - console = logging.StreamHandler() - console.setLevel(logging.INFO) - logging.getLogger('').addHandler(console) - parser = argparse.ArgumentParser(description="Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument( - "-d", "--download", nargs=1, - help="To download a corpus/model : python -m gensim -d " - "corpus/model name") - group.add_argument( - "-i", "--info", nargs=1, - help="To get information about a corpus/model : python -m gensim -i " - "model/corpus name") - group.add_argument( - "-c", "--catalogue", help="To get the list of all models/corpus stored" - " : python -m gensim -c", action="store_true") - args = parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 990e90afcb..334f336078 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -5,113 +5,160 @@ import tarfile import logging import sys - +import shutil +import errno try: - import urllib.request as urllib + import urllib.request as urllib except ImportError: - import urllib + import urllib try: - from urllib.request import urlopen + from urllib.request import urlopen except ImportError: - from urllib2 import urlopen + from urllib2 import urlopen + + +user_dir = os.path.expanduser('~') +base_dir = os.path.join(user_dir, 'gensim-data') +log_file_dir = os.path.join(base_dir, 'api.log') +if not os.path.isdir(base_dir): + try: + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) +logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + filename=log_file_dir, level=logging.INFO) +console = logging.StreamHandler() +console.setLevel(logging.INFO) logger = logging.getLogger('gensim.api') +logger.addHandler(console) def download(file_name): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/" - url = url + "download/" + file_name + "/" + file_name + ".tar.gz" - user_dir = os.path.expanduser('~') - base_dir = os.path.join(user_dir, 'gensim-data') - extracted_folder_dir = os.path.join(base_dir, file_name) - if not os.path.exists(base_dir): - logger.info("Creating {}".format(base_dir)) - os.makedirs(base_dir) - if os.path.exists(base_dir): - logger.info( - "Creation successful. Models/corpus can be accessed" - "via {}".format(base_dir)) - else: - logger.error( - "Not able to create {d}. Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it manually".format(d=base_dir)) - compressed_folder_name = file_name + ".tar.gz" - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - if not os.path.exists(extracted_folder_dir): - logger.info("Downloading {}".format(file_name)) - urllib.urlretrieve(url, compressed_folder_dir) - logger.info("{} downloaded".format(file_name)) - logger.info("Creating {}".format(extracted_folder_dir)) - os.makedirs(extracted_folder_dir) - if os.path.exists(extracted_folder_dir): - logger.info( - "Creation of {} successful" - ".".format(extracted_folder_dir)) - tar = tarfile.open(compressed_folder_dir) - logger.info( - "Extracting files from" - "{}".format(extracted_folder_dir)) - tar.extractall(extracted_folder_dir) - tar.close() - logger.info("{} installed".format(file_name)) - else: - logger.error( - "Not able to create {d}. Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it " - "manually".format(d=extracted_folder_dir)) - else: - print("{} has already been installed".format(file_name)) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) + data_folder_dir = os.path.join(base_dir, file_name) + data = catalogue(print_list=False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name not in corpuses and file_name not in models: + logger.error( + "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" + " python -m gensim -c to get a list of models/corpuses" + " available.") + sys.exit(0) + compressed_folder_name = "{f}.tar.gz".format(f=file_name) + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + is_installed = False + is_downloaded = False + installed_message = "{f} installed".format(f=file_name) + downloaded_message = "{f} downloaded".format(f=file_name) + if os.path.exists(data_folder_dir): + log_file_dir = os.path.join(base_dir, 'api.log') + with open(log_file_dir) as f: + f = f.readlines() + for line in f: + if installed_message in line: + print("{} has already been installed".format(file_name)) + is_installed = True + sys.exit(0) + if os.path.exists(data_folder_dir) and not is_installed: + shutil.rmtree(data_folder_dir) + for line in f: + if downloaded_message in line: + is_downloaded = True + break + if not is_downloaded: + os.makedirs(data_folder_dir) + logger.info("Downloading %s", file_name) + urllib.urlretrieve(url, compressed_folder_dir) + data_url = data_links(file_name) + if data_url is not None: + index = data_url.rfind("/") + data_dir = os.path.join(data_folder_dir, data_url[index+1:]) + urllib.urlretrieve(data_url, data_dir) + logger.info("%s downloaded", file_name) + if not is_installed: + logger.info("Creating %s", data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of %s successful.", data_folder_dir) + tar = tarfile.open(compressed_folder_dir) + logger.info("Extracting files from %s", data_folder_dir) + tar.extractall(data_folder_dir) + tar.close() + logger.info("%s installed", file_name) + else: + logger.error( + "Not able to create %s. Make sure you have the correct read/" + "write permissions for %s or you can try creating it manually", + data_folder_dir, base_dir) -def catalogue(print_list=True): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/" - url = url + "master/list.json" - response = urlopen(url) - data = response.read().decode("utf-8") - if print_list: - data = json.loads(data) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - else: - return json.loads(data) +def catalogue(print_list=False): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if print_list: + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) + return data def info(file_name): - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) - else: - print("Incorrect model/corpus name.") + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) + else: + catalogue(print_list=True) + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "above.") def load(file_name, return_path=False): - user_dir = os.path.expanduser('~') - base_dir = os.path.join(user_dir, 'gensim-data') - folder_dir = os.path.join(base_dir, file_name) - if not os.path.exists(folder_dir): - print( - "Incorrect model/corpus name. Use catalogue() to get a list of " - "avalible models/corpus. If the model/corpus name you entered is" - " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=file_name)) - elif return_path: - return folder_dir - else: - sys.path.insert(0, base_dir) - module = __import__(file_name) - data = module.load_data() - return data + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + folder_dir = os.path.join(base_dir, file_name) + if not os.path.exists(folder_dir): + raise Exception( + "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " + "avalible models/corpus. If the model/corpus name you entered is" + " in the catalogue, then please download the model/corpus by " + "calling download('{f}') function".format(f=file_name)) + elif return_path: + return folder_dir + else: + sys.path.insert(0, base_dir) + module = __import__(file_name) + data = module.load_data() + return data + + +def data_links(file_name): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if file_name in data['data_links']: + return data['data_links'][file_name]['link'] diff --git a/gensim/api/downloader.py b/gensim/api/downloader.py new file mode 100644 index 0000000000..834628fe7f --- /dev/null +++ b/gensim/api/downloader.py @@ -0,0 +1,19 @@ +from __future__ import print_function +from __future__ import absolute_import +import argparse +from gensim.api import download +from gensim.api import catalogue +from gensim.api import info +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument("-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") + args = parser.parse_args() + if args.download is not None: + download(args.download[0]) + elif args.info is not None: + info(args.info[0]) + elif args.catalogue is not None: + catalogue(print_list=True) From 5d509fc92476b1fb621d25e5637a298a32c1ca15 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Sun, 27 Aug 2017 19:43:53 +0530 Subject: [PATCH 09/24] updated download function --- gensim/api/__init__.py | 246 ++++++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 124 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 334f336078..370af50573 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -5,39 +5,38 @@ import tarfile import logging import sys -import shutil import errno try: - import urllib.request as urllib + import urllib.request as urllib except ImportError: - import urllib + import urllib try: - from urllib.request import urlopen + from urllib.request import urlopen except ImportError: - from urllib2 import urlopen + from urllib2 import urlopen user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') log_file_dir = os.path.join(base_dir, 'api.log') if not os.path.isdir(base_dir): - try: - os.makedirs(base_dir) - except OSError as e: - if e.errno == errno.EEXIST: - raise Exception( - "Not able to create folder gensim-data in {}. File gensim-data " - "exists in the direcory already.".format(user_dir)) - else: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the folder manually" - .format(base_dir)) + try: + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - filename=log_file_dir, level=logging.INFO) + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + filename=log_file_dir, level=logging.INFO) console = logging.StreamHandler() console.setLevel(logging.INFO) logger = logging.getLogger('gensim.api') @@ -45,120 +44,119 @@ def download(file_name): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) - data_folder_dir = os.path.join(base_dir, file_name) - data = catalogue(print_list=False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name not in corpuses and file_name not in models: - logger.error( - "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" - " python -m gensim -c to get a list of models/corpuses" - " available.") - sys.exit(0) - compressed_folder_name = "{f}.tar.gz".format(f=file_name) - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - is_installed = False - is_downloaded = False - installed_message = "{f} installed".format(f=file_name) - downloaded_message = "{f} downloaded".format(f=file_name) - if os.path.exists(data_folder_dir): - log_file_dir = os.path.join(base_dir, 'api.log') - with open(log_file_dir) as f: - f = f.readlines() - for line in f: - if installed_message in line: - print("{} has already been installed".format(file_name)) - is_installed = True - sys.exit(0) - if os.path.exists(data_folder_dir) and not is_installed: - shutil.rmtree(data_folder_dir) - for line in f: - if downloaded_message in line: - is_downloaded = True - break - if not is_downloaded: - os.makedirs(data_folder_dir) - logger.info("Downloading %s", file_name) - urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(file_name) - if data_url is not None: - index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index+1:]) - urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", file_name) - if not is_installed: - logger.info("Creating %s", data_folder_dir) - if os.path.exists(data_folder_dir): - logger.info("Creation of %s successful.", data_folder_dir) - tar = tarfile.open(compressed_folder_dir) - logger.info("Extracting files from %s", data_folder_dir) - tar.extractall(data_folder_dir) - tar.close() - logger.info("%s installed", file_name) - else: - logger.error( - "Not able to create %s. Make sure you have the correct read/" - "write permissions for %s or you can try creating it manually", - data_folder_dir, base_dir) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) + data_folder_dir = os.path.join(base_dir, file_name) + data = catalogue(print_list=False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name not in corpuses and file_name not in models: + logger.error( + "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" + " python -m gensim -c to get a list of models/corpuses" + " available.") + sys.exit(0) + compressed_folder_name = "{f}.tar.gz".format(f=file_name) + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + is_installed = False + is_downloaded = False + installed_message = "{f} installed".format(f=file_name) + downloaded_message = "{f} downloaded".format(f=file_name) + if os.path.exists(data_folder_dir): + log_file_dir = os.path.join(base_dir, 'api.log') + with open(log_file_dir) as f: + f = f.readlines() + for line in f: + if installed_message in line: + print("{} has already been installed".format(file_name)) + is_installed = True + sys.exit(0) + if os.path.exists(data_folder_dir) and not is_installed: + for line in f: + if downloaded_message in line: + is_downloaded = True + break + if not is_downloaded: + if not os.path.exists(data_folder_dir): + logger.info("Creating %s", data_folder_dir) + os.makedirs(data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of %s successful.", data_folder_dir) + else: + logger.error( + "Not able to create %s. Make sure you have the correct read/" + "write permissions for %s or you can try creating it manually", + data_folder_dir, base_dir) + sys.exit(0) + logger.info("Downloading %s", file_name) + urllib.urlretrieve(url, compressed_folder_dir) + data_url = data_links(file_name) + if data_url is not None: + index = data_url.rfind("/") + data_dir = os.path.join(data_folder_dir, data_url[index+1:]) + urllib.urlretrieve(data_url, data_dir) + logger.info("%s downloaded", file_name) + if not is_installed: + tar = tarfile.open(compressed_folder_dir) + logger.info("Extracting files from %s", data_folder_dir) + tar.extractall(data_folder_dir) + tar.close() + logger.info("%s installed", file_name) def catalogue(print_list=False): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if print_list: - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - return data + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if print_list: + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) + return data def info(file_name): - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) - else: - catalogue(print_list=True) - raise Exception( - "Incorrect model/corpus name. Choose the model/corpus from the list " - "above.") + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) + else: + catalogue(print_list=True) + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "above.") def load(file_name, return_path=False): - user_dir = os.path.expanduser('~') - base_dir = os.path.join(user_dir, 'gensim-data') - folder_dir = os.path.join(base_dir, file_name) - if not os.path.exists(folder_dir): - raise Exception( - "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " - "avalible models/corpus. If the model/corpus name you entered is" - " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=file_name)) - elif return_path: - return folder_dir - else: - sys.path.insert(0, base_dir) - module = __import__(file_name) - data = module.load_data() - return data + folder_dir = os.path.join(base_dir, file_name) + if not os.path.exists(folder_dir): + raise Exception( + "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " + "avalible models/corpus. If the model/corpus name you entered is" + " in the catalogue, then please download the model/corpus by " + "calling download('{f}') function".format(f=file_name)) + elif return_path: + return folder_dir + else: + sys.path.insert(0, base_dir) + module = __import__(file_name) + data = module.load_data() + return data def data_links(file_name): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if file_name in data['data_links']: - return data['data_links'][file_name]['link'] + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if file_name in data['data_links']: + return data['data_links'][file_name]['link'] From 551f54ef3f9ae53cb973cba16d0a6627dcdd674e Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Mon, 4 Sep 2017 22:11:32 +0530 Subject: [PATCH 10/24] Improved help for terminal --- gensim/api/__init__.py | 2 +- gensim/api/downloader.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 370af50573..15a3359936 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -92,7 +92,7 @@ def download(file_name): data_url = data_links(file_name) if data_url is not None: index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index+1:]) + data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) logger.info("%s downloaded", file_name) if not is_installed: diff --git a/gensim/api/downloader.py b/gensim/api/downloader.py index 834628fe7f..0accff9f8c 100644 --- a/gensim/api/downloader.py +++ b/gensim/api/downloader.py @@ -5,10 +5,10 @@ from gensim.api import catalogue from gensim.api import info if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API") + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") args = parser.parse_args() if args.download is not None: From ff5509ffdfa7b189379af995938509fdc6868dbc Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Wed, 6 Sep 2017 13:29:14 +0530 Subject: [PATCH 11/24] load returns model path --- gensim/api/__init__.py | 66 +++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 15a3359936..a1ad63ebb4 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -43,31 +43,31 @@ logger.addHandler(console) -def download(file_name): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) - data_folder_dir = os.path.join(base_dir, file_name) +def download(dataset): + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) + data_folder_dir = os.path.join(base_dir, dataset) data = catalogue(print_list=False) corpuses = data['gensim']['corpus'] models = data['gensim']['model'] - if file_name not in corpuses and file_name not in models: + if dataset not in corpuses and dataset not in models: logger.error( "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" " python -m gensim -c to get a list of models/corpuses" " available.") sys.exit(0) - compressed_folder_name = "{f}.tar.gz".format(f=file_name) + compressed_folder_name = "{f}.tar.gz".format(f=dataset) compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) is_installed = False is_downloaded = False - installed_message = "{f} installed".format(f=file_name) - downloaded_message = "{f} downloaded".format(f=file_name) + installed_message = "{f} installed".format(f=dataset) + downloaded_message = "{f} downloaded".format(f=dataset) if os.path.exists(data_folder_dir): log_file_dir = os.path.join(base_dir, 'api.log') with open(log_file_dir) as f: f = f.readlines() for line in f: if installed_message in line: - print("{} has already been installed".format(file_name)) + print("{} has already been installed".format(dataset)) is_installed = True sys.exit(0) if os.path.exists(data_folder_dir) and not is_installed: @@ -87,24 +87,24 @@ def download(file_name): "write permissions for %s or you can try creating it manually", data_folder_dir, base_dir) sys.exit(0) - logger.info("Downloading %s", file_name) + logger.info("Downloading %s", dataset) urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(file_name) + data_url = data_links(dataset) if data_url is not None: index = data_url.rfind("/") data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", file_name) + logger.info("%s downloaded", dataset) if not is_installed: tar = tarfile.open(compressed_folder_dir) logger.info("Extracting files from %s", data_folder_dir) tar.extractall(data_folder_dir) tar.close() - logger.info("%s installed", file_name) + logger.info("%s installed", dataset) def catalogue(print_list=False): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) @@ -121,14 +121,14 @@ def catalogue(print_list=False): return data -def info(file_name): - data = catalogue(False) +def info(dataset): + data = catalogue() corpuses = data['gensim']['corpus'] models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) + if dataset in corpuses: + print(data['gensim']['corpus'][dataset]["desc"]) + elif dataset in models: + print(data['gensim']['model'][dataset]["desc"]) else: catalogue(print_list=True) raise Exception( @@ -136,27 +136,39 @@ def info(file_name): "above.") -def load(file_name, return_path=False): - folder_dir = os.path.join(base_dir, file_name) +def get_filename(dataset): + data = catalogue() + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpuses: + return data['gensim']['corpus'][dataset]["filename"] + elif dataset in models: + return data['gensim']['model'][dataset]["filename"] + + +def load(dataset, return_path=False): + file_name = get_filename(dataset) + folder_dir = os.path.join(base_dir, dataset) + file_dir = os.path.join(folder_dir, file_name) if not os.path.exists(folder_dir): raise Exception( "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " "avalible models/corpus. If the model/corpus name you entered is" " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=file_name)) + "calling download('{f}') function".format(f=dataset)) elif return_path: - return folder_dir + return file_dir else: sys.path.insert(0, base_dir) - module = __import__(file_name) + module = __import__(dataset) data = module.load_data() return data -def data_links(file_name): +def data_links(dataset): url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if file_name in data['data_links']: - return data['data_links'][file_name]['link'] + if dataset in data['data_links']: + return data['data_links'][dataset]['link'] From e6540703ae9d4f3557fa3080f05ee4250e2bbc7a Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 19 Sep 2017 22:09:26 +0530 Subject: [PATCH 12/24] added jupyter notebook and merged code --- docs/notebooks/API_tutorial.ipynb | 402 ++++++++++++++++++++++++++++++ gensim/api/__init__.py | 174 ------------- gensim/api/downloader.py | 19 -- gensim/downloader.py | 298 ++++++++++++++++++++++ 4 files changed, 700 insertions(+), 193 deletions(-) create mode 100644 docs/notebooks/API_tutorial.ipynb delete mode 100644 gensim/api/__init__.py delete mode 100644 gensim/api/downloader.py create mode 100644 gensim/downloader.py diff --git a/docs/notebooks/API_tutorial.ipynb b/docs/notebooks/API_tutorial.ipynb new file mode 100644 index 0000000000..2f5b6d7bbd --- /dev/null +++ b/docs/notebooks/API_tutorial.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial for using Gensim's API for downloading corpuses/models\n", + "Let's start by importing the api module." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gensim.downloader as api" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, lets download the text8 corpus and load it." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:40:13,699 :gensim.api :INFO : Creating /home/chaitali/gensim-data/text8\n", + "2017-09-19 17:40:13,707 :gensim.api :INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", + "2017-09-19 17:40:13,713 :gensim.api :INFO : Downloading text8\n", + "2017-09-19 17:46:24,545 :gensim.api :INFO : text8 downloaded\n", + "2017-09-19 17:46:24,560 :gensim.api :INFO : Extracting files from /home/chaitali/gensim-data/text8\n", + "2017-09-19 17:46:26,676 :gensim.api :INFO : text8 installed\n" + ] + } + ], + "source": [ + "corpus = api.load('text8')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the corpus has been installed, let's create a word2vec model of our corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:46:39,030 :gensim.models.word2vec :INFO : collecting all words and their counts\n", + "2017-09-19 17:46:39,037 :gensim.models.word2vec :INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-09-19 17:46:46,104 :gensim.models.word2vec :INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", + "2017-09-19 17:46:46,105 :gensim.models.word2vec :INFO : Loading a fresh vocabulary\n", + "2017-09-19 17:46:46,393 :gensim.models.word2vec :INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", + "2017-09-19 17:46:46,394 :gensim.models.word2vec :INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", + "2017-09-19 17:46:46,607 :gensim.models.word2vec :INFO : deleting the raw counts dictionary of 253854 items\n", + "2017-09-19 17:46:46,618 :gensim.models.word2vec :INFO : sample=0.001 downsamples 38 most-common words\n", + "2017-09-19 17:46:46,620 :gensim.models.word2vec :INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", + "2017-09-19 17:46:46,621 :gensim.models.word2vec :INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", + "2017-09-19 17:46:46,946 :gensim.models.word2vec :INFO : resetting layer weights\n", + "2017-09-19 17:46:48,052 :gensim.models.word2vec :INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-09-19 17:46:49,058 :gensim.models.word2vec :INFO : PROGRESS: at 1.14% examples, 707464 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:50,070 :gensim.models.word2vec :INFO : PROGRESS: at 2.33% examples, 716418 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:51,081 :gensim.models.word2vec :INFO : PROGRESS: at 3.50% examples, 720064 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:52,084 :gensim.models.word2vec :INFO : PROGRESS: at 4.68% examples, 724069 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:53,087 :gensim.models.word2vec :INFO : PROGRESS: at 5.83% examples, 724165 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:54,090 :gensim.models.word2vec :INFO : PROGRESS: at 7.00% examples, 726206 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:46:55,094 :gensim.models.word2vec :INFO : PROGRESS: at 8.15% examples, 725286 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:56,096 :gensim.models.word2vec :INFO : PROGRESS: at 9.30% examples, 724925 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:46:57,114 :gensim.models.word2vec :INFO : PROGRESS: at 10.18% examples, 704674 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:58,130 :gensim.models.word2vec :INFO : PROGRESS: at 11.37% examples, 707549 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:59,136 :gensim.models.word2vec :INFO : PROGRESS: at 12.30% examples, 696048 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:00,140 :gensim.models.word2vec :INFO : PROGRESS: at 13.47% examples, 699081 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:01,150 :gensim.models.word2vec :INFO : PROGRESS: at 14.63% examples, 700492 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:02,151 :gensim.models.word2vec :INFO : PROGRESS: at 15.79% examples, 701182 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:03,156 :gensim.models.word2vec :INFO : PROGRESS: at 16.95% examples, 702555 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:04,161 :gensim.models.word2vec :INFO : PROGRESS: at 18.13% examples, 704501 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:05,162 :gensim.models.word2vec :INFO : PROGRESS: at 19.04% examples, 696025 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:06,176 :gensim.models.word2vec :INFO : PROGRESS: at 19.96% examples, 689054 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:07,183 :gensim.models.word2vec :INFO : PROGRESS: at 21.15% examples, 691295 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:08,185 :gensim.models.word2vec :INFO : PROGRESS: at 22.30% examples, 692243 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:09,192 :gensim.models.word2vec :INFO : PROGRESS: at 23.47% examples, 693696 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:10,211 :gensim.models.word2vec :INFO : PROGRESS: at 24.64% examples, 695112 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:11,219 :gensim.models.word2vec :INFO : PROGRESS: at 25.77% examples, 695601 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:12,237 :gensim.models.word2vec :INFO : PROGRESS: at 26.90% examples, 695914 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:13,241 :gensim.models.word2vec :INFO : PROGRESS: at 28.09% examples, 697735 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:14,260 :gensim.models.word2vec :INFO : PROGRESS: at 29.25% examples, 698509 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:15,260 :gensim.models.word2vec :INFO : PROGRESS: at 30.36% examples, 698320 words/s, in_qsize 5, out_qsize 2\n", + "2017-09-19 17:47:16,273 :gensim.models.word2vec :INFO : PROGRESS: at 31.15% examples, 690867 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:17,293 :gensim.models.word2vec :INFO : PROGRESS: at 31.72% examples, 679085 words/s, in_qsize 6, out_qsize 1\n", + "2017-09-19 17:47:18,311 :gensim.models.word2vec :INFO : PROGRESS: at 32.31% examples, 668474 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-19 17:47:19,318 :gensim.models.word2vec :INFO : PROGRESS: at 32.98% examples, 660311 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:20,320 :gensim.models.word2vec :INFO : PROGRESS: at 34.05% examples, 660626 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:21,328 :gensim.models.word2vec :INFO : PROGRESS: at 35.19% examples, 661837 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:22,350 :gensim.models.word2vec :INFO : PROGRESS: at 36.01% examples, 656869 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:23,358 :gensim.models.word2vec :INFO : PROGRESS: at 36.74% examples, 651028 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:24,365 :gensim.models.word2vec :INFO : PROGRESS: at 37.44% examples, 644886 words/s, in_qsize 3, out_qsize 1\n", + "2017-09-19 17:47:25,381 :gensim.models.word2vec :INFO : PROGRESS: at 38.11% examples, 638630 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:26,384 :gensim.models.word2vec :INFO : PROGRESS: at 38.73% examples, 631930 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-19 17:47:27,386 :gensim.models.word2vec :INFO : PROGRESS: at 39.56% examples, 629025 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:28,403 :gensim.models.word2vec :INFO : PROGRESS: at 40.40% examples, 626116 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:29,412 :gensim.models.word2vec :INFO : PROGRESS: at 41.48% examples, 626962 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:30,422 :gensim.models.word2vec :INFO : PROGRESS: at 42.68% examples, 629634 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:31,439 :gensim.models.word2vec :INFO : PROGRESS: at 43.62% examples, 628434 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:32,451 :gensim.models.word2vec :INFO : PROGRESS: at 44.30% examples, 623710 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:33,494 :gensim.models.word2vec :INFO : PROGRESS: at 45.24% examples, 622439 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:34,500 :gensim.models.word2vec :INFO : PROGRESS: at 46.14% examples, 621141 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:35,501 :gensim.models.word2vec :INFO : PROGRESS: at 47.16% examples, 621680 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:36,502 :gensim.models.word2vec :INFO : PROGRESS: at 47.96% examples, 619190 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:37,507 :gensim.models.word2vec :INFO : PROGRESS: at 49.01% examples, 619861 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:38,516 :gensim.models.word2vec :INFO : PROGRESS: at 49.65% examples, 615540 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:39,538 :gensim.models.word2vec :INFO : PROGRESS: at 50.32% examples, 611462 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:40,554 :gensim.models.word2vec :INFO : PROGRESS: at 51.06% examples, 608521 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:41,559 :gensim.models.word2vec :INFO : PROGRESS: at 52.02% examples, 608259 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:42,575 :gensim.models.word2vec :INFO : PROGRESS: at 52.82% examples, 606121 words/s, in_qsize 5, out_qsize 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:47:43,592 :gensim.models.word2vec :INFO : PROGRESS: at 53.52% examples, 602970 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:44,647 :gensim.models.word2vec :INFO : PROGRESS: at 54.27% examples, 600118 words/s, in_qsize 6, out_qsize 1\n", + "2017-09-19 17:47:45,650 :gensim.models.word2vec :INFO : PROGRESS: at 55.16% examples, 599088 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:46,652 :gensim.models.word2vec :INFO : PROGRESS: at 56.32% examples, 601102 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-19 17:47:47,668 :gensim.models.word2vec :INFO : PROGRESS: at 57.50% examples, 603159 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:48,671 :gensim.models.word2vec :INFO : PROGRESS: at 58.66% examples, 605176 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-19 17:47:49,677 :gensim.models.word2vec :INFO : PROGRESS: at 59.84% examples, 607140 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:50,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.02% examples, 609189 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:51,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.96% examples, 608709 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:52,688 :gensim.models.word2vec :INFO : PROGRESS: at 62.94% examples, 608671 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:53,697 :gensim.models.word2vec :INFO : PROGRESS: at 63.97% examples, 609169 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:54,707 :gensim.models.word2vec :INFO : PROGRESS: at 65.16% examples, 611179 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:55,708 :gensim.models.word2vec :INFO : PROGRESS: at 66.31% examples, 612923 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:56,709 :gensim.models.word2vec :INFO : PROGRESS: at 67.30% examples, 613086 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:57,718 :gensim.models.word2vec :INFO : PROGRESS: at 68.48% examples, 614770 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:58,725 :gensim.models.word2vec :INFO : PROGRESS: at 69.42% examples, 614374 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:59,730 :gensim.models.word2vec :INFO : PROGRESS: at 70.35% examples, 613877 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:00,753 :gensim.models.word2vec :INFO : PROGRESS: at 71.09% examples, 611666 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:48:01,767 :gensim.models.word2vec :INFO : PROGRESS: at 72.06% examples, 611557 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:02,774 :gensim.models.word2vec :INFO : PROGRESS: at 72.92% examples, 610505 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:03,778 :gensim.models.word2vec :INFO : PROGRESS: at 73.80% examples, 609723 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:04,784 :gensim.models.word2vec :INFO : PROGRESS: at 74.81% examples, 610019 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:05,787 :gensim.models.word2vec :INFO : PROGRESS: at 75.87% examples, 610421 words/s, in_qsize 3, out_qsize 2\n", + "2017-09-19 17:48:06,795 :gensim.models.word2vec :INFO : PROGRESS: at 76.87% examples, 610533 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:07,808 :gensim.models.word2vec :INFO : PROGRESS: at 78.06% examples, 612109 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:08,815 :gensim.models.word2vec :INFO : PROGRESS: at 79.07% examples, 612233 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:09,823 :gensim.models.word2vec :INFO : PROGRESS: at 80.27% examples, 613807 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:10,833 :gensim.models.word2vec :INFO : PROGRESS: at 81.26% examples, 613696 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:11,842 :gensim.models.word2vec :INFO : PROGRESS: at 82.45% examples, 615110 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:48:12,851 :gensim.models.word2vec :INFO : PROGRESS: at 83.61% examples, 616375 words/s, in_qsize 6, out_qsize 1\n", + "2017-09-19 17:48:13,853 :gensim.models.word2vec :INFO : PROGRESS: at 84.77% examples, 617712 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:14,879 :gensim.models.word2vec :INFO : PROGRESS: at 85.67% examples, 616893 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:15,890 :gensim.models.word2vec :INFO : PROGRESS: at 86.64% examples, 616837 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:16,906 :gensim.models.word2vec :INFO : PROGRESS: at 87.69% examples, 617193 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:17,915 :gensim.models.word2vec :INFO : PROGRESS: at 88.79% examples, 617970 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:18,917 :gensim.models.word2vec :INFO : PROGRESS: at 89.78% examples, 618000 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:48:19,918 :gensim.models.word2vec :INFO : PROGRESS: at 90.91% examples, 618999 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:20,922 :gensim.models.word2vec :INFO : PROGRESS: at 91.96% examples, 619363 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:21,936 :gensim.models.word2vec :INFO : PROGRESS: at 92.85% examples, 618640 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:22,943 :gensim.models.word2vec :INFO : PROGRESS: at 93.84% examples, 618597 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:23,958 :gensim.models.word2vec :INFO : PROGRESS: at 94.85% examples, 618687 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:24,974 :gensim.models.word2vec :INFO : PROGRESS: at 95.90% examples, 618756 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:25,978 :gensim.models.word2vec :INFO : PROGRESS: at 96.93% examples, 619000 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:26,980 :gensim.models.word2vec :INFO : PROGRESS: at 97.93% examples, 619074 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:27,982 :gensim.models.word2vec :INFO : PROGRESS: at 98.99% examples, 619431 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:28,823 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-09-19 17:48:28,830 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-09-19 17:48:28,836 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-09-19 17:48:28,837 :gensim.models.word2vec :INFO : training on 85026035 raw words (62526300 effective words) took 100.8s, 620401 effective words/s\n" + ] + } + ], + "source": [ + "from gensim.models.word2vec import Word2Vec\n", + "model = Word2Vec(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our word2vec model, let's find words that are similar to 'tree'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:48:45,837 :gensim.models.keyedvectors :INFO : precomputing L2-norms of word weight vectors\n" + ] + }, + { + "data": { + "text/plain": [ + "[('leaf', 0.7284336090087891),\n", + " ('trees', 0.7024068236351013),\n", + " ('bark', 0.6984879970550537),\n", + " ('fruit', 0.623538613319397),\n", + " ('flower', 0.6177238821983337),\n", + " ('nest', 0.6133654713630676),\n", + " ('garden', 0.5962027311325073),\n", + " ('avl', 0.5909914374351501),\n", + " ('cave', 0.5902420282363892),\n", + " ('pond', 0.5827507972717285)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar('tree')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the API to download many corpuses and models. You can get the list of all the models and corpuses that are provided, by using the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"gensim\": {\n", + " \"model\": {\n", + " \"Google_News_word2vec\": {\n", + " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", + " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", + " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", + " },\n", + " \"fasttext_eng_model\": {\n", + " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", + " \"filename\": \"wiki.en.vec\",\n", + " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", + " },\n", + " \"glove_common_crawl_42B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.42B.300d.zip\",\n", + " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", + " },\n", + " \"glove_common_crawl_840B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.840B.300d.zip\",\n", + " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", + " },\n", + " \"glove_wiki_gigaword_300d\": {\n", + " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.300d.txt\",\n", + " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", + " },\n", + " \"glove_wiki_gigaword_200d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.200d.txt\",\n", + " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", + " },\n", + " \"glove_wiki_gigaword_100d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.100d.txt\",\n", + " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", + " },\n", + " \"glove_wiki_gigaword_50d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.50d.txt\",\n", + " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", + " },\n", + " \"glove_twitter_200d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.200d.txt\",\n", + " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", + " },\n", + " \"glove_twitter_100d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.100d.txt\",\n", + " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", + " },\n", + " \"glove_twitter_50d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.50d.txt\",\n", + " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", + " },\n", + " \"glove_twitter_25d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.25d.txt\",\n", + " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", + " }\n", + " },\n", + " \"corpus\": {\n", + " \"text8\": {\n", + " \"desc\": \"Wikipedia English corpus\",\n", + " \"filename\": \"text8\",\n", + " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", + " }\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "dataset_list = api.info()\n", + "print(json.dumps(dataset_list, indent=4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to get detailed information about the model/corpus, use:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 21:56:42,071 :gensim.api :INFO : This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. \n", + "\n" + ] + } + ], + "source": [ + "api.info('glove_common_crawl_42B')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, you do not want to load the corpus/model to memory. You would just want to get the path to the corpus/model. For that, use :" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "text8_path = api.load('text8', return_path=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py deleted file mode 100644 index a1ad63ebb4..0000000000 --- a/gensim/api/__init__.py +++ /dev/null @@ -1,174 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -import json -import os -import tarfile -import logging -import sys -import errno -try: - import urllib.request as urllib -except ImportError: - import urllib - -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - - -user_dir = os.path.expanduser('~') -base_dir = os.path.join(user_dir, 'gensim-data') -log_file_dir = os.path.join(base_dir, 'api.log') -if not os.path.isdir(base_dir): - try: - os.makedirs(base_dir) - except OSError as e: - if e.errno == errno.EEXIST: - raise Exception( - "Not able to create folder gensim-data in {}. File gensim-data " - "exists in the direcory already.".format(user_dir)) - else: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the folder manually" - .format(base_dir)) - -logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - filename=log_file_dir, level=logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -logger = logging.getLogger('gensim.api') -logger.addHandler(console) - - -def download(dataset): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) - data_folder_dir = os.path.join(base_dir, dataset) - data = catalogue(print_list=False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset not in corpuses and dataset not in models: - logger.error( - "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" - " python -m gensim -c to get a list of models/corpuses" - " available.") - sys.exit(0) - compressed_folder_name = "{f}.tar.gz".format(f=dataset) - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - is_installed = False - is_downloaded = False - installed_message = "{f} installed".format(f=dataset) - downloaded_message = "{f} downloaded".format(f=dataset) - if os.path.exists(data_folder_dir): - log_file_dir = os.path.join(base_dir, 'api.log') - with open(log_file_dir) as f: - f = f.readlines() - for line in f: - if installed_message in line: - print("{} has already been installed".format(dataset)) - is_installed = True - sys.exit(0) - if os.path.exists(data_folder_dir) and not is_installed: - for line in f: - if downloaded_message in line: - is_downloaded = True - break - if not is_downloaded: - if not os.path.exists(data_folder_dir): - logger.info("Creating %s", data_folder_dir) - os.makedirs(data_folder_dir) - if os.path.exists(data_folder_dir): - logger.info("Creation of %s successful.", data_folder_dir) - else: - logger.error( - "Not able to create %s. Make sure you have the correct read/" - "write permissions for %s or you can try creating it manually", - data_folder_dir, base_dir) - sys.exit(0) - logger.info("Downloading %s", dataset) - urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(dataset) - if data_url is not None: - index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) - urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", dataset) - if not is_installed: - tar = tarfile.open(compressed_folder_dir) - logger.info("Extracting files from %s", data_folder_dir) - tar.extractall(data_folder_dir) - tar.close() - logger.info("%s installed", dataset) - - -def catalogue(print_list=False): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if print_list: - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - return data - - -def info(dataset): - data = catalogue() - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpuses: - print(data['gensim']['corpus'][dataset]["desc"]) - elif dataset in models: - print(data['gensim']['model'][dataset]["desc"]) - else: - catalogue(print_list=True) - raise Exception( - "Incorrect model/corpus name. Choose the model/corpus from the list " - "above.") - - -def get_filename(dataset): - data = catalogue() - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpuses: - return data['gensim']['corpus'][dataset]["filename"] - elif dataset in models: - return data['gensim']['model'][dataset]["filename"] - - -def load(dataset, return_path=False): - file_name = get_filename(dataset) - folder_dir = os.path.join(base_dir, dataset) - file_dir = os.path.join(folder_dir, file_name) - if not os.path.exists(folder_dir): - raise Exception( - "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " - "avalible models/corpus. If the model/corpus name you entered is" - " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=dataset)) - elif return_path: - return file_dir - else: - sys.path.insert(0, base_dir) - module = __import__(dataset) - data = module.load_data() - return data - - -def data_links(dataset): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if dataset in data['data_links']: - return data['data_links'][dataset]['link'] diff --git a/gensim/api/downloader.py b/gensim/api/downloader.py deleted file mode 100644 index 0accff9f8c..0000000000 --- a/gensim/api/downloader.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -import argparse -from gensim.api import download -from gensim.api import catalogue -from gensim.api import info -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") - group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") - group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") - args = parser.parse_args() - if args.download is not None: - download(args.download[0]) - elif args.info is not None: - info(args.info[0]) - elif args.catalogue is not None: - catalogue(print_list=True) diff --git a/gensim/downloader.py b/gensim/downloader.py new file mode 100644 index 0000000000..8edd3062b6 --- /dev/null +++ b/gensim/downloader.py @@ -0,0 +1,298 @@ +from __future__ import absolute_import +import argparse +import json +import os +import tarfile +import logging +import sys +import errno +import hashlib +try: + import urllib.request as urllib +except ImportError: + import urllib + +try: + from urllib.request import urlopen +except ImportError: + from urllib2 import urlopen + +user_dir = os.path.expanduser('~') +base_dir = os.path.join(user_dir, 'gensim-data') +data_log_file_dir = os.path.join(base_dir, 'data.json') + +logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger('gensim.api') + +if not os.path.isdir(base_dir): + try: + logger.info("Creating %s", base_dir) + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) + + +def initialize_data_log_file(): + """Function for initializing the log file. Creates a json object + for each corpus/model and stores in the log file. For eg: {"name": "text8", "status" : "None"} + """ + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + json_list = [] + for corpus in corpora: + json_object = {"name": corpus, "status": "None"} + json_list.append(json_object) + for model in models: + json_object = {"name": model, "status": "None"} + json_list.append(json_object) + json.dump(json_list, data_log_file) + data_log_file.close() + + +def update_data_log_file(dataset, status): + """Function for updating the status of the dataset json object. + + Args: + dataset(string): Name of the corpus/model. + status(string): Status to be updates to i.e downloaded or installed. + """ + jdata = json.loads(open(data_log_file_dir).read()) + for json_object in jdata: + if json_object["name"] == dataset: + json_object["status"] = status + with open(data_log_file_dir, 'w') as f: + f.write(json.dumps(jdata)) + + +def get_data_status(dataset): + """Function for finding the status of the dataset. + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: returns the current status of the corpus/model i.e None, downloaded or installed. + """ + jdata = json.loads(open(data_log_file_dir).read()) + for json_object in jdata: + if json_object["name"] == dataset: + return json_object["status"] + + +def calculate_md5_checksum(folder_dir): + """Function for calculating checksum of a downloaded model/corpus. + + Args: + folder_dir(string): Path to the downloaded model. + + Returns: + string: It returns the value for the checksum for folder_dir directory + """ + hash_md5 = hashlib.md5() + for filename in os.listdir(folder_dir): + file_dir = os.path.join(folder_dir, filename) + with open(file_dir, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def info(dataset=None): + """Function for retrieving the list of corpora/models, if dataset is not provided. If dataset + is provided, then it gives detailed information about the dataset. + + Args: + dataset(string): Name of the corpus/model. + + Returns: + : It returns the models/corpora names with detailed information about each. + """ + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if dataset is not None: + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpora: + logger.info("%s \n", data['gensim']['corpus'][dataset]["desc"]) + elif dataset in models: + logger.info("%s \n", data['gensim']['model'][dataset]["desc"]) + else: + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "\n {}".format(json.dumps(data, indent=4))) + else: + return data + + +def get_checksum(dataset): + """Function for retrieving the checksum of a corpus/model + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: It returns the checksum for corresponding the corpus/model. + """ + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpora: + return data['gensim']['corpus'][dataset]["checksum"] + elif dataset in models: + return data['gensim']['model'][dataset]["checksum"] + + +if not os.path.isfile(data_log_file_dir): + try: + logger.warning("Creating %s", data_log_file_dir) + data_log_file = open(data_log_file_dir, 'a') + initialize_data_log_file() + except: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the file manually" + .format(data_log_file_dir)) + + +def _download(dataset): + """Function for downloading and installed dataset depending upon it's current status. + + Args: + dataset(string): Name of the corpus/model. + """ + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) + data_folder_dir = os.path.join(base_dir, dataset) + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset not in corpora and dataset not in models: + raise Exception( + "Incorect Model/corpus name. Use info() or" + " python -m gensim.downloader -c to get a list of models/corpora" + " available.") + compressed_folder_name = "{f}.tar.gz".format(f=dataset) + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + if get_data_status(dataset) != "downloaded": + if not os.path.exists(data_folder_dir): + logger.info("Creating %s", data_folder_dir) + os.makedirs(data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of %s successful.", data_folder_dir) + else: + raise Exception( + "Not able to create {a}. Make sure you have the correct read/" + "write permissions for {b} or you can try creating it manually". + format(a=data_folder_dir, b=base_dir)) + logger.info("Downloading %s", dataset) + urllib.urlretrieve(url, compressed_folder_dir) + data_url = data_links(dataset) + if data_url is not None: + index = data_url.rfind("/") + data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) + urllib.urlretrieve(data_url, data_dir) + logger.info("%s downloaded", dataset) + update_data_log_file(dataset, status="downloaded") + if get_data_status(dataset) != "installed": + tar = tarfile.open(compressed_folder_dir) + logger.info("Extracting files from %s", data_folder_dir) + tar.extractall(data_folder_dir) + tar.close() + if calculate_md5_checksum(data_folder_dir) == get_checksum(dataset): + update_data_log_file(dataset, status="installed") + logger.info("%s installed", dataset) + else: + logger.error("There was a problem in installing the file. Retrying.") + _download(dataset) + + +def get_filename(dataset): + """Function of retrieving the filename of corpus/model. + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: Returns the filename of the model/corpus. + """ + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpora: + return data['gensim']['corpus'][dataset]["filename"] + elif dataset in models: + return data['gensim']['model'][dataset]["filename"] + + +def load(dataset, return_path=False): + """Loads the corpus/model to the memory, if return_path is False. + + Args: + dataset(string): Name of the corpus/model. + return_path(bool): Determines whether to return model/corpus file path. + + Returns: + string: Returns the path to the model/corpus, if return_path is True. + """ + file_name = get_filename(dataset) + if file_name is None: + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "\n {}".format(json.dumps(info(), indent=4))) + folder_dir = os.path.join(base_dir, dataset) + file_dir = os.path.join(folder_dir, file_name) + if not os.path.exists(folder_dir) or get_data_status(dataset) != "installed": + _download(dataset) + if return_path: + return file_dir + else: + sys.path.insert(0, base_dir) + module = __import__(dataset) + data = module.load_data() + return data + + +def data_links(dataset): + """Function for retrieving the links of the models/corpus which are not stored in github releases + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: Returns the link of the model/corpus. + """ + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if dataset in data['data_links']: + return data['data_links'][dataset]['link'] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") + group = parser.add_mutually_exclusive_group() + group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") + args = parser.parse_args() + if args.download is not None: + data_path = load(args.download[0], return_path=True) + logger.info("Data has been installed and data path is %s", data_path) + elif args.info is not None: + info(dataset=args.info[0]) + elif args.catalogue is not None: + data = info() + logger.info("%s\n", json.dumps(data, indent=4)) From b0d1110fb4f51b06821285f57a168212a1b7b70b Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 3 Oct 2017 18:27:17 +0530 Subject: [PATCH 13/24] alternate names for load --- docs/notebooks/API_tutorial.ipynb | 424 +++++++++++++++--------------- gensim/downloader.py | 272 +++++++++++-------- 2 files changed, 368 insertions(+), 328 deletions(-) diff --git a/docs/notebooks/API_tutorial.ipynb b/docs/notebooks/API_tutorial.ipynb index 2f5b6d7bbd..5b9d28c59b 100644 --- a/docs/notebooks/API_tutorial.ipynb +++ b/docs/notebooks/API_tutorial.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -28,23 +28,25 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:40:13,699 :gensim.api :INFO : Creating /home/chaitali/gensim-data/text8\n", - "2017-09-19 17:40:13,707 :gensim.api :INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", - "2017-09-19 17:40:13,713 :gensim.api :INFO : Downloading text8\n", - "2017-09-19 17:46:24,545 :gensim.api :INFO : text8 downloaded\n", - "2017-09-19 17:46:24,560 :gensim.api :INFO : Extracting files from /home/chaitali/gensim-data/text8\n", - "2017-09-19 17:46:26,676 :gensim.api :INFO : text8 installed\n" + "2017-09-30 20:00:53,429 : INFO : Creating /home/chaitali/gensim-data/text8\n", + "2017-09-30 20:00:53,431 : INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", + "2017-09-30 20:00:53,433 : INFO : Downloading text8\n", + "2017-09-30 20:04:46,938 : INFO : text8 downloaded\n", + "2017-09-30 20:04:46,951 : INFO : Extracting files from /home/chaitali/gensim-data/text8\n", + "2017-09-30 20:04:48,888 : INFO : text8 installed\n" ] } ], "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", "corpus = api.load('text8')" ] }, @@ -57,134 +59,124 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:46:39,030 :gensim.models.word2vec :INFO : collecting all words and their counts\n", - "2017-09-19 17:46:39,037 :gensim.models.word2vec :INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", - "2017-09-19 17:46:46,104 :gensim.models.word2vec :INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", - "2017-09-19 17:46:46,105 :gensim.models.word2vec :INFO : Loading a fresh vocabulary\n", - "2017-09-19 17:46:46,393 :gensim.models.word2vec :INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", - "2017-09-19 17:46:46,394 :gensim.models.word2vec :INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", - "2017-09-19 17:46:46,607 :gensim.models.word2vec :INFO : deleting the raw counts dictionary of 253854 items\n", - "2017-09-19 17:46:46,618 :gensim.models.word2vec :INFO : sample=0.001 downsamples 38 most-common words\n", - "2017-09-19 17:46:46,620 :gensim.models.word2vec :INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", - "2017-09-19 17:46:46,621 :gensim.models.word2vec :INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", - "2017-09-19 17:46:46,946 :gensim.models.word2vec :INFO : resetting layer weights\n", - "2017-09-19 17:46:48,052 :gensim.models.word2vec :INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", - "2017-09-19 17:46:49,058 :gensim.models.word2vec :INFO : PROGRESS: at 1.14% examples, 707464 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:50,070 :gensim.models.word2vec :INFO : PROGRESS: at 2.33% examples, 716418 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:51,081 :gensim.models.word2vec :INFO : PROGRESS: at 3.50% examples, 720064 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:52,084 :gensim.models.word2vec :INFO : PROGRESS: at 4.68% examples, 724069 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:53,087 :gensim.models.word2vec :INFO : PROGRESS: at 5.83% examples, 724165 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:54,090 :gensim.models.word2vec :INFO : PROGRESS: at 7.00% examples, 726206 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:46:55,094 :gensim.models.word2vec :INFO : PROGRESS: at 8.15% examples, 725286 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:56,096 :gensim.models.word2vec :INFO : PROGRESS: at 9.30% examples, 724925 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:46:57,114 :gensim.models.word2vec :INFO : PROGRESS: at 10.18% examples, 704674 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:58,130 :gensim.models.word2vec :INFO : PROGRESS: at 11.37% examples, 707549 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:59,136 :gensim.models.word2vec :INFO : PROGRESS: at 12.30% examples, 696048 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:00,140 :gensim.models.word2vec :INFO : PROGRESS: at 13.47% examples, 699081 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:01,150 :gensim.models.word2vec :INFO : PROGRESS: at 14.63% examples, 700492 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:02,151 :gensim.models.word2vec :INFO : PROGRESS: at 15.79% examples, 701182 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:03,156 :gensim.models.word2vec :INFO : PROGRESS: at 16.95% examples, 702555 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:04,161 :gensim.models.word2vec :INFO : PROGRESS: at 18.13% examples, 704501 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:05,162 :gensim.models.word2vec :INFO : PROGRESS: at 19.04% examples, 696025 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:06,176 :gensim.models.word2vec :INFO : PROGRESS: at 19.96% examples, 689054 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:07,183 :gensim.models.word2vec :INFO : PROGRESS: at 21.15% examples, 691295 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:08,185 :gensim.models.word2vec :INFO : PROGRESS: at 22.30% examples, 692243 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:09,192 :gensim.models.word2vec :INFO : PROGRESS: at 23.47% examples, 693696 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:10,211 :gensim.models.word2vec :INFO : PROGRESS: at 24.64% examples, 695112 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:11,219 :gensim.models.word2vec :INFO : PROGRESS: at 25.77% examples, 695601 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:12,237 :gensim.models.word2vec :INFO : PROGRESS: at 26.90% examples, 695914 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:13,241 :gensim.models.word2vec :INFO : PROGRESS: at 28.09% examples, 697735 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:14,260 :gensim.models.word2vec :INFO : PROGRESS: at 29.25% examples, 698509 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:15,260 :gensim.models.word2vec :INFO : PROGRESS: at 30.36% examples, 698320 words/s, in_qsize 5, out_qsize 2\n", - "2017-09-19 17:47:16,273 :gensim.models.word2vec :INFO : PROGRESS: at 31.15% examples, 690867 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:17,293 :gensim.models.word2vec :INFO : PROGRESS: at 31.72% examples, 679085 words/s, in_qsize 6, out_qsize 1\n", - "2017-09-19 17:47:18,311 :gensim.models.word2vec :INFO : PROGRESS: at 32.31% examples, 668474 words/s, in_qsize 5, out_qsize 1\n", - "2017-09-19 17:47:19,318 :gensim.models.word2vec :INFO : PROGRESS: at 32.98% examples, 660311 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:20,320 :gensim.models.word2vec :INFO : PROGRESS: at 34.05% examples, 660626 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:21,328 :gensim.models.word2vec :INFO : PROGRESS: at 35.19% examples, 661837 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:22,350 :gensim.models.word2vec :INFO : PROGRESS: at 36.01% examples, 656869 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:23,358 :gensim.models.word2vec :INFO : PROGRESS: at 36.74% examples, 651028 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:24,365 :gensim.models.word2vec :INFO : PROGRESS: at 37.44% examples, 644886 words/s, in_qsize 3, out_qsize 1\n", - "2017-09-19 17:47:25,381 :gensim.models.word2vec :INFO : PROGRESS: at 38.11% examples, 638630 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:26,384 :gensim.models.word2vec :INFO : PROGRESS: at 38.73% examples, 631930 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-19 17:47:27,386 :gensim.models.word2vec :INFO : PROGRESS: at 39.56% examples, 629025 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:28,403 :gensim.models.word2vec :INFO : PROGRESS: at 40.40% examples, 626116 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:29,412 :gensim.models.word2vec :INFO : PROGRESS: at 41.48% examples, 626962 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:30,422 :gensim.models.word2vec :INFO : PROGRESS: at 42.68% examples, 629634 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:31,439 :gensim.models.word2vec :INFO : PROGRESS: at 43.62% examples, 628434 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:32,451 :gensim.models.word2vec :INFO : PROGRESS: at 44.30% examples, 623710 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:33,494 :gensim.models.word2vec :INFO : PROGRESS: at 45.24% examples, 622439 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:34,500 :gensim.models.word2vec :INFO : PROGRESS: at 46.14% examples, 621141 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:35,501 :gensim.models.word2vec :INFO : PROGRESS: at 47.16% examples, 621680 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:36,502 :gensim.models.word2vec :INFO : PROGRESS: at 47.96% examples, 619190 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:37,507 :gensim.models.word2vec :INFO : PROGRESS: at 49.01% examples, 619861 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:38,516 :gensim.models.word2vec :INFO : PROGRESS: at 49.65% examples, 615540 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:39,538 :gensim.models.word2vec :INFO : PROGRESS: at 50.32% examples, 611462 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:40,554 :gensim.models.word2vec :INFO : PROGRESS: at 51.06% examples, 608521 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:41,559 :gensim.models.word2vec :INFO : PROGRESS: at 52.02% examples, 608259 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:42,575 :gensim.models.word2vec :INFO : PROGRESS: at 52.82% examples, 606121 words/s, in_qsize 5, out_qsize 1\n" + "2017-09-30 20:04:59,672 : INFO : collecting all words and their counts\n", + "2017-09-30 20:04:59,677 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-09-30 20:05:06,425 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", + "2017-09-30 20:05:06,426 : INFO : Loading a fresh vocabulary\n", + "2017-09-30 20:05:06,711 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", + "2017-09-30 20:05:06,711 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", + "2017-09-30 20:05:06,925 : INFO : deleting the raw counts dictionary of 253854 items\n", + "2017-09-30 20:05:06,935 : INFO : sample=0.001 downsamples 38 most-common words\n", + "2017-09-30 20:05:06,936 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", + "2017-09-30 20:05:06,938 : INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", + "2017-09-30 20:05:07,267 : INFO : resetting layer weights\n", + "2017-09-30 20:05:08,240 : INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-09-30 20:05:09,250 : INFO : PROGRESS: at 1.09% examples, 677048 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:10,260 : INFO : PROGRESS: at 2.22% examples, 682662 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:11,266 : INFO : PROGRESS: at 3.35% examples, 688976 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:12,281 : INFO : PROGRESS: at 4.47% examples, 688776 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:13,283 : INFO : PROGRESS: at 5.58% examples, 692300 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:14,291 : INFO : PROGRESS: at 6.71% examples, 695154 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:15,298 : INFO : PROGRESS: at 7.83% examples, 695477 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:16,302 : INFO : PROGRESS: at 8.94% examples, 694913 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:17,320 : INFO : PROGRESS: at 10.04% examples, 693561 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:18,334 : INFO : PROGRESS: at 11.17% examples, 694297 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:19,349 : INFO : PROGRESS: at 12.28% examples, 693175 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:20,357 : INFO : PROGRESS: at 13.39% examples, 693211 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:21,369 : INFO : PROGRESS: at 14.52% examples, 693794 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-30 20:05:22,377 : INFO : PROGRESS: at 15.57% examples, 689525 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:23,377 : INFO : PROGRESS: at 16.68% examples, 689973 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:24,391 : INFO : PROGRESS: at 17.72% examples, 686717 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:25,398 : INFO : PROGRESS: at 18.86% examples, 687798 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:26,409 : INFO : PROGRESS: at 20.00% examples, 688486 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:27,411 : INFO : PROGRESS: at 21.15% examples, 689967 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:28,424 : INFO : PROGRESS: at 22.27% examples, 689469 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:29,432 : INFO : PROGRESS: at 23.41% examples, 690341 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:30,440 : INFO : PROGRESS: at 24.53% examples, 690470 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:31,454 : INFO : PROGRESS: at 25.64% examples, 690757 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:32,460 : INFO : PROGRESS: at 26.76% examples, 691272 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:33,468 : INFO : PROGRESS: at 27.89% examples, 691773 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:34,481 : INFO : PROGRESS: at 29.02% examples, 692058 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:35,491 : INFO : PROGRESS: at 30.14% examples, 692208 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-30 20:05:36,501 : INFO : PROGRESS: at 31.23% examples, 691812 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:37,507 : INFO : PROGRESS: at 32.35% examples, 691913 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:38,527 : INFO : PROGRESS: at 33.47% examples, 691941 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:39,534 : INFO : PROGRESS: at 34.59% examples, 692104 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:40,535 : INFO : PROGRESS: at 35.73% examples, 692232 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:41,542 : INFO : PROGRESS: at 36.83% examples, 691803 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:42,549 : INFO : PROGRESS: at 37.95% examples, 692107 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:43,561 : INFO : PROGRESS: at 39.08% examples, 692095 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:44,572 : INFO : PROGRESS: at 40.21% examples, 692168 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:45,574 : INFO : PROGRESS: at 41.35% examples, 692460 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:46,586 : INFO : PROGRESS: at 42.46% examples, 692126 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:47,597 : INFO : PROGRESS: at 43.61% examples, 692641 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:48,599 : INFO : PROGRESS: at 44.71% examples, 692672 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:49,609 : INFO : PROGRESS: at 45.81% examples, 692414 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:50,613 : INFO : PROGRESS: at 46.89% examples, 692196 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:51,614 : INFO : PROGRESS: at 47.98% examples, 692082 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:52,615 : INFO : PROGRESS: at 49.08% examples, 691919 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:53,622 : INFO : PROGRESS: at 50.18% examples, 691886 words/s, in_qsize 5, out_qsize 2\n", + "2017-09-30 20:05:54,631 : INFO : PROGRESS: at 51.30% examples, 691985 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:55,631 : INFO : PROGRESS: at 52.39% examples, 691836 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:56,644 : INFO : PROGRESS: at 53.50% examples, 691643 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:57,650 : INFO : PROGRESS: at 54.61% examples, 691789 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:58,651 : INFO : PROGRESS: at 55.72% examples, 691433 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:59,651 : INFO : PROGRESS: at 56.81% examples, 691258 words/s, in_qsize 2, out_qsize 1\n", + "2017-09-30 20:06:00,662 : INFO : PROGRESS: at 57.93% examples, 691247 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:01,666 : INFO : PROGRESS: at 59.05% examples, 691236 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:02,674 : INFO : PROGRESS: at 60.16% examples, 691214 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:03,692 : INFO : PROGRESS: at 61.31% examples, 691243 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:04,693 : INFO : PROGRESS: at 62.41% examples, 691156 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:05,706 : INFO : PROGRESS: at 63.54% examples, 691252 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:06,726 : INFO : PROGRESS: at 64.66% examples, 691169 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:07,742 : INFO : PROGRESS: at 65.77% examples, 691212 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:08,747 : INFO : PROGRESS: at 66.88% examples, 691305 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:09,752 : INFO : PROGRESS: at 67.98% examples, 691292 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:10,761 : INFO : PROGRESS: at 69.09% examples, 691231 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:11,767 : INFO : PROGRESS: at 70.21% examples, 691345 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:12,776 : INFO : PROGRESS: at 71.30% examples, 691192 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:13,779 : INFO : PROGRESS: at 72.39% examples, 691076 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:14,783 : INFO : PROGRESS: at 73.50% examples, 691043 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:15,799 : INFO : PROGRESS: at 74.60% examples, 690947 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:16,804 : INFO : PROGRESS: at 75.72% examples, 690751 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:17,823 : INFO : PROGRESS: at 76.83% examples, 690552 words/s, in_qsize 5, out_qsize 0\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:47:43,592 :gensim.models.word2vec :INFO : PROGRESS: at 53.52% examples, 602970 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:44,647 :gensim.models.word2vec :INFO : PROGRESS: at 54.27% examples, 600118 words/s, in_qsize 6, out_qsize 1\n", - "2017-09-19 17:47:45,650 :gensim.models.word2vec :INFO : PROGRESS: at 55.16% examples, 599088 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:46,652 :gensim.models.word2vec :INFO : PROGRESS: at 56.32% examples, 601102 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-19 17:47:47,668 :gensim.models.word2vec :INFO : PROGRESS: at 57.50% examples, 603159 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:48,671 :gensim.models.word2vec :INFO : PROGRESS: at 58.66% examples, 605176 words/s, in_qsize 5, out_qsize 1\n", - "2017-09-19 17:47:49,677 :gensim.models.word2vec :INFO : PROGRESS: at 59.84% examples, 607140 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:50,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.02% examples, 609189 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:51,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.96% examples, 608709 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:52,688 :gensim.models.word2vec :INFO : PROGRESS: at 62.94% examples, 608671 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:53,697 :gensim.models.word2vec :INFO : PROGRESS: at 63.97% examples, 609169 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:54,707 :gensim.models.word2vec :INFO : PROGRESS: at 65.16% examples, 611179 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:55,708 :gensim.models.word2vec :INFO : PROGRESS: at 66.31% examples, 612923 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:56,709 :gensim.models.word2vec :INFO : PROGRESS: at 67.30% examples, 613086 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:57,718 :gensim.models.word2vec :INFO : PROGRESS: at 68.48% examples, 614770 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:58,725 :gensim.models.word2vec :INFO : PROGRESS: at 69.42% examples, 614374 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:59,730 :gensim.models.word2vec :INFO : PROGRESS: at 70.35% examples, 613877 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:00,753 :gensim.models.word2vec :INFO : PROGRESS: at 71.09% examples, 611666 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:48:01,767 :gensim.models.word2vec :INFO : PROGRESS: at 72.06% examples, 611557 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:02,774 :gensim.models.word2vec :INFO : PROGRESS: at 72.92% examples, 610505 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:03,778 :gensim.models.word2vec :INFO : PROGRESS: at 73.80% examples, 609723 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:04,784 :gensim.models.word2vec :INFO : PROGRESS: at 74.81% examples, 610019 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:05,787 :gensim.models.word2vec :INFO : PROGRESS: at 75.87% examples, 610421 words/s, in_qsize 3, out_qsize 2\n", - "2017-09-19 17:48:06,795 :gensim.models.word2vec :INFO : PROGRESS: at 76.87% examples, 610533 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:07,808 :gensim.models.word2vec :INFO : PROGRESS: at 78.06% examples, 612109 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:08,815 :gensim.models.word2vec :INFO : PROGRESS: at 79.07% examples, 612233 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:09,823 :gensim.models.word2vec :INFO : PROGRESS: at 80.27% examples, 613807 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:10,833 :gensim.models.word2vec :INFO : PROGRESS: at 81.26% examples, 613696 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:11,842 :gensim.models.word2vec :INFO : PROGRESS: at 82.45% examples, 615110 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:48:12,851 :gensim.models.word2vec :INFO : PROGRESS: at 83.61% examples, 616375 words/s, in_qsize 6, out_qsize 1\n", - "2017-09-19 17:48:13,853 :gensim.models.word2vec :INFO : PROGRESS: at 84.77% examples, 617712 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:14,879 :gensim.models.word2vec :INFO : PROGRESS: at 85.67% examples, 616893 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:15,890 :gensim.models.word2vec :INFO : PROGRESS: at 86.64% examples, 616837 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:16,906 :gensim.models.word2vec :INFO : PROGRESS: at 87.69% examples, 617193 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:17,915 :gensim.models.word2vec :INFO : PROGRESS: at 88.79% examples, 617970 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:18,917 :gensim.models.word2vec :INFO : PROGRESS: at 89.78% examples, 618000 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:48:19,918 :gensim.models.word2vec :INFO : PROGRESS: at 90.91% examples, 618999 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:20,922 :gensim.models.word2vec :INFO : PROGRESS: at 91.96% examples, 619363 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:21,936 :gensim.models.word2vec :INFO : PROGRESS: at 92.85% examples, 618640 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:22,943 :gensim.models.word2vec :INFO : PROGRESS: at 93.84% examples, 618597 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:23,958 :gensim.models.word2vec :INFO : PROGRESS: at 94.85% examples, 618687 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:24,974 :gensim.models.word2vec :INFO : PROGRESS: at 95.90% examples, 618756 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:25,978 :gensim.models.word2vec :INFO : PROGRESS: at 96.93% examples, 619000 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:26,980 :gensim.models.word2vec :INFO : PROGRESS: at 97.93% examples, 619074 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:27,982 :gensim.models.word2vec :INFO : PROGRESS: at 98.99% examples, 619431 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:28,823 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2017-09-19 17:48:28,830 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2017-09-19 17:48:28,836 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2017-09-19 17:48:28,837 :gensim.models.word2vec :INFO : training on 85026035 raw words (62526300 effective words) took 100.8s, 620401 effective words/s\n" + "2017-09-30 20:06:18,828 : INFO : PROGRESS: at 77.95% examples, 690734 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:19,838 : INFO : PROGRESS: at 79.07% examples, 690668 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:20,845 : INFO : PROGRESS: at 80.20% examples, 690773 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:21,848 : INFO : PROGRESS: at 81.33% examples, 690841 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:22,853 : INFO : PROGRESS: at 82.43% examples, 690748 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:23,861 : INFO : PROGRESS: at 83.55% examples, 690766 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:24,868 : INFO : PROGRESS: at 84.67% examples, 690835 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:25,870 : INFO : PROGRESS: at 85.77% examples, 690917 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:26,876 : INFO : PROGRESS: at 86.88% examples, 690986 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:27,879 : INFO : PROGRESS: at 88.00% examples, 691091 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:28,896 : INFO : PROGRESS: at 89.12% examples, 691157 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-30 20:06:29,900 : INFO : PROGRESS: at 90.22% examples, 691067 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:30,901 : INFO : PROGRESS: at 91.31% examples, 691025 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:31,903 : INFO : PROGRESS: at 92.42% examples, 691030 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:32,915 : INFO : PROGRESS: at 93.52% examples, 690949 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:33,927 : INFO : PROGRESS: at 94.64% examples, 690994 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-30 20:06:34,932 : INFO : PROGRESS: at 95.76% examples, 690838 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-30 20:06:35,949 : INFO : PROGRESS: at 96.86% examples, 690691 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:36,955 : INFO : PROGRESS: at 97.99% examples, 690835 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:37,971 : INFO : PROGRESS: at 99.13% examples, 690908 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:38,752 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-09-30 20:06:38,756 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-09-30 20:06:38,759 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-09-30 20:06:38,760 : INFO : training on 85026035 raw words (62532240 effective words) took 90.5s, 690828 effective words/s\n" ] } ], @@ -202,32 +194,32 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:48:45,837 :gensim.models.keyedvectors :INFO : precomputing L2-norms of word weight vectors\n" + "2017-09-30 20:08:12,509 : INFO : precomputing L2-norms of word weight vectors\n" ] }, { "data": { "text/plain": [ - "[('leaf', 0.7284336090087891),\n", - " ('trees', 0.7024068236351013),\n", - " ('bark', 0.6984879970550537),\n", - " ('fruit', 0.623538613319397),\n", - " ('flower', 0.6177238821983337),\n", - " ('nest', 0.6133654713630676),\n", - " ('garden', 0.5962027311325073),\n", - " ('avl', 0.5909914374351501),\n", - " ('cave', 0.5902420282363892),\n", - " ('pond', 0.5827507972717285)]" + "[('trees', 0.7073001861572266),\n", + " ('bark', 0.7032904028892517),\n", + " ('leaf', 0.6881209015846252),\n", + " ('bird', 0.6044381260871887),\n", + " ('flower', 0.6009336709976196),\n", + " ('fruit', 0.597153902053833),\n", + " ('avl', 0.5837888717651367),\n", + " ('cactus', 0.5712562799453735),\n", + " ('bee', 0.5658263564109802),\n", + " ('garden', 0.565678596496582)]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -245,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -253,75 +245,73 @@ "output_type": "stream", "text": [ "{\n", - " \"gensim\": {\n", - " \"model\": {\n", - " \"Google_News_word2vec\": {\n", - " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", - " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", - " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", - " },\n", - " \"fasttext_eng_model\": {\n", - " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", - " \"filename\": \"wiki.en.vec\",\n", - " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", - " },\n", - " \"glove_common_crawl_42B\": {\n", - " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.42B.300d.zip\",\n", - " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", - " },\n", - " \"glove_common_crawl_840B\": {\n", - " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.840B.300d.zip\",\n", - " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", - " },\n", - " \"glove_wiki_gigaword_300d\": {\n", - " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.300d.txt\",\n", - " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", - " },\n", - " \"glove_wiki_gigaword_200d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.200d.txt\",\n", - " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", - " },\n", - " \"glove_wiki_gigaword_100d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.100d.txt\",\n", - " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", - " },\n", - " \"glove_wiki_gigaword_50d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.50d.txt\",\n", - " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", - " },\n", - " \"glove_twitter_200d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.200d.txt\",\n", - " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", - " },\n", - " \"glove_twitter_100d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.100d.txt\",\n", - " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", - " },\n", - " \"glove_twitter_50d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.50d.txt\",\n", - " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", - " },\n", - " \"glove_twitter_25d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.25d.txt\",\n", - " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", - " }\n", + " \"model\": {\n", + " \"Google_News_word2vec\": {\n", + " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", + " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", + " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", + " },\n", + " \"fasttext_eng_model\": {\n", + " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", + " \"filename\": \"wiki.en.vec\",\n", + " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", + " },\n", + " \"glove_common_crawl_42B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.42B.300d.zip\",\n", + " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", " },\n", - " \"corpus\": {\n", - " \"text8\": {\n", - " \"desc\": \"Wikipedia English corpus\",\n", - " \"filename\": \"text8\",\n", - " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", - " }\n", + " \"glove_common_crawl_840B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.840B.300d.zip\",\n", + " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", + " },\n", + " \"glove_wiki_gigaword_300d\": {\n", + " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.300d.txt\",\n", + " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", + " },\n", + " \"glove_wiki_gigaword_200d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.200d.txt\",\n", + " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", + " },\n", + " \"glove_wiki_gigaword_100d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.100d.txt\",\n", + " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", + " },\n", + " \"glove_wiki_gigaword_50d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.50d.txt\",\n", + " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", + " },\n", + " \"glove_twitter_200d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.200d.txt\",\n", + " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", + " },\n", + " \"glove_twitter_100d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.100d.txt\",\n", + " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", + " },\n", + " \"glove_twitter_50d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.50d.txt\",\n", + " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", + " },\n", + " \"glove_twitter_25d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.25d.txt\",\n", + " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", + " }\n", + " },\n", + " \"corpus\": {\n", + " \"text8\": {\n", + " \"desc\": \"Wikipedia English corpus\",\n", + " \"filename\": \"text8\",\n", + " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", " }\n", " }\n", "}\n" @@ -330,8 +320,8 @@ ], "source": [ "import json\n", - "dataset_list = api.info()\n", - "print(json.dumps(dataset_list, indent=4))" + "data_list = api.info()\n", + "print(json.dumps(data_list, indent=4))" ] }, { @@ -343,20 +333,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2017-09-19 21:56:42,071 :gensim.api :INFO : This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. \n", - "\n" + "This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\n" ] } ], "source": [ - "api.info('glove_common_crawl_42B')" + "glove_common_crawl_info = api.info('glove_common_crawl_42B')\n", + "print(glove_common_crawl_info)" ] }, { @@ -368,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "collapsed": true }, diff --git a/gensim/downloader.py b/gensim/downloader.py index 8edd3062b6..61fb294f00 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -19,27 +19,47 @@ user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') -data_log_file_dir = os.path.join(base_dir, 'data.json') - -logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - stream=sys.stdout, level=logging.INFO) +data_log_file_path = os.path.join(base_dir, 'data.json') logger = logging.getLogger('gensim.api') -if not os.path.isdir(base_dir): - try: - logger.info("Creating %s", base_dir) - os.makedirs(base_dir) - except OSError as e: - if e.errno == errno.EEXIST: - raise Exception( - "Not able to create folder gensim-data in {}. File gensim-data " - "exists in the direcory already.".format(user_dir)) - else: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the folder manually" - .format(base_dir)) +def get_data_list(): + """Function getting the list of all datasets/models. + + Returns: + list: returns a list of datasets/models avalible for installation. + """ + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list_with_filename.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + data_names = [] + corpora = data['corpus'] + models = data['model'] + for corpus in corpora: + data_names.append(corpus) + for model in models: + data_names.append(model) + return data_names + + +def get_data_name(data_): + """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. + + Args: + data_(string): Name of the corpus/model. + + Returns: + data_: returns the name for dataset/model + """ + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/alternate_names.json" + response = urlopen(url) + alternate_names_json = response.read().decode("utf-8") + alternate_names_json = json.loads(alternate_names_json) + data_names = get_data_list() + for data_name in data_names: + alternate_data_names = alternate_names_json[data_name] + if data_ in alternate_data_names: + return data_name def initialize_data_log_file(): @@ -47,8 +67,8 @@ def initialize_data_log_file(): for each corpus/model and stores in the log file. For eg: {"name": "text8", "status" : "None"} """ data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] + corpora = data['corpus'] + models = data['model'] json_list = [] for corpus in corpora: json_object = {"name": corpus, "status": "None"} @@ -56,37 +76,72 @@ def initialize_data_log_file(): for model in models: json_object = {"name": model, "status": "None"} json_list.append(json_object) - json.dump(json_list, data_log_file) - data_log_file.close() + + with open(data_log_file_path,'w') as f: + f.write(json.dumps(json_list)) + + +def create_files(): + """Function for creating the directory for storing corpora and models, and to create a json log file. + """ + if not os.path.isdir(base_dir): + try: + logger.info("Creating %s", base_dir) + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) + + if not os.path.isfile(data_log_file_path): + try: + logger.warning("Creating %s", data_log_file_path) + with open(data_log_file_path, 'w+'): + pass + initialize_data_log_file() + except: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the file manually" + .format(data_log_file_path)) -def update_data_log_file(dataset, status): - """Function for updating the status of the dataset json object. +def update_data_log_file(data_, status): + """Function for updating the status of the data_ json object. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. status(string): Status to be updates to i.e downloaded or installed. """ - jdata = json.loads(open(data_log_file_dir).read()) - for json_object in jdata: - if json_object["name"] == dataset: - json_object["status"] = status - with open(data_log_file_dir, 'w') as f: + with open(data_log_file_path, 'r') as f: + jdata = json.load(f) + for json_object in jdata: + if json_object["name"] == data_: + json_object["status"] = status + with open(data_log_file_path, 'w+') as f: f.write(json.dumps(jdata)) -def get_data_status(dataset): - """Function for finding the status of the dataset. + +def get_data_status(data_): + """Function for finding the status of the data_. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: returns the current status of the corpus/model i.e None, downloaded or installed. """ - jdata = json.loads(open(data_log_file_dir).read()) + with open(data_log_file_path, 'r') as f: + jdata = json.load(f) for json_object in jdata: - if json_object["name"] == dataset: + if json_object["name"] == data_: return json_object["status"] @@ -108,27 +163,30 @@ def calculate_md5_checksum(folder_dir): return hash_md5.hexdigest() -def info(dataset=None): - """Function for retrieving the list of corpora/models, if dataset is not provided. If dataset - is provided, then it gives detailed information about the dataset. +def info(data_=None): + """Function for retrieving the list of corpora/models, if data name is not provided. If data name + is provided, then it gives detailed information about the data. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: : It returns the models/corpora names with detailed information about each. """ - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list_with_filename.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if dataset is not None: - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpora: - logger.info("%s \n", data['gensim']['corpus'][dataset]["desc"]) - elif dataset in models: - logger.info("%s \n", data['gensim']['model'][dataset]["desc"]) + if data_ is not None: + data_ = get_data_name(data_) + corpora = data['corpus'] + models = data['model'] + if data_ in corpora: + logger.info("%s \n", data['corpus'][data_]["desc"]) + return data['corpus'][data_]["desc"] + elif data_ in models: + logger.info("%s \n", data['model'][data_]["desc"]) + return data['model'][data_]["desc"] else: raise Exception( "Incorrect model/corpus name. Choose the model/corpus from the list " @@ -137,55 +195,44 @@ def info(dataset=None): return data -def get_checksum(dataset): +def get_checksum(data_): """Function for retrieving the checksum of a corpus/model Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: It returns the checksum for corresponding the corpus/model. """ data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpora: - return data['gensim']['corpus'][dataset]["checksum"] - elif dataset in models: - return data['gensim']['model'][dataset]["checksum"] - - -if not os.path.isfile(data_log_file_dir): - try: - logger.warning("Creating %s", data_log_file_dir) - data_log_file = open(data_log_file_dir, 'a') - initialize_data_log_file() - except: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the file manually" - .format(data_log_file_dir)) + corpora = data['corpus'] + models = data['model'] + if data_ in corpora: + return data['corpus'][data_]["checksum"] + elif data_ in models: + return data['model'][data_]["checksum"] + -def _download(dataset): - """Function for downloading and installed dataset depending upon it's current status. +def _download(data_): + """Function for downloading and installed corpus/model depending upon it's current status. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. """ - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) - data_folder_dir = os.path.join(base_dir, dataset) + url = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=data_) + data_folder_dir = os.path.join(base_dir, data_) data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset not in corpora and dataset not in models: + corpora = data['corpus'] + models = data['model'] + if data_ not in corpora and data_ not in models: raise Exception( "Incorect Model/corpus name. Use info() or" " python -m gensim.downloader -c to get a list of models/corpora" " available.") - compressed_folder_name = "{f}.tar.gz".format(f=dataset) + compressed_folder_name = "{f}.tar.gz".format(f=data_) compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - if get_data_status(dataset) != "downloaded": + if get_data_status(data_) != "downloaded": if not os.path.exists(data_folder_dir): logger.info("Creating %s", data_folder_dir) os.makedirs(data_folder_dir) @@ -196,103 +243,106 @@ def _download(dataset): "Not able to create {a}. Make sure you have the correct read/" "write permissions for {b} or you can try creating it manually". format(a=data_folder_dir, b=base_dir)) - logger.info("Downloading %s", dataset) + logger.info("Downloading %s", data_) urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(dataset) + data_url = data_links(data_) if data_url is not None: index = data_url.rfind("/") data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", dataset) - update_data_log_file(dataset, status="downloaded") - if get_data_status(dataset) != "installed": + logger.info("%s downloaded", data_) + update_data_log_file(data_, status="downloaded") + if get_data_status(data_) != "installed": tar = tarfile.open(compressed_folder_dir) logger.info("Extracting files from %s", data_folder_dir) tar.extractall(data_folder_dir) tar.close() - if calculate_md5_checksum(data_folder_dir) == get_checksum(dataset): - update_data_log_file(dataset, status="installed") - logger.info("%s installed", dataset) + if calculate_md5_checksum(data_folder_dir) == get_checksum(data_): + update_data_log_file(data_, status="installed") + logger.info("%s installed", data_) else: logger.error("There was a problem in installing the file. Retrying.") - _download(dataset) + _download(data_) -def get_filename(dataset): +def get_filename(data_): """Function of retrieving the filename of corpus/model. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: Returns the filename of the model/corpus. """ data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpora: - return data['gensim']['corpus'][dataset]["filename"] - elif dataset in models: - return data['gensim']['model'][dataset]["filename"] + corpora = data['corpus'] + models = data['model'] + if data_ in corpora: + return data['corpus'][data_]["filename"] + elif data_ in models: + return data['model'][data_]["filename"] -def load(dataset, return_path=False): +def load(data_, return_path=False): """Loads the corpus/model to the memory, if return_path is False. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. return_path(bool): Determines whether to return model/corpus file path. Returns: string: Returns the path to the model/corpus, if return_path is True. """ - file_name = get_filename(dataset) + data_ = get_data_name(data_) + create_files() + file_name = get_filename(data_) if file_name is None: raise Exception( "Incorrect model/corpus name. Choose the model/corpus from the list " "\n {}".format(json.dumps(info(), indent=4))) - folder_dir = os.path.join(base_dir, dataset) + folder_dir = os.path.join(base_dir, data_) file_dir = os.path.join(folder_dir, file_name) - if not os.path.exists(folder_dir) or get_data_status(dataset) != "installed": - _download(dataset) + if not os.path.exists(folder_dir) or get_data_status(data_) != "installed": + _download(data_) if return_path: return file_dir else: sys.path.insert(0, base_dir) - module = __import__(dataset) + module = __import__(data_) data = module.load_data() return data -def data_links(dataset): +def data_links(data_): """Function for retrieving the links of the models/corpus which are not stored in github releases Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: Returns the link of the model/corpus. """ - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/links.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if dataset in data['data_links']: - return data['data_links'][dataset]['link'] + if data_ in data['data_links']: + return data['data_links'][data_]['link'] if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") + logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', stream=sys.stdout, level=logging.INFO) + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", metavar="data__name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") args = parser.parse_args() if args.download is not None: data_path = load(args.download[0], return_path=True) logger.info("Data has been installed and data path is %s", data_path) elif args.info is not None: - info(dataset=args.info[0]) + info(data_=args.info[0]) elif args.catalogue is not None: data = info() logger.info("%s\n", json.dumps(data, indent=4)) From 498b32b2f175925189154fe4937eb95ddb7d17e4 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 3 Oct 2017 20:10:36 +0530 Subject: [PATCH 14/24] corrected formatting --- gensim/downloader.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 61fb294f00..2c9297baa4 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -22,6 +22,7 @@ data_log_file_path = os.path.join(base_dir, 'data.json') logger = logging.getLogger('gensim.api') + def get_data_list(): """Function getting the list of all datasets/models. @@ -43,7 +44,7 @@ def get_data_list(): def get_data_name(data_): - """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. + """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. Args: data_(string): Name of the corpus/model. @@ -77,12 +78,12 @@ def initialize_data_log_file(): json_object = {"name": model, "status": "None"} json_list.append(json_object) - with open(data_log_file_path,'w') as f: + with open(data_log_file_path, 'w') as f: f.write(json.dumps(json_list)) - + def create_files(): - """Function for creating the directory for storing corpora and models, and to create a json log file. + """Function for creating the directory for storing corpora and models, and to create a json log file. """ if not os.path.isdir(base_dir): try: @@ -119,7 +120,7 @@ def update_data_log_file(data_, status): data_(string): Name of the corpus/model. status(string): Status to be updates to i.e downloaded or installed. """ - with open(data_log_file_path, 'r') as f: + with open(data_log_file_path, 'r') as f: jdata = json.load(f) for json_object in jdata: if json_object["name"] == data_: @@ -128,8 +129,7 @@ def update_data_log_file(data_, status): f.write(json.dumps(jdata)) - -def get_data_status(data_): +def get_data_status(data_): """Function for finding the status of the data_. Args: @@ -163,7 +163,7 @@ def calculate_md5_checksum(folder_dir): return hash_md5.hexdigest() -def info(data_=None): +def info(data_=None): """Function for retrieving the list of corpora/models, if data name is not provided. If data name is provided, then it gives detailed information about the data. @@ -213,7 +213,6 @@ def get_checksum(data_): return data['model'][data_]["checksum"] - def _download(data_): """Function for downloading and installed corpus/model depending upon it's current status. @@ -293,7 +292,7 @@ def load(data_, return_path=False): Returns: string: Returns the path to the model/corpus, if return_path is True. """ - data_ = get_data_name(data_) + data_ = get_data_name(data_) create_files() file_name = get_filename(data_) if file_name is None: From 03649b00973b112e54726c7db9f59d835c394b04 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Thu, 5 Oct 2017 16:47:00 +0530 Subject: [PATCH 15/24] added checksum after download --- gensim/downloader.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 2c9297baa4..579e93e51f 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -145,14 +145,15 @@ def get_data_status(data_): return json_object["status"] -def calculate_md5_checksum(folder_dir): - """Function for calculating checksum of a downloaded model/corpus. +def calculate_md5_checksum(folder_dir, tar_file=None): + """Function for calculating checksum of a downloaded or installed model/corpus. Args: - folder_dir(string): Path to the downloaded model. + folder_dir(string): Path to the model/corpus folder.(contains model/corpus if proxied) + tar_file(string): Path to the dowloaded tar file. Tar file contains __init__.py file and the model/corpus(if it is stored in github releases) Returns: - string: It returns the value for the checksum for folder_dir directory + string: It returns the value for the checksum for folder_dir directory and the tar file. """ hash_md5 = hashlib.md5() for filename in os.listdir(folder_dir): @@ -160,6 +161,10 @@ def calculate_md5_checksum(folder_dir): with open(file_dir, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): hash_md5.update(chunk) + if tar_file is not None: + with open(tar_file, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) return hash_md5.hexdigest() @@ -195,7 +200,7 @@ def info(data_=None): return data -def get_checksum(data_): +def get_checksum(data_, status): """Function for retrieving the checksum of a corpus/model Args: @@ -204,13 +209,14 @@ def get_checksum(data_): Returns: string: It returns the checksum for corresponding the corpus/model. """ + key = "checksum_after_" + status data = info() corpora = data['corpus'] models = data['model'] if data_ in corpora: - return data['corpus'][data_]["checksum"] + return data['corpus'][data_][key] elif data_ in models: - return data['model'][data_]["checksum"] + return data['model'][data_][key] def _download(data_): @@ -249,18 +255,23 @@ def _download(data_): index = data_url.rfind("/") data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", data_) - update_data_log_file(data_, status="downloaded") + if calculate_md5_checksum(data_folder_dir, compressed_folder_dir) == get_checksum(data_, "download"): + logger.info("%s downloaded", data_) + update_data_log_file(data_, status="downloaded") + else: + logger.error("There was a problem in downloading the data. Retrying.") + _download(data_) + if get_data_status(data_) != "installed": tar = tarfile.open(compressed_folder_dir) logger.info("Extracting files from %s", data_folder_dir) tar.extractall(data_folder_dir) tar.close() - if calculate_md5_checksum(data_folder_dir) == get_checksum(data_): + if calculate_md5_checksum(data_folder_dir) == get_checksum(data_, "installation"): update_data_log_file(data_, status="installed") logger.info("%s installed", data_) else: - logger.error("There was a problem in installing the file. Retrying.") + logger.error("There was a problem in installing the dataset/model. Retrying.") _download(data_) @@ -303,6 +314,7 @@ def load(data_, return_path=False): file_dir = os.path.join(folder_dir, file_name) if not os.path.exists(folder_dir) or get_data_status(data_) != "installed": _download(data_) + if return_path: return file_dir else: @@ -330,7 +342,7 @@ def data_links(data_): if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', stream=sys.stdout, level=logging.INFO) + logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO) parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") group = parser.add_mutually_exclusive_group() group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") From 7fbf228d04bd3e548d6bdaceba0ebf28ce0c963f Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 17 Oct 2017 14:14:05 +0530 Subject: [PATCH 16/24] refactored code --- docs/notebooks/API_tutorial.ipynb | 392 ------------------------------ gensim/downloader.py | 335 ++++++++----------------- 2 files changed, 99 insertions(+), 628 deletions(-) delete mode 100644 docs/notebooks/API_tutorial.ipynb diff --git a/docs/notebooks/API_tutorial.ipynb b/docs/notebooks/API_tutorial.ipynb deleted file mode 100644 index 5b9d28c59b..0000000000 --- a/docs/notebooks/API_tutorial.ipynb +++ /dev/null @@ -1,392 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tutorial for using Gensim's API for downloading corpuses/models\n", - "Let's start by importing the api module." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import gensim.downloader as api" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, lets download the text8 corpus and load it." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-09-30 20:00:53,429 : INFO : Creating /home/chaitali/gensim-data/text8\n", - "2017-09-30 20:00:53,431 : INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", - "2017-09-30 20:00:53,433 : INFO : Downloading text8\n", - "2017-09-30 20:04:46,938 : INFO : text8 downloaded\n", - "2017-09-30 20:04:46,951 : INFO : Extracting files from /home/chaitali/gensim-data/text8\n", - "2017-09-30 20:04:48,888 : INFO : text8 installed\n" - ] - } - ], - "source": [ - "import logging\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", - "corpus = api.load('text8')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As the corpus has been installed, let's create a word2vec model of our corpus." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-09-30 20:04:59,672 : INFO : collecting all words and their counts\n", - "2017-09-30 20:04:59,677 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", - "2017-09-30 20:05:06,425 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", - "2017-09-30 20:05:06,426 : INFO : Loading a fresh vocabulary\n", - "2017-09-30 20:05:06,711 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", - "2017-09-30 20:05:06,711 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", - "2017-09-30 20:05:06,925 : INFO : deleting the raw counts dictionary of 253854 items\n", - "2017-09-30 20:05:06,935 : INFO : sample=0.001 downsamples 38 most-common words\n", - "2017-09-30 20:05:06,936 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", - "2017-09-30 20:05:06,938 : INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", - "2017-09-30 20:05:07,267 : INFO : resetting layer weights\n", - "2017-09-30 20:05:08,240 : INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", - "2017-09-30 20:05:09,250 : INFO : PROGRESS: at 1.09% examples, 677048 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:10,260 : INFO : PROGRESS: at 2.22% examples, 682662 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:05:11,266 : INFO : PROGRESS: at 3.35% examples, 688976 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:12,281 : INFO : PROGRESS: at 4.47% examples, 688776 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:13,283 : INFO : PROGRESS: at 5.58% examples, 692300 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:14,291 : INFO : PROGRESS: at 6.71% examples, 695154 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:15,298 : INFO : PROGRESS: at 7.83% examples, 695477 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:16,302 : INFO : PROGRESS: at 8.94% examples, 694913 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:17,320 : INFO : PROGRESS: at 10.04% examples, 693561 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:18,334 : INFO : PROGRESS: at 11.17% examples, 694297 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:19,349 : INFO : PROGRESS: at 12.28% examples, 693175 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:20,357 : INFO : PROGRESS: at 13.39% examples, 693211 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:21,369 : INFO : PROGRESS: at 14.52% examples, 693794 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-30 20:05:22,377 : INFO : PROGRESS: at 15.57% examples, 689525 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:23,377 : INFO : PROGRESS: at 16.68% examples, 689973 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:24,391 : INFO : PROGRESS: at 17.72% examples, 686717 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:25,398 : INFO : PROGRESS: at 18.86% examples, 687798 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:05:26,409 : INFO : PROGRESS: at 20.00% examples, 688486 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:27,411 : INFO : PROGRESS: at 21.15% examples, 689967 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:28,424 : INFO : PROGRESS: at 22.27% examples, 689469 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:29,432 : INFO : PROGRESS: at 23.41% examples, 690341 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:30,440 : INFO : PROGRESS: at 24.53% examples, 690470 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:31,454 : INFO : PROGRESS: at 25.64% examples, 690757 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:32,460 : INFO : PROGRESS: at 26.76% examples, 691272 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:33,468 : INFO : PROGRESS: at 27.89% examples, 691773 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:34,481 : INFO : PROGRESS: at 29.02% examples, 692058 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:35,491 : INFO : PROGRESS: at 30.14% examples, 692208 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-30 20:05:36,501 : INFO : PROGRESS: at 31.23% examples, 691812 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:37,507 : INFO : PROGRESS: at 32.35% examples, 691913 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:38,527 : INFO : PROGRESS: at 33.47% examples, 691941 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:05:39,534 : INFO : PROGRESS: at 34.59% examples, 692104 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:40,535 : INFO : PROGRESS: at 35.73% examples, 692232 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:41,542 : INFO : PROGRESS: at 36.83% examples, 691803 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:42,549 : INFO : PROGRESS: at 37.95% examples, 692107 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:43,561 : INFO : PROGRESS: at 39.08% examples, 692095 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:44,572 : INFO : PROGRESS: at 40.21% examples, 692168 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:45,574 : INFO : PROGRESS: at 41.35% examples, 692460 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:46,586 : INFO : PROGRESS: at 42.46% examples, 692126 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:47,597 : INFO : PROGRESS: at 43.61% examples, 692641 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:48,599 : INFO : PROGRESS: at 44.71% examples, 692672 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:49,609 : INFO : PROGRESS: at 45.81% examples, 692414 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:50,613 : INFO : PROGRESS: at 46.89% examples, 692196 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:05:51,614 : INFO : PROGRESS: at 47.98% examples, 692082 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:05:52,615 : INFO : PROGRESS: at 49.08% examples, 691919 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:05:53,622 : INFO : PROGRESS: at 50.18% examples, 691886 words/s, in_qsize 5, out_qsize 2\n", - "2017-09-30 20:05:54,631 : INFO : PROGRESS: at 51.30% examples, 691985 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:55,631 : INFO : PROGRESS: at 52.39% examples, 691836 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:56,644 : INFO : PROGRESS: at 53.50% examples, 691643 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:57,650 : INFO : PROGRESS: at 54.61% examples, 691789 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:58,651 : INFO : PROGRESS: at 55.72% examples, 691433 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:05:59,651 : INFO : PROGRESS: at 56.81% examples, 691258 words/s, in_qsize 2, out_qsize 1\n", - "2017-09-30 20:06:00,662 : INFO : PROGRESS: at 57.93% examples, 691247 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:01,666 : INFO : PROGRESS: at 59.05% examples, 691236 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:06:02,674 : INFO : PROGRESS: at 60.16% examples, 691214 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:03,692 : INFO : PROGRESS: at 61.31% examples, 691243 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:06:04,693 : INFO : PROGRESS: at 62.41% examples, 691156 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:05,706 : INFO : PROGRESS: at 63.54% examples, 691252 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:06,726 : INFO : PROGRESS: at 64.66% examples, 691169 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:06:07,742 : INFO : PROGRESS: at 65.77% examples, 691212 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:08,747 : INFO : PROGRESS: at 66.88% examples, 691305 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:09,752 : INFO : PROGRESS: at 67.98% examples, 691292 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:10,761 : INFO : PROGRESS: at 69.09% examples, 691231 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:06:11,767 : INFO : PROGRESS: at 70.21% examples, 691345 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:06:12,776 : INFO : PROGRESS: at 71.30% examples, 691192 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:06:13,779 : INFO : PROGRESS: at 72.39% examples, 691076 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:06:14,783 : INFO : PROGRESS: at 73.50% examples, 691043 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:15,799 : INFO : PROGRESS: at 74.60% examples, 690947 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:06:16,804 : INFO : PROGRESS: at 75.72% examples, 690751 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:17,823 : INFO : PROGRESS: at 76.83% examples, 690552 words/s, in_qsize 5, out_qsize 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-09-30 20:06:18,828 : INFO : PROGRESS: at 77.95% examples, 690734 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:19,838 : INFO : PROGRESS: at 79.07% examples, 690668 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:20,845 : INFO : PROGRESS: at 80.20% examples, 690773 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:21,848 : INFO : PROGRESS: at 81.33% examples, 690841 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:22,853 : INFO : PROGRESS: at 82.43% examples, 690748 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:23,861 : INFO : PROGRESS: at 83.55% examples, 690766 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:24,868 : INFO : PROGRESS: at 84.67% examples, 690835 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-30 20:06:25,870 : INFO : PROGRESS: at 85.77% examples, 690917 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:26,876 : INFO : PROGRESS: at 86.88% examples, 690986 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-30 20:06:27,879 : INFO : PROGRESS: at 88.00% examples, 691091 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:28,896 : INFO : PROGRESS: at 89.12% examples, 691157 words/s, in_qsize 5, out_qsize 1\n", - "2017-09-30 20:06:29,900 : INFO : PROGRESS: at 90.22% examples, 691067 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:30,901 : INFO : PROGRESS: at 91.31% examples, 691025 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:31,903 : INFO : PROGRESS: at 92.42% examples, 691030 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:32,915 : INFO : PROGRESS: at 93.52% examples, 690949 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:33,927 : INFO : PROGRESS: at 94.64% examples, 690994 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-30 20:06:34,932 : INFO : PROGRESS: at 95.76% examples, 690838 words/s, in_qsize 5, out_qsize 1\n", - "2017-09-30 20:06:35,949 : INFO : PROGRESS: at 96.86% examples, 690691 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:36,955 : INFO : PROGRESS: at 97.99% examples, 690835 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:37,971 : INFO : PROGRESS: at 99.13% examples, 690908 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-30 20:06:38,752 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2017-09-30 20:06:38,756 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2017-09-30 20:06:38,759 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2017-09-30 20:06:38,760 : INFO : training on 85026035 raw words (62532240 effective words) took 90.5s, 690828 effective words/s\n" - ] - } - ], - "source": [ - "from gensim.models.word2vec import Word2Vec\n", - "model = Word2Vec(corpus)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have our word2vec model, let's find words that are similar to 'tree'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-09-30 20:08:12,509 : INFO : precomputing L2-norms of word weight vectors\n" - ] - }, - { - "data": { - "text/plain": [ - "[('trees', 0.7073001861572266),\n", - " ('bark', 0.7032904028892517),\n", - " ('leaf', 0.6881209015846252),\n", - " ('bird', 0.6044381260871887),\n", - " ('flower', 0.6009336709976196),\n", - " ('fruit', 0.597153902053833),\n", - " ('avl', 0.5837888717651367),\n", - " ('cactus', 0.5712562799453735),\n", - " ('bee', 0.5658263564109802),\n", - " ('garden', 0.565678596496582)]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.most_similar('tree')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use the API to download many corpuses and models. You can get the list of all the models and corpuses that are provided, by using the code below:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"model\": {\n", - " \"Google_News_word2vec\": {\n", - " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", - " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", - " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", - " },\n", - " \"fasttext_eng_model\": {\n", - " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", - " \"filename\": \"wiki.en.vec\",\n", - " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", - " },\n", - " \"glove_common_crawl_42B\": {\n", - " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.42B.300d.zip\",\n", - " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", - " },\n", - " \"glove_common_crawl_840B\": {\n", - " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.840B.300d.zip\",\n", - " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", - " },\n", - " \"glove_wiki_gigaword_300d\": {\n", - " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.300d.txt\",\n", - " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", - " },\n", - " \"glove_wiki_gigaword_200d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.200d.txt\",\n", - " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", - " },\n", - " \"glove_wiki_gigaword_100d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.100d.txt\",\n", - " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", - " },\n", - " \"glove_wiki_gigaword_50d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.50d.txt\",\n", - " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", - " },\n", - " \"glove_twitter_200d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.200d.txt\",\n", - " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", - " },\n", - " \"glove_twitter_100d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.100d.txt\",\n", - " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", - " },\n", - " \"glove_twitter_50d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.50d.txt\",\n", - " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", - " },\n", - " \"glove_twitter_25d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.25d.txt\",\n", - " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", - " }\n", - " },\n", - " \"corpus\": {\n", - " \"text8\": {\n", - " \"desc\": \"Wikipedia English corpus\",\n", - " \"filename\": \"text8\",\n", - " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", - " }\n", - " }\n", - "}\n" - ] - } - ], - "source": [ - "import json\n", - "data_list = api.info()\n", - "print(json.dumps(data_list, indent=4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to get detailed information about the model/corpus, use:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\n" - ] - } - ], - "source": [ - "glove_common_crawl_info = api.info('glove_common_crawl_42B')\n", - "print(glove_common_crawl_info)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sometimes, you do not want to load the corpus/model to memory. You would just want to get the path to the corpus/model. For that, use :" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "text8_path = api.load('text8', return_path=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/gensim/downloader.py b/gensim/downloader.py index 579e93e51f..a70ffa3ea9 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -7,6 +7,8 @@ import sys import errno import hashlib +from shutil import rmtree +import tempfile try: import urllib.request as urllib except ImportError: @@ -23,53 +25,10 @@ logger = logging.getLogger('gensim.api') -def get_data_list(): - """Function getting the list of all datasets/models. - - Returns: - list: returns a list of datasets/models avalible for installation. - """ - url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list_with_filename.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - data_names = [] - corpora = data['corpus'] - models = data['model'] - for corpus in corpora: - data_names.append(corpus) - for model in models: - data_names.append(model) - return data_names - - -def get_data_name(data_): - """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. - - Args: - data_(string): Name of the corpus/model. - - Returns: - data_: returns the name for dataset/model - """ - url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/alternate_names.json" - response = urlopen(url) - alternate_names_json = response.read().decode("utf-8") - alternate_names_json = json.loads(alternate_names_json) - data_names = get_data_list() - for data_name in data_names: - alternate_data_names = alternate_names_json[data_name] - if data_ in alternate_data_names: - return data_name - - -def initialize_data_log_file(): - """Function for initializing the log file. Creates a json object - for each corpus/model and stores in the log file. For eg: {"name": "text8", "status" : "None"} - """ +def _initialize_data_log_file(): data = info() - corpora = data['corpus'] - models = data['model'] + corpora = data['corpora'] + models = data['models'] json_list = [] for corpus in corpora: json_object = {"name": corpus, "status": "None"} @@ -77,14 +36,11 @@ def initialize_data_log_file(): for model in models: json_object = {"name": model, "status": "None"} json_list.append(json_object) - with open(data_log_file_path, 'w') as f: f.write(json.dumps(json_list)) -def create_files(): - """Function for creating the directory for storing corpora and models, and to create a json log file. - """ +def _create_files(): if not os.path.isdir(base_dir): try: logger.info("Creating %s", base_dir) @@ -105,7 +61,7 @@ def create_files(): logger.warning("Creating %s", data_log_file_path) with open(data_log_file_path, 'w+'): pass - initialize_data_log_file() + _initialize_data_log_file() except: raise Exception( "Can't create {}. Make sure you have the read/write permissions " @@ -113,85 +69,46 @@ def create_files(): .format(data_log_file_path)) -def update_data_log_file(data_, status): - """Function for updating the status of the data_ json object. - - Args: - data_(string): Name of the corpus/model. - status(string): Status to be updates to i.e downloaded or installed. - """ +def _update_data_log_file(name): with open(data_log_file_path, 'r') as f: jdata = json.load(f) for json_object in jdata: - if json_object["name"] == data_: - json_object["status"] = status + if json_object["name"] == name: + json_object["status"] = "downloaded" with open(data_log_file_path, 'w+') as f: f.write(json.dumps(jdata)) -def get_data_status(data_): - """Function for finding the status of the data_. - - Args: - data_(string): Name of the corpus/model. - - Returns: - string: returns the current status of the corpus/model i.e None, downloaded or installed. - """ +def _get_data_status(name): with open(data_log_file_path, 'r') as f: jdata = json.load(f) for json_object in jdata: - if json_object["name"] == data_: + if json_object["name"] == name: return json_object["status"] -def calculate_md5_checksum(folder_dir, tar_file=None): - """Function for calculating checksum of a downloaded or installed model/corpus. - - Args: - folder_dir(string): Path to the model/corpus folder.(contains model/corpus if proxied) - tar_file(string): Path to the dowloaded tar file. Tar file contains __init__.py file and the model/corpus(if it is stored in github releases) - - Returns: - string: It returns the value for the checksum for folder_dir directory and the tar file. - """ +def _calculate_md5_checksum(tar_file): hash_md5 = hashlib.md5() - for filename in os.listdir(folder_dir): - file_dir = os.path.join(folder_dir, filename) - with open(file_dir, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - if tar_file is not None: - with open(tar_file, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) + with open(tar_file, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) return hash_md5.hexdigest() -def info(data_=None): - """Function for retrieving the list of corpora/models, if data name is not provided. If data name - is provided, then it gives detailed information about the data. - - Args: - data_(string): Name of the corpus/model. - - Returns: - : It returns the models/corpora names with detailed information about each. - """ - url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list_with_filename.json" +def info(name=None): + url = "https://raw.githubusercontent.com/chaitaliSaini/gensim-data/master/list.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if data_ is not None: - data_ = get_data_name(data_) - corpora = data['corpus'] - models = data['model'] - if data_ in corpora: - logger.info("%s \n", data['corpus'][data_]["desc"]) - return data['corpus'][data_]["desc"] - elif data_ in models: - logger.info("%s \n", data['model'][data_]["desc"]) - return data['model'][data_]["desc"] + if name is not None: + corpora = data['corpora'] + models = data['models'] + if name in corpora: + logger.info("%s \n", json.dumps(data['corpora'][name], indent=4)) + return data['corpora'][name] + elif name in models: + logger.info("%s \n", json.dumps(data['corpora'][name], indent=4)) + return data['models'][name] else: raise Exception( "Incorrect model/corpus name. Choose the model/corpus from the list " @@ -200,160 +117,106 @@ def info(data_=None): return data -def get_checksum(data_, status): - """Function for retrieving the checksum of a corpus/model - - Args: - data_(string): Name of the corpus/model. - - Returns: - string: It returns the checksum for corresponding the corpus/model. - """ - key = "checksum_after_" + status +def _get_checksum(name): data = info() - corpora = data['corpus'] - models = data['model'] - if data_ in corpora: - return data['corpus'][data_][key] - elif data_ in models: - return data['model'][data_][key] - - -def _download(data_): - """Function for downloading and installed corpus/model depending upon it's current status. - - Args: - data_(string): Name of the corpus/model. - """ - url = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=data_) - data_folder_dir = os.path.join(base_dir, data_) - data = info() - corpora = data['corpus'] - models = data['model'] - if data_ not in corpora and data_ not in models: + corpora = data['corpora'] + models = data['models'] + if name in corpora: + return data['corpora'][name]["checksum"] + elif name in models: + return data['models'][name]["checksum"] + + +def _download(name): + url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) + url_load_file = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/__init__.py".format(f=name) + data_folder_dir = os.path.join(base_dir, name) + compressed_folder_name = "{f}.tar.gz".format(f=name) + if os.path.exists(data_folder_dir): + logger.info("%s already exists. Deleting it.", data_folder_dir) + rmtree(data_folder_dir) + logger.info("Creating %s", data_folder_dir) + os.makedirs(data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of %s successful.", data_folder_dir) + else: raise Exception( - "Incorect Model/corpus name. Use info() or" - " python -m gensim.downloader -c to get a list of models/corpora" - " available.") - compressed_folder_name = "{f}.tar.gz".format(f=data_) - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - if get_data_status(data_) != "downloaded": - if not os.path.exists(data_folder_dir): - logger.info("Creating %s", data_folder_dir) - os.makedirs(data_folder_dir) - if os.path.exists(data_folder_dir): - logger.info("Creation of %s successful.", data_folder_dir) - else: - raise Exception( - "Not able to create {a}. Make sure you have the correct read/" - "write permissions for {b} or you can try creating it manually". - format(a=data_folder_dir, b=base_dir)) - logger.info("Downloading %s", data_) - urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(data_) - if data_url is not None: - index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) - urllib.urlretrieve(data_url, data_dir) - if calculate_md5_checksum(data_folder_dir, compressed_folder_dir) == get_checksum(data_, "download"): - logger.info("%s downloaded", data_) - update_data_log_file(data_, status="downloaded") - else: - logger.error("There was a problem in downloading the data. Retrying.") - _download(data_) - - if get_data_status(data_) != "installed": - tar = tarfile.open(compressed_folder_dir) - logger.info("Extracting files from %s", data_folder_dir) - tar.extractall(data_folder_dir) - tar.close() - if calculate_md5_checksum(data_folder_dir) == get_checksum(data_, "installation"): - update_data_log_file(data_, status="installed") - logger.info("%s installed", data_) - else: - logger.error("There was a problem in installing the dataset/model. Retrying.") - _download(data_) - - -def get_filename(data_): - """Function of retrieving the filename of corpus/model. + "Not able to create {a}. Make sure you have the correct read/" + "write permissions for {b} or you can try creating it manually". + format(a=data_folder_dir, b=base_dir)) + + tmp_dir = tempfile.mkdtemp() + tmp_data_folder_dir = os.path.join(tmp_dir, name) + os.makedirs(tmp_data_folder_dir) + if not os.path.exists(tmp_data_folder_dir): + raise Exception( + "Not able to create data folder in {a}. Make sure you have the correct" + " read/write permissions for {a}".format(a=os.path.dirname(tmp_dir))) - Args: - data_(string): Name of the corpus/model. + tmp_load_file = os.path.join(tmp_data_folder_dir, "__init__.py") + urllib.urlretrieve(url_load_file, tmp_load_file) + logger.info("Downloading %s", name) + tmp_data_file = os.path.join(tmp_dir, compressed_folder_name) + urllib.urlretrieve(url_data, tmp_data_file) - Returns: - string: Returns the filename of the model/corpus. - """ + if _calculate_md5_checksum(tmp_data_file) == _get_checksum(name): + logger.info("%s downloaded", name) + else: + rmtree(tmp_dir) + raise Exception("There was a problem in downloading the data. We recommend you to re-try.") + tar = tarfile.open(tmp_data_file) + tar.extractall(tmp_data_folder_dir) + tar.close() + os.remove(tmp_data_file) + os.rename(tmp_data_folder_dir, data_folder_dir) + os.rmdir(tmp_dir) + _update_data_log_file(name) + + +def _get_filename(name): data = info() - corpora = data['corpus'] - models = data['model'] - if data_ in corpora: - return data['corpus'][data_]["filename"] - elif data_ in models: - return data['model'][data_]["filename"] - + corpora = data['corpora'] + models = data['models'] + if name in corpora: + return data['corpora'][name]["file_name"] + elif name in models: + return data['models'][name]["file_name"] -def load(data_, return_path=False): - """Loads the corpus/model to the memory, if return_path is False. - Args: - data_(string): Name of the corpus/model. - return_path(bool): Determines whether to return model/corpus file path. - - Returns: - string: Returns the path to the model/corpus, if return_path is True. - """ - data_ = get_data_name(data_) - create_files() - file_name = get_filename(data_) +def load(name, return_path=False): + _create_files() + file_name = _get_filename(name) if file_name is None: raise Exception( "Incorrect model/corpus name. Choose the model/corpus from the list " "\n {}".format(json.dumps(info(), indent=4))) - folder_dir = os.path.join(base_dir, data_) - file_dir = os.path.join(folder_dir, file_name) - if not os.path.exists(folder_dir) or get_data_status(data_) != "installed": - _download(data_) + folder_dir = os.path.join(base_dir, name) + data_dir = os.path.join(folder_dir, file_name) + if not os.path.exists(folder_dir) or _get_data_status(name) != "downloaded": + _download(name) if return_path: - return file_dir + return data_dir else: sys.path.insert(0, base_dir) - module = __import__(data_) + module = __import__(name) data = module.load_data() return data -def data_links(data_): - """Function for retrieving the links of the models/corpus which are not stored in github releases - - Args: - data_(string): Name of the corpus/model. - - Returns: - string: Returns the link of the model/corpus. - """ - url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/links.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if data_ in data['data_links']: - return data['data_links'][data_]['link'] - - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO) parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", metavar="data__name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") - group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") + group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim.downloader -d corpus/model name") + group.add_argument("-i", "--info", metavar="data__name", nargs=1, help="To get information about a corpus/model : python -m gensim.downloader -i model/corpus name") + group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim.downloader -c", action="store_true") args = parser.parse_args() if args.download is not None: data_path = load(args.download[0], return_path=True) logger.info("Data has been installed and data path is %s", data_path) elif args.info is not None: - info(data_=args.info[0]) + info(name=args.info[0]) elif args.catalogue is not None: data = info() logger.info("%s\n", json.dumps(data, indent=4)) From d0311d197bb97a0d9dd819b58e4c86be945afd53 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 17 Oct 2017 15:16:06 +0530 Subject: [PATCH 17/24] removed log file code --- gensim/downloader.py | 70 +++----------------------------------------- 1 file changed, 4 insertions(+), 66 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index a70ffa3ea9..54cd4b03b2 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import argparse -import json import os +import json import tarfile import logging import sys @@ -21,26 +21,10 @@ user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') -data_log_file_path = os.path.join(base_dir, 'data.json') logger = logging.getLogger('gensim.api') -def _initialize_data_log_file(): - data = info() - corpora = data['corpora'] - models = data['models'] - json_list = [] - for corpus in corpora: - json_object = {"name": corpus, "status": "None"} - json_list.append(json_object) - for model in models: - json_object = {"name": model, "status": "None"} - json_list.append(json_object) - with open(data_log_file_path, 'w') as f: - f.write(json.dumps(json_list)) - - -def _create_files(): +def _create_base_dir(): if not os.path.isdir(base_dir): try: logger.info("Creating %s", base_dir) @@ -56,36 +40,6 @@ def _create_files(): "to the directory or you can try creating the folder manually" .format(base_dir)) - if not os.path.isfile(data_log_file_path): - try: - logger.warning("Creating %s", data_log_file_path) - with open(data_log_file_path, 'w+'): - pass - _initialize_data_log_file() - except: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the file manually" - .format(data_log_file_path)) - - -def _update_data_log_file(name): - with open(data_log_file_path, 'r') as f: - jdata = json.load(f) - for json_object in jdata: - if json_object["name"] == name: - json_object["status"] = "downloaded" - with open(data_log_file_path, 'w+') as f: - f.write(json.dumps(jdata)) - - -def _get_data_status(name): - with open(data_log_file_path, 'r') as f: - jdata = json.load(f) - for json_object in jdata: - if json_object["name"] == name: - return json_object["status"] - def _calculate_md5_checksum(tar_file): hash_md5 = hashlib.md5() @@ -132,19 +86,6 @@ def _download(name): url_load_file = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/__init__.py".format(f=name) data_folder_dir = os.path.join(base_dir, name) compressed_folder_name = "{f}.tar.gz".format(f=name) - if os.path.exists(data_folder_dir): - logger.info("%s already exists. Deleting it.", data_folder_dir) - rmtree(data_folder_dir) - logger.info("Creating %s", data_folder_dir) - os.makedirs(data_folder_dir) - if os.path.exists(data_folder_dir): - logger.info("Creation of %s successful.", data_folder_dir) - else: - raise Exception( - "Not able to create {a}. Make sure you have the correct read/" - "write permissions for {b} or you can try creating it manually". - format(a=data_folder_dir, b=base_dir)) - tmp_dir = tempfile.mkdtemp() tmp_data_folder_dir = os.path.join(tmp_dir, name) os.makedirs(tmp_data_folder_dir) @@ -152,13 +93,11 @@ def _download(name): raise Exception( "Not able to create data folder in {a}. Make sure you have the correct" " read/write permissions for {a}".format(a=os.path.dirname(tmp_dir))) - tmp_load_file = os.path.join(tmp_data_folder_dir, "__init__.py") urllib.urlretrieve(url_load_file, tmp_load_file) logger.info("Downloading %s", name) tmp_data_file = os.path.join(tmp_dir, compressed_folder_name) urllib.urlretrieve(url_data, tmp_data_file) - if _calculate_md5_checksum(tmp_data_file) == _get_checksum(name): logger.info("%s downloaded", name) else: @@ -170,7 +109,6 @@ def _download(name): os.remove(tmp_data_file) os.rename(tmp_data_folder_dir, data_folder_dir) os.rmdir(tmp_dir) - _update_data_log_file(name) def _get_filename(name): @@ -184,7 +122,7 @@ def _get_filename(name): def load(name, return_path=False): - _create_files() + _create_base_dir() file_name = _get_filename(name) if file_name is None: raise Exception( @@ -192,7 +130,7 @@ def load(name, return_path=False): "\n {}".format(json.dumps(info(), indent=4))) folder_dir = os.path.join(base_dir, name) data_dir = os.path.join(folder_dir, file_name) - if not os.path.exists(folder_dir) or _get_data_status(name) != "downloaded": + if not os.path.exists(folder_dir): _download(name) if return_path: From 7e00e2d9323378b88ff224f0f7aada1428e0860e Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Wed, 25 Oct 2017 13:41:24 +0530 Subject: [PATCH 18/24] added progressbar --- docs/notebooks/API_tutorial.ipynb | 416 ++++++++++++++++++++++++++++++ gensim/downloader.py | 140 +++++++++- 2 files changed, 543 insertions(+), 13 deletions(-) create mode 100644 docs/notebooks/API_tutorial.ipynb diff --git a/docs/notebooks/API_tutorial.ipynb b/docs/notebooks/API_tutorial.ipynb new file mode 100644 index 0000000000..2fc5f47b43 --- /dev/null +++ b/docs/notebooks/API_tutorial.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial for using Gensim's API for downloading corpuses/models\n", + "Let's start by importing the api module." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gensim.downloader as api" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, lets download the text8 corpus. For that, you have to use the load function. If the dataset has been already downloaded, then corpus path will be returned. Otherwise, dataset will be downloaded and then path will be returned." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-10-25 00:54:48,792 : INFO : Downloading text8\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[==================================================] 100.0% 31.7/31.7MB downloaded\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-10-25 00:55:11,503 : INFO : text8 downloaded\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to corpus: /home/chaitali/gensim-data/text8/text8\n" + ] + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", + "corpus_path = api.load('text8')\n", + "print(\"Path to corpus: \", corpus_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the corpus has been downloaded and extracted, let's create a word2vec model of our corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-10-25 00:55:47,594 : INFO : collecting all words and their counts\n", + "2017-10-25 00:55:47,600 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-10-25 00:55:55,687 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", + "2017-10-25 00:55:55,688 : INFO : Loading a fresh vocabulary\n", + "2017-10-25 00:55:55,968 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", + "2017-10-25 00:55:55,969 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", + "2017-10-25 00:55:56,172 : INFO : deleting the raw counts dictionary of 253854 items\n", + "2017-10-25 00:55:56,181 : INFO : sample=0.001 downsamples 38 most-common words\n", + "2017-10-25 00:55:56,181 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", + "2017-10-25 00:55:56,182 : INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", + "2017-10-25 00:55:56,474 : INFO : resetting layer weights\n", + "2017-10-25 00:55:57,350 : INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-10-25 00:55:58,358 : INFO : PROGRESS: at 1.19% examples, 736000 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:55:59,368 : INFO : PROGRESS: at 2.06% examples, 633832 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:00,373 : INFO : PROGRESS: at 3.25% examples, 667998 words/s, in_qsize 5, out_qsize 1\n", + "2017-10-25 00:56:01,375 : INFO : PROGRESS: at 4.44% examples, 688247 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:02,376 : INFO : PROGRESS: at 5.62% examples, 699262 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:03,376 : INFO : PROGRESS: at 6.81% examples, 708000 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:04,403 : INFO : PROGRESS: at 7.70% examples, 684594 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:05,407 : INFO : PROGRESS: at 8.34% examples, 648645 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:06,415 : INFO : PROGRESS: at 9.54% examples, 659722 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:07,427 : INFO : PROGRESS: at 10.72% examples, 667502 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:08,434 : INFO : PROGRESS: at 11.91% examples, 674170 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:09,441 : INFO : PROGRESS: at 13.12% examples, 680713 words/s, in_qsize 3, out_qsize 0\n", + "2017-10-25 00:56:10,443 : INFO : PROGRESS: at 14.30% examples, 685139 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:11,455 : INFO : PROGRESS: at 15.51% examples, 688569 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:12,455 : INFO : PROGRESS: at 16.71% examples, 692452 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:13,456 : INFO : PROGRESS: at 17.90% examples, 695587 words/s, in_qsize 4, out_qsize 0\n", + "2017-10-25 00:56:14,456 : INFO : PROGRESS: at 19.07% examples, 697658 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:15,460 : INFO : PROGRESS: at 20.27% examples, 700206 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:16,477 : INFO : PROGRESS: at 21.50% examples, 702899 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:17,495 : INFO : PROGRESS: at 22.69% examples, 704046 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:18,499 : INFO : PROGRESS: at 23.90% examples, 706366 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:19,502 : INFO : PROGRESS: at 25.08% examples, 707804 words/s, in_qsize 5, out_qsize 1\n", + "2017-10-25 00:56:20,514 : INFO : PROGRESS: at 26.27% examples, 709358 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:21,519 : INFO : PROGRESS: at 27.45% examples, 710918 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:22,536 : INFO : PROGRESS: at 28.67% examples, 712387 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:23,537 : INFO : PROGRESS: at 29.85% examples, 713647 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:24,537 : INFO : PROGRESS: at 31.02% examples, 714298 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:25,542 : INFO : PROGRESS: at 32.16% examples, 714256 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:26,551 : INFO : PROGRESS: at 33.33% examples, 714777 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:27,555 : INFO : PROGRESS: at 34.51% examples, 715477 words/s, in_qsize 6, out_qsize 0\n", + "2017-10-25 00:56:28,560 : INFO : PROGRESS: at 35.72% examples, 716204 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:29,571 : INFO : PROGRESS: at 36.88% examples, 716264 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:30,579 : INFO : PROGRESS: at 38.08% examples, 717179 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:31,581 : INFO : PROGRESS: at 39.24% examples, 717091 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:32,593 : INFO : PROGRESS: at 40.19% examples, 713299 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:33,593 : INFO : PROGRESS: at 40.87% examples, 705355 words/s, in_qsize 6, out_qsize 1\n", + "2017-10-25 00:56:34,596 : INFO : PROGRESS: at 41.92% examples, 703678 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:35,628 : INFO : PROGRESS: at 42.99% examples, 702102 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:36,633 : INFO : PROGRESS: at 43.97% examples, 699841 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:37,647 : INFO : PROGRESS: at 45.14% examples, 700441 words/s, in_qsize 5, out_qsize 1\n", + "2017-10-25 00:56:38,665 : INFO : PROGRESS: at 45.90% examples, 694902 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:39,667 : INFO : PROGRESS: at 46.57% examples, 688487 words/s, in_qsize 6, out_qsize 0\n", + "2017-10-25 00:56:40,673 : INFO : PROGRESS: at 47.30% examples, 683151 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:41,675 : INFO : PROGRESS: at 48.01% examples, 677667 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:42,688 : INFO : PROGRESS: at 49.16% examples, 678471 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:43,695 : INFO : PROGRESS: at 50.29% examples, 679025 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:44,708 : INFO : PROGRESS: at 51.49% examples, 680405 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:45,710 : INFO : PROGRESS: at 52.66% examples, 681577 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:46,724 : INFO : PROGRESS: at 53.85% examples, 682664 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:47,753 : INFO : PROGRESS: at 54.65% examples, 678686 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:48,768 : INFO : PROGRESS: at 55.43% examples, 674446 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:49,784 : INFO : PROGRESS: at 56.37% examples, 672526 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:50,791 : INFO : PROGRESS: at 57.52% examples, 673311 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:51,796 : INFO : PROGRESS: at 58.65% examples, 673821 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:52,801 : INFO : PROGRESS: at 59.78% examples, 674243 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:56:53,825 : INFO : PROGRESS: at 60.74% examples, 672693 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:54,829 : INFO : PROGRESS: at 61.89% examples, 673317 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:55,830 : INFO : PROGRESS: at 63.09% examples, 674568 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:56,837 : INFO : PROGRESS: at 64.24% examples, 675262 words/s, in_qsize 4, out_qsize 0\n", + "2017-10-25 00:56:57,838 : INFO : PROGRESS: at 65.33% examples, 675366 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:58,851 : INFO : PROGRESS: at 66.35% examples, 674781 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:56:59,864 : INFO : PROGRESS: at 67.03% examples, 670749 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:00,874 : INFO : PROGRESS: at 68.14% examples, 670995 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:57:01,877 : INFO : PROGRESS: at 69.29% examples, 671790 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:02,885 : INFO : PROGRESS: at 70.46% examples, 672715 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:03,887 : INFO : PROGRESS: at 71.63% examples, 673593 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:57:04,896 : INFO : PROGRESS: at 72.82% examples, 674561 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:05,902 : INFO : PROGRESS: at 74.02% examples, 675632 words/s, in_qsize 6, out_qsize 1\n", + "2017-10-25 00:57:06,903 : INFO : PROGRESS: at 75.17% examples, 676178 words/s, in_qsize 5, out_qsize 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-10-25 00:57:07,903 : INFO : PROGRESS: at 76.34% examples, 676871 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:08,911 : INFO : PROGRESS: at 77.54% examples, 677805 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:09,915 : INFO : PROGRESS: at 78.72% examples, 678531 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:10,923 : INFO : PROGRESS: at 79.89% examples, 679198 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:57:11,929 : INFO : PROGRESS: at 81.09% examples, 680040 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:12,933 : INFO : PROGRESS: at 82.29% examples, 680809 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:13,939 : INFO : PROGRESS: at 83.47% examples, 681462 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:14,960 : INFO : PROGRESS: at 84.68% examples, 682292 words/s, in_qsize 6, out_qsize 1\n", + "2017-10-25 00:57:15,970 : INFO : PROGRESS: at 85.88% examples, 683163 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:16,985 : INFO : PROGRESS: at 87.04% examples, 683714 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:17,992 : INFO : PROGRESS: at 88.25% examples, 684584 words/s, in_qsize 6, out_qsize 0\n", + "2017-10-25 00:57:19,000 : INFO : PROGRESS: at 89.41% examples, 685015 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:20,006 : INFO : PROGRESS: at 90.58% examples, 685623 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:21,012 : INFO : PROGRESS: at 91.75% examples, 686097 words/s, in_qsize 4, out_qsize 1\n", + "2017-10-25 00:57:22,015 : INFO : PROGRESS: at 92.89% examples, 686426 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:23,016 : INFO : PROGRESS: at 94.07% examples, 687092 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:24,019 : INFO : PROGRESS: at 95.27% examples, 687694 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:25,026 : INFO : PROGRESS: at 96.39% examples, 687666 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:26,035 : INFO : PROGRESS: at 97.50% examples, 687629 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:27,044 : INFO : PROGRESS: at 98.61% examples, 687673 words/s, in_qsize 5, out_qsize 0\n", + "2017-10-25 00:57:28,052 : INFO : PROGRESS: at 99.75% examples, 687836 words/s, in_qsize 6, out_qsize 0\n", + "2017-10-25 00:57:28,270 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-10-25 00:57:28,272 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-10-25 00:57:28,275 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-10-25 00:57:28,276 : INFO : training on 85026035 raw words (62539321 effective words) took 90.9s, 687833 effective words/s\n" + ] + } + ], + "source": [ + "from gensim.models.word2vec import Text8Corpus\n", + "from gensim.models.word2vec import Word2Vec\n", + "corpus = Text8Corpus(corpus_path)\n", + "model = Word2Vec(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our word2vec model, let's find words that are similar to 'tree'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-10-25 01:00:28,412 : INFO : precomputing L2-norms of word weight vectors\n" + ] + }, + { + "data": { + "text/plain": [ + "[('trees', 0.6827899217605591),\n", + " ('leaf', 0.6613928079605103),\n", + " ('avl', 0.6607560515403748),\n", + " ('bark', 0.6449544429779053),\n", + " ('bird', 0.6373804807662964),\n", + " ('flower', 0.6225261688232422),\n", + " ('fruit', 0.6008641719818115),\n", + " ('grass', 0.5877920985221863),\n", + " ('cave', 0.5793893933296204),\n", + " ('cactus', 0.5789991021156311)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar('tree')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the API to download many corpora and models. You can get the list of all the models and corpora that are provided, by using the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"corpora\": {\n", + " \"text8\": {\n", + " \"description\": \"Cleaned small sample from wikipedia\",\n", + " \"checksum\": \"f407f5aed497fc3b0fb33b98c4f9d855\",\n", + " \"file_name\": \"text8\"\n", + " },\n", + " \"fake-news\": {\n", + " \"description\": \"It contains text and metadata scraped from 244 websites tagged as 'bullshit' here by the BS Detector Chrome Extension by Daniel Sieradski.\",\n", + " \"checksum\": \"a61f985190ba361defdfc3fef616b9cd\",\n", + " \"file_name\": \"fake.csv\",\n", + " \"source\": \"Kaggle\"\n", + " }\n", + " },\n", + " \"models\": {\n", + " \"glove-wiki-gigaword-50\": {\n", + " \"description\": \"Pre-trained vectors ,Wikipedia 2014 + Gigaword 5,6B tokens, 400K vocab, uncased. https://nlp.stanford.edu/projects/glove/\",\n", + " \"parameters\": \"dimension = 50\",\n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-wiki-gigaword-50.txt`\",\n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\",\n", + " \"checksum\": \"b269a9c8b16c3178d3d0651fd68c3fe9\",\n", + " \"file_name\": \"glove-wiki-gigaword-50.txt\"\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "data_list = api.info()\n", + "print(json.dumps(data_list, indent=4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to get detailed information about the model/corpus, use:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'description': \"It contains text and metadata scraped from 244 websites tagged as 'bullshit' here by the BS Detector Chrome Extension by Daniel Sieradski.\", 'checksum': 'a61f985190ba361defdfc3fef616b9cd', 'file_name': 'fake.csv', 'source': 'Kaggle'}\n" + ] + } + ], + "source": [ + "fake_news_info = api.info('fake-news')\n", + "print(fake_news_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, you do not want to load the model to memory. You would just want to get the path to the model. For that, use :" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/chaitali/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.txt\n" + ] + } + ], + "source": [ + "print(api.load('glove-wiki-gigaword-50', return_path=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to load the model to memory, then:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('plastic', 0.7942505478858948),\n", + " ('metal', 0.770871639251709),\n", + " ('walls', 0.7700636386871338),\n", + " ('marble', 0.7638523578643799),\n", + " ('wood', 0.7624280452728271),\n", + " ('ceramic', 0.7602593302726746),\n", + " ('pieces', 0.7589111924171448),\n", + " ('stained', 0.7528817057609558),\n", + " ('tile', 0.748193621635437),\n", + " ('furniture', 0.7463858723640442)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = api.load(\"glove-wiki-gigaword-50\")\n", + "model.most_similar(\"glass\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In corpora, the corpus is never loaded to memory. Always,(i.e it doesn't matter if return_path is True or false) the path to the dataset file/folder will be returned and then you will hve to load it yourself." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/downloader.py b/gensim/downloader.py index 54cd4b03b2..2b04801148 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -1,3 +1,4 @@ +"""This module is an API for downloading, getting information and loading datasets/models.""" from __future__ import absolute_import import argparse import os @@ -7,6 +8,7 @@ import sys import errno import hashlib +import math from shutil import rmtree import tempfile try: @@ -24,7 +26,41 @@ logger = logging.getLogger('gensim.api') +def progress(chunks_downloaded, chunk_size, total_size): + r"""Create and update the progress bar. + + Parameters + ---------- + chunks_downloaded : int + Number of chunks of data that have been downloaded + chunks_size : int + Size of each chunk of data. + total_size : int + Total size of the dataset/model + + References + ---------- + [1] https://gist.github.com/vladignatyev/06860ec2040cb497f0f3 + + """ + bar_len = 50 + size_downloaded = float(chunks_downloaded * chunk_size) + filled_len = int(math.floor((bar_len*size_downloaded)/total_size)) + percent_downloaded = round((size_downloaded * 100)/total_size, 1) + bar = '=' * filled_len + '-' * (bar_len-filled_len) + sys.stdout.write('[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", round(size_downloaded/(1024*1024), 1), round(float(total_size)/(1024*1024), 1))) + sys.stdout.flush() + + def _create_base_dir(): + r"""Create the gensim-data directory in home directory, if it has not been already created. + Raises + ------ + File Exists Error + Give this exception when a file gensim-data already exists in the home directory. + Exception + An exception is raised when read/write permissions are not available + """ if not os.path.isdir(base_dir): try: logger.info("Creating %s", base_dir) @@ -42,6 +78,18 @@ def _create_base_dir(): def _calculate_md5_checksum(tar_file): + r"""Calculate the checksum of the given tar.gz file. + Parameters + ---------- + tar_file : str + Path to the tar.gz file that contains the downloaded dataset/model. + + Returns + ------- + str + Return the calculated checksum of the tsr.gz file. + + """ hash_md5 = hashlib.md5() with open(tar_file, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): @@ -50,6 +98,25 @@ def _calculate_md5_checksum(tar_file): def info(name=None): + r"""Return the information related to model/dataset. + + If name is supplied, then information related to the given dataset/model will be returned. Otherwise detailed information of all model/datasets will be returned. + + Parameters + ---------- + name : {None, data name}, optional + + Returns + ------- + dict + Return detailed information about all models/datasets if name is not provided. Otherwise return detailed informtiona of the specific model/dataset + + Raises + ------ + Exception + If name that has been passed is incorrect, an exception is raised. + + """ url = "https://raw.githubusercontent.com/chaitaliSaini/gensim-data/master/list.json" response = urlopen(url) data = response.read().decode("utf-8") @@ -72,6 +139,20 @@ def info(name=None): def _get_checksum(name): + r"""Retrieve the checksum of the model/dataset. + This is compared to the checksum of the downloaded model/dataset in _download function. + + Parameters + ---------- + name: str + dataset/model name + + Returns + ------- + str + retrieved checksum of dataset/model + + """ data = info() corpora = data['corpora'] models = data['models'] @@ -82,36 +163,53 @@ def _get_checksum(name): def _download(name): + r"""Download and extract the dataset/model + + Parameters + ---------- + name: str + dataset/model name which has to be downloaded + + Raises + ------ + Exception + If checksum of dowloaded data does not match the retrieved checksum, then downloaded has not been donw properly. + """ url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) url_load_file = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/__init__.py".format(f=name) data_folder_dir = os.path.join(base_dir, name) compressed_folder_name = "{f}.tar.gz".format(f=name) tmp_dir = tempfile.mkdtemp() - tmp_data_folder_dir = os.path.join(tmp_dir, name) - os.makedirs(tmp_data_folder_dir) - if not os.path.exists(tmp_data_folder_dir): - raise Exception( - "Not able to create data folder in {a}. Make sure you have the correct" - " read/write permissions for {a}".format(a=os.path.dirname(tmp_dir))) - tmp_load_file = os.path.join(tmp_data_folder_dir, "__init__.py") + tmp_load_file = os.path.join(tmp_dir, "__init__.py") urllib.urlretrieve(url_load_file, tmp_load_file) logger.info("Downloading %s", name) tmp_data_file = os.path.join(tmp_dir, compressed_folder_name) - urllib.urlretrieve(url_data, tmp_data_file) + urllib.urlretrieve(url_data, tmp_data_file, reporthook=progress) if _calculate_md5_checksum(tmp_data_file) == _get_checksum(name): logger.info("%s downloaded", name) else: rmtree(tmp_dir) raise Exception("There was a problem in downloading the data. We recommend you to re-try.") - tar = tarfile.open(tmp_data_file) - tar.extractall(tmp_data_folder_dir) - tar.close() + with tarfile.open(tmp_data_file, 'r') as tar: + tar.extractall(tmp_dir) os.remove(tmp_data_file) - os.rename(tmp_data_folder_dir, data_folder_dir) - os.rmdir(tmp_dir) + os.rename(tmp_dir, data_folder_dir) def _get_filename(name): + r"""Retrive the filename(or foldername, in case of some datasets) of the dataset/model. + + Parameters + ---------- + name: str + dataset/model of which filename is retrieved. + + Returns + ------- + str: + filename(or foldername, in case of some datasets) of the dataset/model. + + """ data = info() corpora = data['corpora'] models = data['models'] @@ -122,6 +220,22 @@ def _get_filename(name): def load(name, return_path=False): + r"""For models, if return_path is False, then load model to memory. Otherwise, return the path to the model. + For datasets, return path to the dataset in both cases. + + Parameters + ---------- + name: str + name of the model/dataset + return_path:{False, True}, optional + + Returns + ------- + data: + load model to memory + data_dir: str + return path of dataset/model. + """ _create_base_dir() file_name = _get_filename(name) if file_name is None: From f38670da1a30502424f03aa5c2da1065f98f72ca Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Wed, 25 Oct 2017 15:39:16 +0530 Subject: [PATCH 19/24] fixed pep8 --- gensim/downloader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 2b04801148..0b632f74fa 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -45,10 +45,10 @@ def progress(chunks_downloaded, chunk_size, total_size): """ bar_len = 50 size_downloaded = float(chunks_downloaded * chunk_size) - filled_len = int(math.floor((bar_len*size_downloaded)/total_size)) - percent_downloaded = round((size_downloaded * 100)/total_size, 1) - bar = '=' * filled_len + '-' * (bar_len-filled_len) - sys.stdout.write('[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", round(size_downloaded/(1024*1024), 1), round(float(total_size)/(1024*1024), 1))) + filled_len = int(math.floor((bar_len * size_downloaded) / total_size)) + percent_downloaded = round((size_downloaded * 100) / total_size, 1) + bar = '=' * filled_len + '-' * (bar_len - filled_len) + sys.stdout.write('[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", round(size_downloaded / (1024 * 1024), 1), round(float(total_size) / (1024 * 1024), 1))) sys.stdout.flush() From 4cadfa2c0aefe181754e2065a476baaf881420f4 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 31 Oct 2017 12:59:19 +0530 Subject: [PATCH 20/24] added tests --- gensim/downloader.py | 67 +++++++++++++++++++++---------------- gensim/test/test_api.py | 74 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 29 deletions(-) create mode 100644 gensim/test/test_api.py diff --git a/gensim/downloader.py b/gensim/downloader.py index 0b632f74fa..b28069d6fc 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -26,8 +26,8 @@ logger = logging.getLogger('gensim.api') -def progress(chunks_downloaded, chunk_size, total_size): - r"""Create and update the progress bar. +def _progress(chunks_downloaded, chunk_size, total_size): + """Create and update the _progress bar. Parameters ---------- @@ -46,20 +46,24 @@ def progress(chunks_downloaded, chunk_size, total_size): bar_len = 50 size_downloaded = float(chunks_downloaded * chunk_size) filled_len = int(math.floor((bar_len * size_downloaded) / total_size)) - percent_downloaded = round((size_downloaded * 100) / total_size, 1) + percent_downloaded = round(((size_downloaded * 100) / total_size), 1) bar = '=' * filled_len + '-' * (bar_len - filled_len) - sys.stdout.write('[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", round(size_downloaded / (1024 * 1024), 1), round(float(total_size) / (1024 * 1024), 1))) + sys.stdout.write( + '[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", + round(size_downloaded / (1024 * 1024), 1), + round(float(total_size) / (1024 * 1024), 1))) sys.stdout.flush() def _create_base_dir(): - r"""Create the gensim-data directory in home directory, if it has not been already created. + """Create the gensim-data directory in home directory, if it has not been already created. Raises ------ File Exists Error Give this exception when a file gensim-data already exists in the home directory. Exception An exception is raised when read/write permissions are not available + """ if not os.path.isdir(base_dir): try: @@ -78,7 +82,7 @@ def _create_base_dir(): def _calculate_md5_checksum(tar_file): - r"""Calculate the checksum of the given tar.gz file. + """Calculate the checksum of the given tar.gz file. Parameters ---------- tar_file : str @@ -98,18 +102,20 @@ def _calculate_md5_checksum(tar_file): def info(name=None): - r"""Return the information related to model/dataset. + """Return the information related to model/dataset. - If name is supplied, then information related to the given dataset/model will be returned. Otherwise detailed information of all model/datasets will be returned. + If name is supplied, then information related to the given dataset/model will + be returned. Otherwise detailed information of all model/datasets will be returned. Parameters ---------- - name : {None, data name}, optional - + name : str or None, optional + str stores the data name. Returns ------- dict - Return detailed information about all models/datasets if name is not provided. Otherwise return detailed informtiona of the specific model/dataset + Return detailed information about all models/datasets if name is not + provided. Otherwise return detailed informtiona of the specific model/dataset Raises ------ @@ -139,18 +145,18 @@ def info(name=None): def _get_checksum(name): - r"""Retrieve the checksum of the model/dataset. + """Retrieve the checksum of the model/dataset. This is compared to the checksum of the downloaded model/dataset in _download function. Parameters ---------- name: str - dataset/model name + Dataset/model name Returns ------- str - retrieved checksum of dataset/model + Retrieved checksum of dataset/model """ data = info() @@ -163,17 +169,19 @@ def _get_checksum(name): def _download(name): - r"""Download and extract the dataset/model + """Download and extract the dataset/model Parameters ---------- name: str - dataset/model name which has to be downloaded + Dataset/model name which has to be downloaded Raises ------ Exception - If checksum of dowloaded data does not match the retrieved checksum, then downloaded has not been donw properly. + If checksum of dowloaded data does not match the retrieved checksum, + then downloaded has not been donw properly. + """ url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) url_load_file = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/__init__.py".format(f=name) @@ -184,7 +192,7 @@ def _download(name): urllib.urlretrieve(url_load_file, tmp_load_file) logger.info("Downloading %s", name) tmp_data_file = os.path.join(tmp_dir, compressed_folder_name) - urllib.urlretrieve(url_data, tmp_data_file, reporthook=progress) + urllib.urlretrieve(url_data, tmp_data_file, reporthook=_progress) if _calculate_md5_checksum(tmp_data_file) == _get_checksum(name): logger.info("%s downloaded", name) else: @@ -197,17 +205,17 @@ def _download(name): def _get_filename(name): - r"""Retrive the filename(or foldername, in case of some datasets) of the dataset/model. + """Retrive the filename(or foldername, in case of some datasets) of the dataset/model. Parameters ---------- name: str - dataset/model of which filename is retrieved. + Dataset/model of which filename is retrieved. Returns ------- str: - filename(or foldername, in case of some datasets) of the dataset/model. + Filename(or foldername, in case of some datasets) of the dataset/model. """ data = info() @@ -220,21 +228,22 @@ def _get_filename(name): def load(name, return_path=False): - r"""For models, if return_path is False, then load model to memory. Otherwise, return the path to the model. + """For models, if return_path is False, then load model to memory. Otherwise, return the path to the model. For datasets, return path to the dataset in both cases. Parameters ---------- name: str - name of the model/dataset - return_path:{False, True}, optional + Name of the model/dataset + return_path: False or True, optional Returns ------- data: - load model to memory + Load model to memory data_dir: str - return path of dataset/model. + Return path of dataset/model. + """ _create_base_dir() file_name = _get_filename(name) @@ -258,10 +267,10 @@ def load(name, return_path=False): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO) - parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data_name | -i data_name | -c]") group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim.downloader -d corpus/model name") - group.add_argument("-i", "--info", metavar="data__name", nargs=1, help="To get information about a corpus/model : python -m gensim.downloader -i model/corpus name") + group.add_argument("-d", "--download", metavar="data_name", nargs=1, help="To download a corpus/model : python -m gensim.downloader -d corpus/model name") + group.add_argument("-i", "--info", metavar="data_name", nargs=1, help="To get information about a corpus/model : python -m gensim.downloader -i model/corpus name") group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim.downloader -c", action="store_true") args = parser.parse_args() if args.download is not None: diff --git a/gensim/test/test_api.py b/gensim/test/test_api.py new file mode 100644 index 0000000000..a585dbf449 --- /dev/null +++ b/gensim/test/test_api.py @@ -0,0 +1,74 @@ +import logging +import unittest +import os +import gensim.downloader as api +import shutil +import numpy as np + + +class TestApi(unittest.TestCase): + + def test_base_dir_creation(self): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + api._create_base_dir() + self.assertTrue(os.path.isdir(base_dir)) + os.rmdir(base_dir) + + def test_load_dataset(self): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + data_dir = os.path.join(base_dir, "matrix-synopsis") + dataset_path = os.path.join(data_dir, "matrix-synopsis.txt") + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + self.assertEqual(api.load("matrix-synopsis"), dataset_path) + shutil.rmtree(base_dir) + + def test_return_path_dataset(self): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + data_dir = os.path.join(base_dir, "matrix-synopsis") + dataset_path = os.path.join(data_dir, "matrix-synopsis.txt") + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + self.assertEqual(dataset_path, api.load("matrix-synopsis", return_path=True)) + shutil.rmtree(base_dir) + + def test_load_model(self): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + vector_dead = np.array( + [0.17403787, -0.10167074, -0.00950371, -0.10367849, -0.14034484, + -0.08751217, 0.10030612, 0.07677923, -0.32563496, 0.01929072, + 0.20521086, -0.1617067, 0.00475458, 0.21956187, -0.08783089, + -0.05937332, 0.26528183, -0.06771874, -0.12369668, 0.12020949, + 0.28731, 0.36735833, 0.28051138, -0.10407482, 0.2496888, + -0.19372769, -0.28719661, 0.11989869, -0.00393865, -0.2431484, + 0.02725661, -0.20421691, 0.0328669, -0.26947051, -0.08068217, + -0.10245913, 0.1170633, 0.16583319, 0.1183883, -0.11217165, + 0.1261425, -0.0319365, -0.15787181, 0.03753783, 0.14748634, + 0.00414471, -0.02296237, 0.18336892, -0.23840059, 0.17924534]) + model = api.load("word2vec-matrix-synopsis") + vector_dead_calc = model["dead"] + self.assertTrue(np.allclose(vector_dead, vector_dead_calc)) + shutil.rmtree(base_dir) + + def test_return_path_model(self): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + data_dir = os.path.join(base_dir, "word2vec-matrix-synopsis") + model_path = os.path.join(data_dir, "word2vec-matrix-synopsis.txt") + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + self.assertEqual(model_path, api.load("word2vec-matrix-synopsis", return_path=True)) + shutil.rmtree(base_dir) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() From e844e0173e9c12e9f0191c5f2b32b28714d231cf Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 7 Nov 2017 15:42:12 +0530 Subject: [PATCH 21/24] added download for >2gb data --- gensim/downloader.py | 100 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 21 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index b28069d6fc..4ecbc42f68 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -9,7 +9,7 @@ import errno import hashlib import math -from shutil import rmtree +import shutil import tempfile try: import urllib.request as urllib @@ -46,10 +46,10 @@ def _progress(chunks_downloaded, chunk_size, total_size): bar_len = 50 size_downloaded = float(chunks_downloaded * chunk_size) filled_len = int(math.floor((bar_len * size_downloaded) / total_size)) - percent_downloaded = round(((size_downloaded * 100) / total_size), 1) + percent_downloaded = round(((size_downloaded * 100) / total_size), 1) bar = '=' * filled_len + '-' * (bar_len - filled_len) sys.stdout.write( - '[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", + '\r[%s] %s%s %s/%sMB downloaded' % (bar, percent_downloaded, "%", round(size_downloaded / (1024 * 1024), 1), round(float(total_size) / (1024 * 1024), 1))) sys.stdout.flush() @@ -144,7 +144,7 @@ def info(name=None): return data -def _get_checksum(name): +def _get_checksum(name, part=None): """Retrieve the checksum of the model/dataset. This is compared to the checksum of the downloaded model/dataset in _download function. @@ -158,14 +158,43 @@ def _get_checksum(name): str Retrieved checksum of dataset/model + """ + data = info() + corpora = data['corpora'] + models = data['models'] + if part is None: + if name in corpora: + return data['corpora'][name]["checksum"] + elif name in models: + return data['models'][name]["checksum"] + else: + if name in corpora: + return data['corpora'][name]["checksum-" + str(part)] + elif name in models: + return data['models'][name]["checksum-" + str(part)] + + +def _get_parts(name): + """Retrieve the number of parts in which dataset/model has been split. + + Parameters + ---------- + name: str + Dataset/model name + + Returns + ------- + str + Number of parts in which dataset/model has been split. + """ data = info() corpora = data['corpora'] models = data['models'] if name in corpora: - return data['corpora'][name]["checksum"] + return data['corpora'][name]["parts"] elif name in models: - return data['models'][name]["checksum"] + return data['models'][name]["parts"] def _download(name): @@ -183,25 +212,54 @@ def _download(name): then downloaded has not been donw properly. """ - url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) url_load_file = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/__init__.py".format(f=name) data_folder_dir = os.path.join(base_dir, name) - compressed_folder_name = "{f}.tar.gz".format(f=name) tmp_dir = tempfile.mkdtemp() - tmp_load_file = os.path.join(tmp_dir, "__init__.py") - urllib.urlretrieve(url_load_file, tmp_load_file) - logger.info("Downloading %s", name) - tmp_data_file = os.path.join(tmp_dir, compressed_folder_name) - urllib.urlretrieve(url_data, tmp_data_file, reporthook=_progress) - if _calculate_md5_checksum(tmp_data_file) == _get_checksum(name): - logger.info("%s downloaded", name) + tmp_load_file_path = os.path.join(tmp_dir, "__init__.py") + urllib.urlretrieve(url_load_file, tmp_load_file_path) + no_parts = int(_get_parts(name)) + if no_parts > 1: + concatenated_folder_name = "{f}.tar.gz".format(f=name) + concatenated_folder_dir = os.path.join(tmp_dir, concatenated_folder_name) + for part in range(1, no_parts + 1): + url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) + compressed_folder_name = "{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) + tmp_data_file_dir = os.path.join(tmp_dir, compressed_folder_name) + logger.info("Downloading Part %s/%s", part, no_parts) + urllib.urlretrieve(url_data, tmp_data_file_dir, reporthook=_progress) + if _calculate_md5_checksum(tmp_data_file_dir) == _get_checksum(name, part): + sys.stdout.write("\n") + sys.stdout.flush() + logger.info("Part %s/%s downloaded", part, no_parts) + else: + shutil.rmtree(tmp_dir) + raise Exception("There was a problem in downloading the data. We recommend you to re-try.") + with open(concatenated_folder_dir, 'wb') as wfp: + for part in range(1, no_parts + 1): + part_path = os.path.join(tmp_dir, "{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part))) + with open(part_path, "rb") as rfp: + shutil.copyfileobj(rfp, wfp) + os.remove(part_path) + with tarfile.open(concatenated_folder_dir, 'r') as tar: + tar.extractall(tmp_dir) + os.remove(concatenated_folder_dir) + os.rename(tmp_dir, data_folder_dir) else: - rmtree(tmp_dir) - raise Exception("There was a problem in downloading the data. We recommend you to re-try.") - with tarfile.open(tmp_data_file, 'r') as tar: - tar.extractall(tmp_dir) - os.remove(tmp_data_file) - os.rename(tmp_dir, data_folder_dir) + url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) + compressed_folder_name = "{f}.tar.gz".format(f=name) + tmp_data_file_dir = os.path.join(tmp_dir, compressed_folder_name) + urllib.urlretrieve(url_data, tmp_data_file_dir, reporthook=_progress) + if _calculate_md5_checksum(tmp_data_file_dir) == _get_checksum(name): + sys.stdout.write("\n") + sys.stdout.flush() + logger.info("%s downloaded", name) + else: + shutil.rmtree(tmp_dir) + raise Exception("There was a problem in downloading the data. We recommend you to re-try.") + with tarfile.open(tmp_data_file_dir, 'r') as tar: + tar.extractall(tmp_dir) + os.remove(tmp_data_file_dir) + os.rename(tmp_dir, data_folder_dir) def _get_filename(name): From 580a93ae65352c900651d97c81bff3ee9e3a1061 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Wed, 8 Nov 2017 17:45:08 +0530 Subject: [PATCH 22/24] add test for multipart --- gensim/downloader.py | 94 ++++++++++++++++++++--------------------- gensim/test/test_api.py | 23 +++++++--- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 4ecbc42f68..7f0452ef42 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -11,32 +11,31 @@ import math import shutil import tempfile -try: - import urllib.request as urllib -except ImportError: - import urllib +from functools import partial -try: - from urllib.request import urlopen -except ImportError: +if sys.version_info[0] == 2: + import urllib from urllib2 import urlopen +else: + import urllib.request as urllib + from urllib.request import urlopen user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') logger = logging.getLogger('gensim.api') -def _progress(chunks_downloaded, chunk_size, total_size): +def _progress(chunks_downloaded, chunk_size, total_size, part=1, no_parts=1): """Create and update the _progress bar. Parameters ---------- chunks_downloaded : int - Number of chunks of data that have been downloaded + Number of chunks of data that have been downloaded. chunks_size : int Size of each chunk of data. total_size : int - Total size of the dataset/model + Total size of the dataset/model. References ---------- @@ -48,21 +47,28 @@ def _progress(chunks_downloaded, chunk_size, total_size): filled_len = int(math.floor((bar_len * size_downloaded) / total_size)) percent_downloaded = round(((size_downloaded * 100) / total_size), 1) bar = '=' * filled_len + '-' * (bar_len - filled_len) - sys.stdout.write( - '\r[%s] %s%s %s/%sMB downloaded' % (bar, percent_downloaded, "%", - round(size_downloaded / (1024 * 1024), 1), - round(float(total_size) / (1024 * 1024), 1))) - sys.stdout.flush() + if no_parts == 1: + sys.stdout.write( + '\r[%s] %s%s %s/%sMB downloaded' % (bar, percent_downloaded, "%", + round(size_downloaded / (1024 * 1024), 1), + round(float(total_size) / (1024 * 1024), 1))) + sys.stdout.flush() + else: + sys.stdout.write( + '\r Part %s/%s [%s] %s%s %s/%sMB downloaded' % (part + 1, no_parts, bar, percent_downloaded, "%", + round(size_downloaded / (1024 * 1024), 1), + round(float(total_size) / (1024 * 1024), 1))) + sys.stdout.flush() def _create_base_dir(): """Create the gensim-data directory in home directory, if it has not been already created. + Raises ------ - File Exists Error - Give this exception when a file gensim-data already exists in the home directory. Exception - An exception is raised when read/write permissions are not available + An exception is raised when read/write permissions are not available or a file named + gensim-data already exists in the home directory. """ if not os.path.isdir(base_dir): @@ -83,6 +89,7 @@ def _create_base_dir(): def _calculate_md5_checksum(tar_file): """Calculate the checksum of the given tar.gz file. + Parameters ---------- tar_file : str @@ -115,7 +122,7 @@ def info(name=None): ------- dict Return detailed information about all models/datasets if name is not - provided. Otherwise return detailed informtiona of the specific model/dataset + provided. Otherwise return detailed informtiona of the specific model/dataset. Raises ------ @@ -123,7 +130,7 @@ def info(name=None): If name that has been passed is incorrect, an exception is raised. """ - url = "https://raw.githubusercontent.com/chaitaliSaini/gensim-data/master/list.json" + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) @@ -151,12 +158,12 @@ def _get_checksum(name, part=None): Parameters ---------- name: str - Dataset/model name + Dataset/model name. Returns ------- str - Retrieved checksum of dataset/model + Retrieved checksum of dataset/model. """ data = info() @@ -169,9 +176,9 @@ def _get_checksum(name, part=None): return data['models'][name]["checksum"] else: if name in corpora: - return data['corpora'][name]["checksum-" + str(part)] + return data['corpora'][name]["checksum-{}".format(part)] elif name in models: - return data['models'][name]["checksum-" + str(part)] + return data['models'][name]["checksum-{}".format(part)] def _get_parts(name): @@ -180,11 +187,11 @@ def _get_parts(name): Parameters ---------- name: str - Dataset/model name + Dataset/model name. Returns ------- - str + int Number of parts in which dataset/model has been split. """ @@ -198,45 +205,38 @@ def _get_parts(name): def _download(name): - """Download and extract the dataset/model + """Download and extract the dataset/model. Parameters ---------- name: str - Dataset/model name which has to be downloaded - - Raises - ------ - Exception - If checksum of dowloaded data does not match the retrieved checksum, - then downloaded has not been donw properly. + Dataset/model name which has to be downloaded. """ - url_load_file = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/__init__.py".format(f=name) + url_load_file = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/__init__.py".format(f=name) data_folder_dir = os.path.join(base_dir, name) tmp_dir = tempfile.mkdtemp() tmp_load_file_path = os.path.join(tmp_dir, "__init__.py") urllib.urlretrieve(url_load_file, tmp_load_file_path) - no_parts = int(_get_parts(name)) + no_parts = _get_parts(name) if no_parts > 1: concatenated_folder_name = "{f}.tar.gz".format(f=name) concatenated_folder_dir = os.path.join(tmp_dir, concatenated_folder_name) - for part in range(1, no_parts + 1): - url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) - compressed_folder_name = "{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) + for part in range(0, no_parts): + url_data = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.tar.gz_0{p}".format(f=name, p=part) + compressed_folder_name = "{f}.tar.gz_0{p}".format(f=name, p=part) tmp_data_file_dir = os.path.join(tmp_dir, compressed_folder_name) - logger.info("Downloading Part %s/%s", part, no_parts) - urllib.urlretrieve(url_data, tmp_data_file_dir, reporthook=_progress) + urllib.urlretrieve(url_data, tmp_data_file_dir, reporthook=partial(_progress, part=part, no_parts=no_parts)) if _calculate_md5_checksum(tmp_data_file_dir) == _get_checksum(name, part): sys.stdout.write("\n") sys.stdout.flush() - logger.info("Part %s/%s downloaded", part, no_parts) + logger.info("Part %s/%s downloaded", part + 1, no_parts) else: shutil.rmtree(tmp_dir) raise Exception("There was a problem in downloading the data. We recommend you to re-try.") with open(concatenated_folder_dir, 'wb') as wfp: - for part in range(1, no_parts + 1): - part_path = os.path.join(tmp_dir, "{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part))) + for part in range(0, no_parts): + part_path = os.path.join(tmp_dir, "{f}.tar.gz_0{p}".format(f=name, p=part)) with open(part_path, "rb") as rfp: shutil.copyfileobj(rfp, wfp) os.remove(part_path) @@ -245,7 +245,7 @@ def _download(name): os.remove(concatenated_folder_dir) os.rename(tmp_dir, data_folder_dir) else: - url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) + url_data = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) compressed_folder_name = "{f}.tar.gz".format(f=name) tmp_data_file_dir = os.path.join(tmp_dir, compressed_folder_name) urllib.urlretrieve(url_data, tmp_data_file_dir, reporthook=_progress) @@ -292,13 +292,13 @@ def load(name, return_path=False): Parameters ---------- name: str - Name of the model/dataset + Name of the model/dataset. return_path: False or True, optional Returns ------- data: - Load model to memory + Load model to memory. data_dir: str Return path of dataset/model. diff --git a/gensim/test/test_api.py b/gensim/test/test_api.py index a585dbf449..2529ae27e7 100644 --- a/gensim/test/test_api.py +++ b/gensim/test/test_api.py @@ -20,21 +20,21 @@ def test_base_dir_creation(self): def test_load_dataset(self): user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') - data_dir = os.path.join(base_dir, "matrix-synopsis") + data_dir = os.path.join(base_dir, "__testing_matrix-synopsis") dataset_path = os.path.join(data_dir, "matrix-synopsis.txt") if os.path.isdir(base_dir): shutil.rmtree(base_dir) - self.assertEqual(api.load("matrix-synopsis"), dataset_path) + self.assertEqual(api.load("__testing_matrix-synopsis"), dataset_path) shutil.rmtree(base_dir) def test_return_path_dataset(self): user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') - data_dir = os.path.join(base_dir, "matrix-synopsis") + data_dir = os.path.join(base_dir, "__testing_matrix-synopsis") dataset_path = os.path.join(data_dir, "matrix-synopsis.txt") if os.path.isdir(base_dir): shutil.rmtree(base_dir) - self.assertEqual(dataset_path, api.load("matrix-synopsis", return_path=True)) + self.assertEqual(dataset_path, api.load("__testing_matrix-synopsis", return_path=True)) shutil.rmtree(base_dir) def test_load_model(self): @@ -53,7 +53,7 @@ def test_load_model(self): -0.10245913, 0.1170633, 0.16583319, 0.1183883, -0.11217165, 0.1261425, -0.0319365, -0.15787181, 0.03753783, 0.14748634, 0.00414471, -0.02296237, 0.18336892, -0.23840059, 0.17924534]) - model = api.load("word2vec-matrix-synopsis") + model = api.load("__testing_word2vec-matrix-synopsis") vector_dead_calc = model["dead"] self.assertTrue(np.allclose(vector_dead, vector_dead_calc)) shutil.rmtree(base_dir) @@ -61,13 +61,22 @@ def test_load_model(self): def test_return_path_model(self): user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') - data_dir = os.path.join(base_dir, "word2vec-matrix-synopsis") + data_dir = os.path.join(base_dir, "__testing_word2vec-matrix-synopsis") model_path = os.path.join(data_dir, "word2vec-matrix-synopsis.txt") if os.path.isdir(base_dir): shutil.rmtree(base_dir) - self.assertEqual(model_path, api.load("word2vec-matrix-synopsis", return_path=True)) + self.assertEqual(model_path, api.load("__testing_word2vec-matrix-synopsis", return_path=True)) shutil.rmtree(base_dir) + def test_multipart_download(self): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + data_dir = os.path.join(base_dir, '__testing_multipart-matrix-synopsis') + dataset_path = os.path.join(data_dir, 'matrix-synopsis.txt') + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + self.assertEqual(dataset_path, api.load("__testing_multipart-matrix-synopsis", return_path=True)) + shutil.rmtree(base_dir) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From e899f88ecdb3b379e842f983b9b3136169d73231 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Wed, 8 Nov 2017 18:32:57 +0530 Subject: [PATCH 23/24] fixed pep8 --- gensim/test/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/test/test_api.py b/gensim/test/test_api.py index 2529ae27e7..5d222e8514 100644 --- a/gensim/test/test_api.py +++ b/gensim/test/test_api.py @@ -78,6 +78,7 @@ def test_multipart_download(self): self.assertEqual(dataset_path, api.load("__testing_multipart-matrix-synopsis", return_path=True)) shutil.rmtree(base_dir) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From 8eeec54c1e9b2041c5c11232f34795788e7358ab Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Thu, 9 Nov 2017 11:23:34 +0530 Subject: [PATCH 24/24] fixed bug --- gensim/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 7f0452ef42..b81a7a29d9 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -141,7 +141,7 @@ def info(name=None): logger.info("%s \n", json.dumps(data['corpora'][name], indent=4)) return data['corpora'][name] elif name in models: - logger.info("%s \n", json.dumps(data['corpora'][name], indent=4)) + logger.info("%s \n", json.dumps(data['models'][name], indent=4)) return data['models'][name] else: raise Exception(