Python - sklearn classifier and Flask issues
I have been trying to self-host, with Apache, an sklearn classifier that I put together, and I ended up using joblib to serialize the saved model and load it in a Flask app. Now, this app worked perfectly while running on Flask's built-in development server, but when I set it up on a Debian 9 Apache server, I get a 500 error. Delving into Apache's error.log, I get:

AttributeError: module '__main__' has no attribute 'tokenize'

Now, this is funny to me, because while I did write my own tokenizer, the web app gave me no problems when I was running it locally. Furthermore, the saved model I used was trained on the webserver, so slightly different library versions should not be the problem.
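From what I understand (and I may be wrong about this), pickle and joblib serialize a module-level function by reference to its defining module rather than by value, which would explain why the error mentions '__main__'. A toy example of what I think is going on (not my actual code):

import pickle

def tokenize(text):
    return text.split()

# The payload only references "__main__.tokenize" by name; the function
# body itself is not stored, so unpickling fails in any process whose
# __main__ module does not define tokenize.
print(pickle.dumps(tokenize))

That would match the traceback, since under mod_wsgi the application is not run as __main__, but it still does not tell me what the proper fix is.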
My code for the web app is:
import re
import sys

from flask import Flask, request, render_template
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.externals import joblib

app = Flask(__name__)


def tokenize(text):
    # strip punctuation, then tokenize and lemmatize
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/analyze', methods=['POST', 'GET'])
def analyze():
    if request.method == 'POST':
        result = request.form
        input_text = result['input_text']

        # load the serialized pipeline and run the prediction
        clf = joblib.load("model.pkl.z")
        parameters = clf.named_steps['clf'].get_params()
        predicted = clf.predict([input_text])
        # print(predicted)
        certainty = clf.decision_function([input_text])

        # Bonkers?
        if predicted[0]:
            verdict = "Not nuts!"
        else:
            verdict = "Bonkers!"

        return render_template('result.html',
                               prediction=[input_text, verdict,
                                           float(certainty), parameters])


if __name__ == '__main__':
    # app.debug = True
    app.run()
with the .wsgi file being:
import sys
sys.path.append('/var/www/mysite')

from conspiracydetector import app as application
Furthermore, the code I used to train the model is:
import logging
import pprint  # pretty stuff
import re
import sys  # command line arguments
from time import time  # show progress

import numpy as np
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.externals import joblib  # in order to save the model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


# tokenizer that lemmatizes and strips punctuation
def tokenize(text):
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas


if __name__ == "__main__":
    # NOTE: we put the following in an 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # display progress logs on stdout
    print("Initializing...")

    # command line arguments
    save = sys.argv[1]
    training_directory = sys.argv[2]

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    dataset = load_files(training_directory, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset into a training and a test set
    print("Splitting the dataset into training and test set...")
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent, and also removes stop words
    print("Loading list of stop words...")
    with open('stopwords.txt', 'r') as f:
        words = [line.strip() for line in f]

    print("Stop words list loaded...")
    print("Setting up pipeline...")
    pipeline = Pipeline(
        [
            # ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('vect', TfidfVectorizer(tokenizer=tokenize, stop_words=words,
                                     min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('clf', LinearSVC(C=5000)),
        ])

    print("Pipeline:", [name for name, _ in pipeline.steps])

    # build a grid search to find out whether unigrams or bigrams are
    # more useful; fit the pipeline on the training set using grid search
    # for the parameters
    print("Initializing grid search...")

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2)],
        # 'vect__min_df': (0.0005, 0.001),
        # 'vect__max_df': (0.25, 0.5),
        # 'clf__C': (10, 15, 20),
    }
    print("Parameters:")
    pprint.pprint(parameters)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=True)
    print("Training and performing grid search...\n")
    t0 = time()
    grid_search.fit(docs_train, y_train)
    print("\nDone in %0.3fs!\n" % (time() - t0))

    # print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
              % (grid_search.cv_results_['params'][i],
                 grid_search.cv_results_['mean_test_score'][i],
                 grid_search.cv_results_['std_test_score'][i]))

    # predict the outcome on the testing set and store it in a variable
    # named y_predicted
    print("\nRunning against testing set...\n")
    y_predicted = grid_search.predict(docs_test)

    # save the best model
    print("\nSaving model to", save, "...")
    joblib.dump(grid_search.best_estimator_, save)
    print("Model saved! \nPrepare for awesome stats!")
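For reference, I invoke the training script with the save path and the training data directory as command line arguments, along the lines of (the script and directory names here are just examples):

python train_model.py model.pkl.z ./training_data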
I must confess that I am pretty stumped, and after tinkering around, searching, and making sure my server is configured correctly, I felt that perhaps someone here might be able to help. Any help is appreciated, and if there is more information I need to provide, please let me know and I will be happy to.
Also, I am running:
- Python 3.5.3 with NLTK and scikit-learn.
I solved this problem, although imperfectly, by removing my custom tokenizer and falling back on one of sklearn's built-in ones.
However, I am still in the dark on how to integrate my own tokenizer.
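My best guess so far (untried, and the module name below is just an example) is that the tokenizer has to live in its own importable module, so that both the training script and the Flask app import it instead of defining it inline:

# tokenizer_module.py -- hypothetical shared module, the name is only an example
import re

from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


def tokenize(text):
    # strip punctuation, then tokenize and lemmatize each token
    text = re.sub(r'\W+', ' ', text)
    return [WordNetLemmatizer().lemmatize(tok) for tok in word_tokenize(text)]

The idea is that with "from tokenizer_module import tokenize" in both places, the pickled pipeline would record the function as tokenizer_module.tokenize rather than __main__.tokenize, but I have not verified this on the Apache setup yet.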