python - Sklearn classifier and Flask issues


I have been trying to self-host, with Apache, an sklearn classifier I put together, and I ended up using joblib to serialize the saved model, then loading it in a Flask app. Now, this app worked perfectly when running with Flask's built-in development server, but when I set it up on a Debian 9 Apache server, I get a 500 error. Delving into Apache's error.log, I get:

AttributeError: module '__main__' has no attribute 'tokenize'

Now, this is funny to me, because while I did write my own tokenizer, the web app gave me no problems when running it locally. Furthermore, the saved model I used was trained on the webserver, so slightly different library versions should not be a problem.

My code for the web app is:

import re
import sys

from flask import Flask, request, render_template
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.externals import joblib

app = Flask(__name__)


def tokenize(text):
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/analyze', methods=['POST', 'GET'])
def analyze():
    if request.method == 'POST':
        result = request.form
        input_text = result['input_text']

        clf = joblib.load("model.pkl.z")
        parameters = clf.named_steps['clf'].get_params()
        predicted = clf.predict([input_text])
        # print(predicted)
        certainty = clf.decision_function([input_text])

        # Bonkers?
        if predicted[0]:
            verdict = "Not nuts!"
        else:
            verdict = "Bonkers!"

        return render_template('result.html', prediction=[input_text, verdict, float(certainty), parameters])


if __name__ == '__main__':
    # app.debug = True
    app.run()

With my .wsgi file being:

import sys

sys.path.append('/var/www/mysite')

from conspiracydetector import app as application

Furthermore, I trained the model with this code:

import logging
import pprint  # pretty stuff
import re
import sys  # command line arguments
from time import time  # show progress

import numpy as np
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.externals import joblib  # in order to save
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


# tokenizer for stemming that also strips punctuation
def tokenize(text):
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    lemas = []
    for item in tokens:
        lemas.append(WordNetLemmatizer().lemmatize(item))
    return lemas


if __name__ == "__main__":
    # NOTE: we put the following in an 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # Display progress logs on stdout
    print("Initializing...")
    # Command line arguments
    save = sys.argv[1]
    training_directory = sys.argv[2]

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    dataset = load_files(training_directory, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # Split the dataset in training and test set:
    print("Splitting the dataset in training and test set...")
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    # Also remove stop words
    print("Loading list of stop words...")
    with open('stopwords.txt', 'r') as f:
        words = [line.strip() for line in f]

    print("Stop words list loaded...")
    print("Setting up pipeline...")
    pipeline = Pipeline(
        [
            # ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('vect',
             TfidfVectorizer(tokenizer=tokenize, stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('clf', LinearSVC(C=5000)),
        ])

    print("Pipeline:", [name for name, _ in pipeline.steps])

    # Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    print("Initializing grid search...")

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2)],
        # 'vect__min_df': (0.0005, 0.001),
        # 'vect__max_df': (0.25, 0.5),
        # 'clf__C': (10, 15, 20),
    }
    print("Parameters:")
    pprint.pprint(parameters)
    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=-1,
        verbose=True)

    print("Training and performing grid search...\n")
    t0 = time()
    grid_search.fit(docs_train, y_train)
    print("\nDone in %0.3fs!\n" % (time() - t0))

    # Print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search.
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
              % (grid_search.cv_results_['params'][i],
                 grid_search.cv_results_['mean_test_score'][i],
                 grid_search.cv_results_['std_test_score'][i]))

    # Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    print("\nRunning against testing set...\n")
    y_predicted = grid_search.predict(docs_test)

    # Save model
    print("\nSaving model to", save, "...")
    joblib.dump(grid_search.best_estimator_, save)
    print("Model saved! \nPrepare for awesome stats!")

I must confess I am pretty stumped, and after tinkering around, searching, and making sure the server is configured correctly, I felt that perhaps somebody here might be able to help. Any help is appreciated, and if there is any more information I need to provide, please let me know and I will be happy to.

Also, I am running:

  • Python 3.5.3 with NLTK and sklearn.

I solved the problem, although imperfectly, by removing my custom tokenizer and falling back on one of sklearn's.
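For reference, a minimal sketch of what that fallback looks like, assuming the only change is dropping the tokenizer argument so TfidfVectorizer uses its built-in token pattern (and no lemmatization happens); this mirrors the commented-out vectorizer line in my training script:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# stop-word list loaded the same way as in the training script
with open('stopwords.txt', 'r') as f:
    words = [line.strip() for line in f]

# Same pipeline as before, minus tokenizer=tokenize, so TfidfVectorizer falls
# back to its default tokenization and nothing defined in __main__ gets
# pickled along with the model.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5,
                             ngram_range=(1, 1))),
    ('clf', LinearSVC(C=5000)),
])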

However, I am still in the dark on how to integrate my own tokenizer.
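If it helps anyone, my understanding is that pickle (which joblib uses) stores a function by its module path rather than by value, so a model trained in a script where tokenize lives in __main__ gets unpickled under mod_wsgi in a process whose __main__ has no such attribute, which would explain the error. A possible way to keep the custom tokenizer (a sketch only, not something I have verified on this setup) is to move it into its own importable module and re-train/re-dump the model; tokenizer_util below is a hypothetical module name of my own choosing:

# tokenizer_util.py  (hypothetical shared module)
import re

from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


def tokenize(text):
    # strip punctuation, then lemmatize each remaining token
    text = re.sub(r'\W+', ' ', text)
    return [WordNetLemmatizer().lemmatize(item) for item in word_tokenize(text)]

Both the training script and the Flask app would then do "from tokenizer_util import tokenize" instead of defining the function locally, so the pickled pipeline records a reference to tokenizer_util.tokenize rather than __main__.tokenize.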

