In [224]:
from data_load import *
from features import *
from classifier import *
from random_guesser import *
In [225]:
import os
import pandas as pd
import re
import random

The lists below are the basis for my features. For word_list and symbol_list, the program creates a feature from the number of occurrences of each word or symbol divided by the number of characters in the code snippet. For the endings list, the feature is 10 if the code snippet ends with one of those strings and 0 otherwise.
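
These features are computed with string_ratio, character_ratio, and string_end, which are imported from features.py above. That module isn't shown here, but a minimal sketch matching the description (and the call signatures used below) would look something like this:

# Sketch of the helpers imported from features.py (assumed behavior, inferred
# from the description above; the real module may handle edge cases differently).
def string_ratio(string, code):
    # Occurrences of the word divided by the number of characters in the snippet.
    return code.count(string) / len(code) if code else 0

def character_ratio(code, char):
    # Occurrences of the symbol divided by the number of characters in the snippet.
    return code.count(char) / len(code) if code else 0

def string_end(ending, code):
    # 10 if the snippet ends with the given string (ignoring trailing whitespace,
    # which is an assumption on my part), 0 otherwise.
    return 10 if code.rstrip().endswith(ending) else 0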

In [226]:
word_list = ['let', 'end', 'defn', 'function', 'fun', 'return', 'def', 'check', 'make', '->', '.format',
             'define', '::', 'done', 'type', 'rescue', 'print', 'elif', 'clone', 'display', '$format', 'echo', 'str',
             'join', '&&', 'val', 'Nil', 'object', '<-', '--', 'lambda', 'var', '//', 'tmpl', 'public function',
             'stdlib', '=>', 'final', 'case', 'impl']
symbol_list = ['$', '^', ',', ';', '&', '|', '!', '*', '@', '#', '(', '{', ' ']
endings = ['end', ')', '}']

The following function creates the data frame of features based on the corpus of code snippets pulled from the Computer Language Benchmarks Game.

In [227]:
def data_frame_generator():
    # Build the training frame from the Benchmarks Game corpus: one row per
    # snippet, with the language label, the raw code, and one column per feature.
    codelist = code_sucker()
    typelist = type_getter()
    df = pd.DataFrame(typelist, index=range(len(typelist)))
    df.columns = ["Language"]
    df["Code"] = codelist
    df['Language'] = df.Language.apply(lambda x: x.lower())
    # Word features: occurrences of each word divided by snippet length.
    for string in word_list:
        df[string] = df.Code.apply(lambda code: string_ratio(string, code))
    # Symbol features: occurrences of each symbol divided by snippet length.
    for char in symbol_list:
        df[char] = df.Code.apply(lambda code: character_ratio(code, char))
    # Ending features: 10 if the snippet ends with the string, 0 otherwise.
    for ending in endings:
        df['_' + ending] = df.Code.apply(lambda code: string_end(ending, code))
    return df
In [228]:
df = data_frame_generator()
In [229]:
#df.head(2)

The following function creates the data frame of features based on the code snippets provided for testing the classifier.

In [230]:
def tdata_frame_generator():
    # Build the test frame from the held-out snippets, adding the same feature
    # columns (in the same order) as the training frame.
    test_codelist = tcode_sucker()
    df = pd.read_csv("test.csv")
    df["Code"] = test_codelist
    for string in word_list:
        df[string] = df.Code.apply(lambda code: string_ratio(string, code))
    for char in symbol_list:
        df[char] = df.Code.apply(lambda code: character_ratio(code, char))
    for ending in endings:
        df['_' + ending] = df.Code.apply(lambda code: string_end(ending, code))
    return df
In [231]:
test_df = tdata_frame_generator()
In [232]:
x_train, x_test, y_train, y_test = create_xy(df, test_df, word_list[0], 'Language')
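
create_xy is defined in classifier.py. Based on how it is called here, I'd expect it to pull the feature columns (everything from the first word feature onward) and the 'Language' labels out of the training and test frames. A hypothetical sketch, which may differ from the real implementation:

# Hypothetical sketch of create_xy (the version in classifier.py may differ).
def create_xy(train_df, test_df, first_feature, label):
    x_train = train_df.loc[:, first_feature:]  # feature columns only
    y_train = train_df[label]
    x_test = test_df.loc[:, first_feature:]
    y_test = test_df[label]
    return x_train, x_test, y_train, y_test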

I used a Decision Tree classifier and a Gaussian Naive Bayes classifier. The Gaussian NB classifier scored higher, so that is the one my guess_lang.py program uses when run from the console. I also made a random guesser, included as a morale booster for whenever I feel like my classifiers are not sufficiently effective.
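
run_classifier also lives in classifier.py. Judging from the output it prints below (a classification report, a confusion matrix, and what appears to be a weighted F1 score), a rough sketch of its behavior is:

# Rough sketch of run_classifier from classifier.py (assumed behavior, inferred
# from the printed output in the cells below).
from sklearn.metrics import classification_report, confusion_matrix, f1_score

def run_classifier(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(f1_score(y_test, predictions, average='weighted'))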

In [233]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
In [234]:
tree = DecisionTreeClassifier()
run_classifier(tree, x_train, x_test, y_train, y_test)
             precision    recall  f1-score   support

    clojure       1.00      0.75      0.86         4
    haskell       1.00      0.67      0.80         3
       java       0.50      1.00      0.67         2
 javascript       1.00      0.50      0.67         4
      ocaml       1.00      0.50      0.67         2
       perl       0.00      0.00      0.00         0
        php       1.00      0.33      0.50         3
     python       0.50      0.50      0.50         4
       ruby       0.50      0.67      0.57         3
      scala       0.33      1.00      0.50         2
     scheme       0.00      0.00      0.00         3

avg / total       0.71      0.57      0.58        30

[[3 0 0 0 0 0 0 0 1 0 0]
 [0 2 0 0 0 0 0 1 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 2 0 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 1 0]
 [0 0 1 0 0 0 0 2 1 0 0]
 [0 0 0 0 0 1 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 0 2 0]
 [0 0 1 0 0 0 0 0 0 2 0]]
0.579206349206

In [235]:
gauss = GaussianNB()
run_classifier(gauss, x_train, x_test, y_train, y_test)
             precision    recall  f1-score   support

    clojure       1.00      1.00      1.00         4
    haskell       1.00      0.67      0.80         3
       java       0.67      1.00      0.80         2
 javascript       1.00      0.25      0.40         4
      ocaml       1.00      1.00      1.00         2
       perl       0.00      0.00      0.00         0
        php       0.38      1.00      0.55         3
     python       1.00      1.00      1.00         4
       ruby       1.00      0.33      0.50         3
      scala       0.00      0.00      0.00         2
     scheme       1.00      1.00      1.00         3

avg / total       0.85      0.73      0.72        30

[[4 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 1 0]
 [0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 3 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 1 1 0 1 0 0]
 [0 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 3]]
0.724545454545

In [236]:
random_guesser(y_test)
Out[236]:
0.2
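
For reference, random_guesser (imported from random_guesser.py) presumably picks a label uniformly at random for each test snippet and reports the fraction it gets right. A sketch under that assumption:

import random

# Sketch of random_guesser from random_guesser.py (assumed behavior).
def random_guesser(y_test):
    labels = list(set(y_test))
    guesses = [random.choice(labels) for _ in y_test]
    hits = sum(guess == actual for guess, actual in zip(guesses, y_test))
    return hits / len(y_test)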

Attempting to pickle my classifier

In [237]:
import pickle
In [238]:
pickleclf = GaussianNB()
In [239]:
pickleclf.fit(x_train, y_train)
Out[239]:
GaussianNB()
In [240]:
file_Name = "picklefile"
fileObject = open(file_Name,'wb')
In [241]:
pickle.dump(pickleclf, fileObject)
In [242]:
fileObject.close()
In [243]:
fileObject = open('picklefile','rb')
clf = pickle.load(fileObject)
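
As a quick sanity check (not part of guess_lang.py itself), the reloaded classifier can be used exactly like the one trained above:

# The unpickled model should score the same as the GaussianNB trained above.
print(clf.score(x_test, y_test))
fileObject.close()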