In [224]:
from data_load import *
from features import *
from classifier import *
from random_guesser import *
In [225]:
import os
import pandas as pd
import re
import random

The lists below are the basis for my features. For word_list and symbol_list, the program creates a feature from the number of occurrences of each word or symbol divided by the number of characters in the code snippet. For the endings list, the feature is 10 if the code snippet ends with one of those strings and 0 otherwise.
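
These features are computed with string_ratio, character_ratio, and string_end, which are imported from features.py above. That module isn't shown here, but a minimal sketch matching the description (and the call signatures used below) would look something like this:

# Sketch of the helpers imported from features.py (assumed behavior, inferred
# from the description above; the real module may handle edge cases differently).
def string_ratio(string, code):
    # Occurrences of the word divided by the number of characters in the snippet.
    return code.count(string) / len(code) if code else 0

def character_ratio(code, char):
    # Occurrences of the symbol divided by the number of characters in the snippet.
    return code.count(char) / len(code) if code else 0

def string_end(ending, code):
    # 10 if the snippet ends with the given string (ignoring trailing whitespace,
    # which is an assumption on my part), 0 otherwise.
    return 10 if code.rstrip().endswith(ending) else 0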

In [226]:
word_list = ['let', 'end', 'defn', 'function', 'fun', 'return', 'def', 'check', 'make', '->', '.format',
             'define', '::', 'done', 'type', 'rescue', 'print', 'elif', 'clone', 'display', '$format', 'echo', 'str',
             'join', '&&', 'val', 'Nil', 'object', '<-', '--', 'lambda', 'var', '//', 'tmpl', 'public function',
             'stdlib', '=>', 'final', 'case', 'impl']
symbol_list = ['$', '^', ',', ';', '&', '|', '!', '*', '@', '#', '(', '{', ' ']
endings = ['end', ')', '}']

The following function creates the data frame of features based on the corpus of code snippets pulled from the Computer Language Benchmarks Game.

In [227]:
def data_frame_generator():
    # Build the training frame from the Benchmarks Game corpus: one row per
    # snippet, with the language label, the raw code, and one column per feature.
    codelist = code_sucker()
    typelist = type_getter()
    df = pd.DataFrame(typelist, index=range(len(typelist)))
    df.columns = ["Language"]
    df["Code"] = codelist
    df['Language'] = df.Language.apply(lambda x: x.lower())
    # Word features: occurrences of each word divided by snippet length.
    for string in word_list:
        df[string] = df.Code.apply(lambda code: string_ratio(string, code))
    # Symbol features: occurrences of each symbol divided by snippet length.
    for char in symbol_list:
        df[char] = df.Code.apply(lambda code: character_ratio(code, char))
    # Ending features: 10 if the snippet ends with the string, 0 otherwise.
    for ending in endings:
        df['_' + ending] = df.Code.apply(lambda code: string_end(ending, code))
    return df
In [228]:
df = data_frame_generator()
In [229]:
#df.head(2)

The following function creates the data frame of features based on the code snippets provided for testing the classifier.

In [230]:
def tdata_frame_generator():
    # Build the test frame from the held-out snippets, adding the same feature
    # columns (in the same order) as the training frame.
    test_codelist = tcode_sucker()
    df = pd.read_csv("test.csv")
    df["Code"] = test_codelist
    for string in word_list:
        df[string] = df.Code.apply(lambda code: string_ratio(string, code))
    for char in symbol_list:
        df[char] = df.Code.apply(lambda code: character_ratio(code, char))
    for ending in endings:
        df['_' + ending] = df.Code.apply(lambda code: string_end(ending, code))
    return df
In [231]:
test_df = tdata_frame_generator()
In [232]:
x_train, x_test, y_train, y_test = create_xy(df, test_df, word_list[0], 'Language')
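
create_xy is defined in classifier.py. Based on how it is called here, I'd expect it to pull the feature columns (everything from the first word feature onward) and the 'Language' labels out of the training and test frames. A hypothetical sketch, which may differ from the real implementation:

# Hypothetical sketch of create_xy (the version in classifier.py may differ).
def create_xy(train_df, test_df, first_feature, label):
    x_train = train_df.loc[:, first_feature:]  # feature columns only
    y_train = train_df[label]
    x_test = test_df.loc[:, first_feature:]
    y_test = test_df[label]
    return x_train, x_test, y_train, y_test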

I used a Decision Tree classifier and a Gaussian Naive Bayes classifier. The Gaussian NB classifier scored higher, so that is the one my guess_lang.py program uses when run from the console. I also made a random guesser, included as a morale booster for whenever I feel like my classifiers are not sufficiently effective.
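
run_classifier also lives in classifier.py. Judging from the output it prints below (a classification report, a confusion matrix, and what appears to be a weighted F1 score), a rough sketch of its behavior is:

# Rough sketch of run_classifier from classifier.py (assumed behavior, inferred
# from the printed output in the cells below).
from sklearn.metrics import classification_report, confusion_matrix, f1_score

def run_classifier(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(f1_score(y_test, predictions, average='weighted'))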

In [233]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
In [234]:
tree = DecisionTreeClassifier()
run_classifier(tree, x_train, x_test, y_train, y_test)
             precision    recall  f1-score   support

    clojure       1.00      0.75      0.86         4
    haskell       1.00      0.67      0.80         3
       java       0.50      1.00      0.67         2
 javascript       1.00      0.50      0.67         4
      ocaml       1.00      0.50      0.67         2
       perl       0.00      0.00      0.00         0
        php       1.00      0.33      0.50         3
     python       0.50      0.50      0.50         4
       ruby       0.50      0.67      0.57         3
      scala       0.33      1.00      0.50         2
     scheme       0.00      0.00      0.00         3

avg / total       0.71      0.57      0.58        30

[[3 0 0 0 0 0 0 0 1 0 0]
 [0 2 0 0 0 0 0 1 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 2 0 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 1 0]
 [0 0 1 0 0 0 0 2 1 0 0]
 [0 0 0 0 0 1 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 0 2 0]
 [0 0 1 0 0 0 0 0 0 2 0]]
0.579206349206

In [235]:
gauss = GaussianNB()
run_classifier(gauss, x_train, x_test, y_train, y_test)
             precision    recall  f1-score   support

    clojure       1.00      1.00      1.00         4
    haskell       1.00      0.67      0.80         3
       java       0.67      1.00      0.80         2
 javascript       1.00      0.25      0.40         4
      ocaml       1.00      1.00      1.00         2
       perl       0.00      0.00      0.00         0
        php       0.38      1.00      0.55         3
     python       1.00      1.00      1.00         4
       ruby       1.00      0.33      0.50         3
      scala       0.00      0.00      0.00         2
     scheme       1.00      1.00      1.00         3

avg / total       0.85      0.73      0.72        30

[[4 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 1 0]
 [0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 3 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 1 1 0 1 0 0]
 [0 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 3]]
0.724545454545

In [236]:
random_guesser(y_test)
Out[236]:
0.2
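
For reference, random_guesser (imported from random_guesser.py) presumably picks a label uniformly at random for each test snippet and reports the fraction it gets right. A sketch under that assumption:

import random

# Sketch of random_guesser from random_guesser.py (assumed behavior).
def random_guesser(y_test):
    labels = list(set(y_test))
    guesses = [random.choice(labels) for _ in y_test]
    hits = sum(guess == actual for guess, actual in zip(guesses, y_test))
    return hits / len(y_test)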

Attempting to pickle my classifier

In [237]:
import pickle
In [238]:
pickleclf = GaussianNB()
In [239]:
pickleclf.fit(x_train, y_train)
Out[239]:
GaussianNB()
In [240]:
file_Name = "picklefile"
fileObject = open(file_Name,'wb')
In [241]:
pickle.dump(pickleclf, fileObject)
In [242]:
fileObject.close()
In [243]:
fileObject = open('picklefile','rb')
clf = pickle.load(fileObject)
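
As a quick sanity check (not part of guess_lang.py itself), the reloaded classifier can be used exactly like the one trained above:

# The unpickled model should score the same as the GaussianNB trained above.
print(clf.score(x_test, y_test))
fileObject.close()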