ValueError: Found array with 0 sample(s) (shape=(0, 262)) while a minimum of 1 is required.
from collections import Counter
import pandas
from nltk.corpus import stopwords
import pandas as pd
import numpy
headlines = []
apps = pd.read_csv('DataUse.csv')
for e in apps['title_lower']:
testdata = pd.read_csv('testdata.csv')
# Find all the unique words in the headlines.
unique_words = list(set(" ".join(headlines).split(" ")))
def make_matrix(headlines, vocab):
matrix = []
for headline in headlines:
# Count each word in the headline, and make a dictionary.
counter = Counter(headline)
# Turn the dictionary into a matrix row using the vocab.
row = [counter.get(w, 0) for w in vocab]
df = pandas.DataFrame(matrix)
df.columns = unique_words
return df
print(make_matrix(headlines, unique_words))
import re
# Lowercase, then replace any non-letter, space, or digit character in the headlines.
new_headlines = [re.sub(r'[^\w\s\d]','',h.lower()) for h in headlines]
# Replace sequences of whitespace with a space character.
new_headlines = [re.sub("\s+", " ", h) for h in new_headlines]
unique_words = list(set(" ".join(new_headlines).split(" ")))
# We've reduced the number of columns in the matrix a bit.
print(make_matrix(new_headlines, unique_words))
stopwords = set(stopwords.words('english'))
stopwords = [re.sub(r'[^\w\s\d]','',s.lower()) for s in stopwords]
unique_words = list(set(" ".join(new_headlines).split(" ")))
# Remove stopwords from the vocabulary.
unique_words = [w for w in unique_words if w not in stopwords]
# We're down to 34 columns, which is way better!
print(make_matrix(new_headlines, unique_words))
from sklearn.feature_extraction.text import CountVectorizer
# Construct a bag of words matrix.
# This will lowercase everything, and ignore all punctuation by default.
# It will also remove stop words.
vectorizer = CountVectorizer(lowercase=True, stop_words="english")
matrix = vectorizer.fit_transform(headlines)
# We created our bag of words matrix with far fewer commands.
# Let's apply the same method to all the headlines in all 100000 submissions.
# We'll also add the url of the submission to the end of the headline so we can take it into account.
full_matrix = vectorizer.fit_transform(apps['title_lower'])
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Convert the upvotes variable to binary so it works with a chi-squared test.
col = apps["total_shares"].copy(deep=True)
col_mean = col.mean()
col[col < col_mean] = 0
col[(col > 0) & (col > col_mean)] = 1
print col
# Find the 1000 most informative columns
selector = SelectKBest(chi2, k='all')
selector.fit(full_matrix, col)
top_words = selector.get_support().nonzero()
# Pick only the most informative columns in the data.
chi_matrix = full_matrix[:,top_words[0]]
import numpy as numpy
transform_functions = [
lambda x: len(x),
lambda x: x.count(" "),
lambda x: x.count("."),
lambda x: x.count("!"),
lambda x: x.count("?"),
lambda x: len(x) / (x.count(" ") + 1),
lambda x: x.count(" ") / (x.count(".") + 1),
lambda x: len(re.findall("\d", x)),
lambda x: len(re.findall("[A-Z]", x)),
# Apply each function and put the results into a list.
columns = []
for func in transform_functions:
# Convert the meta features to a numpy array.
meta = numpy.asarray(columns).T
features = numpy.hstack([chi_matrix.todense()])
from sklearn.linear_model import Ridge
import random
train_rows = 262
# Set a seed to get the same "random" shuffle every time.
# Shuffle the indices for the matrix.
indices = list(range(features.shape[0]))
# Create train and test sets.
train = features[indices[:train_rows], :]
test = features[indices[train_rows:], :]
print test
train_upvotes = apps['total_shares'].iloc[indices[:train_rows]]
test_upvotes = apps['total_shares'].iloc[indices[train_rows:]]
train = numpy.nan_to_num(train)
print (test)
# Run the regression and generate predictions for the test set.
reg = Ridge(alpha=.1)
reg.fit(train, train_upvotes)
predictions = reg.predict(test)
### We're going to use mean absolute error as an error metric.
### Our error is about 13.6 upvotes, which means that, on average,
### our prediction is 13.6 upvotes away from the actual number of upvotes.
##print(sum(abs(predictions - test_upvotes)) / len(predictions))
### As a baseline, we'll use the average number of upvotes
### across all submissions.
### The error here is 17.2 -- our estimate is better, but not hugely so.
### There either isn't a ton of predictive value encoded in the
### data we have, or we aren't extracting it well.
##average_upvotes = sum(test_upvotes)/len(test_upvotes)
##print(sum(abs(average_upvotes - test_upvotes)) / len(predictions))
Traceback (most recent call last):
File "C:/Users/Tucker Siegel/Desktop/Machines/Test.py", line 156, in <module>
predictions = reg.predict(test)
File "C:\Python27\lib\site-packages\sklearn\linear_model\base.py", line 200, in predict
return self._decision_function(X)
File "C:\Python27\lib\site-packages\sklearn\linear_model\base.py", line 183, in _decision_function
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
File "C:\Python27\lib\site-packages\sklearn\utils\validation.py", line 407, in check_array
ValueError: Found array with 0 sample(s) (shape=(0, 262)) while a minimum of 1 is required.