我的脚本会输出年份,以及该年份对应的文章数量:
/// <summary>
/// Draws one page of <paramref name="pdfDoc"/> onto <paramref name="bitmap"/> by
/// building a scale/flip/rotate matrix and passing it to the page's <c>Draw</c> call.
/// </summary>
/// <param name="pdfDoc">Document that supplies the page to draw.</param>
/// <param name="bitmap">Target image; its pixel width determines the drawing scale.</param>
/// <param name="pagePos">
/// Index of the page to draw. When the document reports exactly one page,
/// page 0 is used and this value is ignored.
/// </param>
public override void ImageDraw(Pdoc pdfDoc, Bitmap bitmap, int pagePos)
{
    // A single-page document always uses its only page, whatever pagePos says.
    PDFPage previewPage = pdfDoc.GetPageNum() == 1
        ? pdfDoc.GetPage(0)
        : pdfDoc.GetPage(pagePos);

    // The using statement releases the Graphics handle even when one of the
    // calls below throws; previously Dispose() ran only on the success path.
    using (Graphics targetGraphics = Graphics.FromImage(bitmap))
    {
        // Temporarily work in points so pixel and point sizes can be converted
        // in both directions through the Graphics object.
        targetGraphics.PageUnit = GraphicsUnit.Point;

        // Bitmap size: device (pixel) coordinates -> page (point) coordinates.
        PointF[] pointImageSize = { new PointF(bitmap.Width, bitmap.Height) };
        targetGraphics.TransformPoints(CoordinateSpace.Page, CoordinateSpace.Device, pointImageSize);

        // Page size: page (point) coordinates -> device (pixel) coordinates.
        PointF[] pixelPageSize = { new PointF(previewPage.Width, previewPage.Height) };
        targetGraphics.TransformPoints(CoordinateSpace.Device, CoordinateSpace.Page, pixelPageSize);

        targetGraphics.PageUnit = GraphicsUnit.Pixel;

        // Scale so that the page width fills the bitmap width.
        float scale = bitmap.Width / pixelPageSize[0].X;

        // Vertical offset fed into the matrix; stays 0 for any other target value.
        float matrixHeight = 0.0F;
        switch (this._PreviewTarget)
        {
            case PreivewTarget.Header:
                matrixHeight = (previewPage.Height * scale);
                break;
            case PreivewTarget.Footer:
                // NOTE(review): algebraically this is just pointImageSize[0].Y;
                // the expression is kept as written to preserve float rounding.
                matrixHeight = previewPage.Height - (previewPage.Height - pointImageSize[0].Y);
                break;
        }

        // Negative Y scale plus the offset; presumably flips the PDF's bottom-up
        // axis into the bitmap's top-down axis -- confirm against the Matrix type in use.
        // NOTE(review): if Matrix is System.Drawing.Drawing2D.Matrix it is IDisposable
        // and should be disposed as well; its namespace is not visible here.
        Matrix matrix = new Matrix(scale, 0, 0, -scale, 0, matrixHeight);
        matrix.Rotate(90);

        RectangleF drawRect = new RectangleF(0, 0, previewPage.Width, previewPage.Height);
        previewPage.Draw(targetGraphics, matrix, drawRect, PDPage.DrawFlags.kPDPageUseAnnotFaces);
    }
}
我希望把每个年份作为新的一列,添加到目前只包含检索词(Terms)的现有数据框中。
预期输出:
abcd
2013
118
2014
23
xyz
2013
1
2014
45
我的脚本的输入是一个csv文件:
Terms 2013 2014 2015
abc 118 76 90
xyz 23 0 36
我写的脚本是:
Terms
xyz
abc
efg
# Query a per-year "Count" for every term listed in a.csv and print the results
# as: term, then (year, count) pairs for each year in the range.

# header=None makes pandas treat every row, including the first, as data.
# NOTE(review): if a.csv starts with a "Terms" header row it will be queried
# like any other term -- confirm whether that is intended.
df = pd.read_csv('a.csv', header = None)

# NOTE(review): this URL template appears truncated (no host or path); it is
# reproduced unchanged and must be completed before the script can run.
u = "http: term=%s&mindate=%d/01/01&maxdate=%d/12/31"
startYear = 2013
endYear = 2018

for row in df.itertuples():
    # row[0] is the index, so row[1] is the first CSV column (the term).
    term = (str(row[1]))
    print(term)
    for year in range(startYear, endYear+1):
        # Spaces in the term are replaced by '+' for the query string.
        url = u % (term.replace(" ", "+"), year, year)
        # The with-block closes the HTTP response once the body has been read.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        doc = ET.XML(page)
        count = doc.find("Count").text
        print(year)
        print(count)
是:
df.head
任何帮助将不胜感激,在此先感谢!!
答案 0 :(得分:1)
我会先用 numpy 把 csv 读入数组,再用 numpy 对其进行重塑(reshape),最后把得到的矩阵/二维数组转换为 DataFrame。
答案 1 :(得分:0)
应该执行以下操作:
#!/usr/bin/env python
def mkdf(filename):
    """Yield one record dict per term found in the text file ``filename``.

    The file is read line by line. A line that does not parse as an integer
    starts a new term; the integer lines that follow it are collected and
    paired up in order (1st with 2nd, 3rd with 4th, ...) as key/value entries
    of that term's record. All keys and values are kept as stripped strings.

    Blank lines are skipped, and integer lines that appear before the first
    term are ignored, so an empty file yields no records at all.

    :param filename: path of the text file to parse.
    :returns: generator of dicts shaped like ``{"term": ..., key: value, ...}``.
    """
    def combine(term, values):
        # Even positions are keys, odd positions their values; a trailing
        # unpaired item is dropped by zip().
        record = {"term": term}
        record.update(dict(zip(values[::2], values[1::2])))
        return record

    term = None
    other = []
    with open(filename) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # A blank line is neither a term nor a number.
                continue
            try:
                int(line)
            except ValueError:
                # Not an int: this line starts a new term, so emit the
                # record collected for the previous one (if any).
                if term is not None:
                    yield combine(term, other)
                term = line
                other = []
            else:
                # Numbers only make sense once a term has been seen.
                if term is not None:
                    other.append(line)
    # Emit the last record; nothing to emit when no term was ever found.
    if term is not None:
        yield combine(term, other)
if __name__ == "__main__":
    # Command-line entry point: parse the file named by the first argument
    # and print the resulting DataFrame.
    import pandas as pd
    import sys

    # Exit with a usage message instead of an IndexError when no path is given.
    if len(sys.argv) < 2:
        sys.exit("usage: %s INPUT_FILE" % sys.argv[0])

    df = pd.DataFrame(list(mkdf(sys.argv[1])))
    print(df)
用法:python scriptname.py /tmp/IN(或其他包含您数据的文件)
输出:
2013 2014 term
0 118 23 abcd
1 1 45 xyz