我有一个代码,可以使用R中的lsa包在短引文上成功执行潜在文本分析(见下文)。但是,我更愿意在较大文档的文本中使用此方法。在每个引用空间中复制粘贴整个事物是非常低效的 - 它可以工作,但需要永久运行。有没有什么方法可以直接从数据库或数据框导入每个“引用”(在本例中为文档)?如果是这样,它应该采用何种格式?导入R时,Txt格式文档会自动分成段落,我不确定这与lsa包执行的分析是否兼容。
#!/usr/bin/env python
from numbers import Integral
def tuple_to_int(tup):
""":Return: the unique non-negative integer encoding of a tuple of non-negative integers."""
if len(tup) == 0: # normally do if not tup, but doesn't work with np
raise ValueError('Cannot encode empty tuple')
if len(tup) == 1:
x = tup[0]
if not isinstance(x, Integral):
raise ValueError('Can only encode integers')
return x
elif len(tup) == 2:
# print("len=2")
x, y = tuple_to_int(tup[0:1]), tuple_to_int(tup[1:2]) # Just to validate x and y
X = 2 * x if x >= 0 else -2 * x - 1 # map x to positive integers
Y = 2 * y if y >= 0 else -2 * y - 1 # map y to positive integers
Z = (X * X + X + Y) if X >= Y else (X + Y * Y) # encode
# Map evens onto positives
if (x >= 0 and y >= 0):
return Z // 2
elif (x < 0 and y >= 0 and X >= Y):
return Z // 2
elif (x < 0 and y < 0 and X < Y):
return Z // 2
# Map odds onto negative
else:
return (-Z - 1) // 2
else:
return tuple_to_int((tuple_to_int(tup[:2]),) + tuple(tup[2:])) # ***speed up tuple(tup[2:])?***
def int_to_tuple(num, size=2):
""":Return: the unique tuple of length `size` that encodes to `num`."""
if not isinstance(num, Integral):
raise ValueError('Can only encode integers (got {})'.format(num))
if not isinstance(size, Integral) or size < 1:
raise ValueError('Tuple is the wrong size ({})'.format(size))
if size == 1:
return (num,)
elif size == 2:
# Mapping onto positive integers
Z = -2 * num - 1 if num < 0 else 2 * num
# Reversing Pairing
s = isqrt(Z)
if Z - s * s < s:
X, Y = Z - s * s, s
else:
X, Y = s, Z - s * s - s
# Undoing mappint to positive integers
x = (X + 1) // -2 if X % 2 else X // 2 # True if X not divisible by 2
y = (Y + 1) // -2 if Y % 2 else Y // 2 # True if Y not divisible by 2
return x, y
else:
x, y = int_to_tuple(num, 2)
return int_to_tuple(x, size - 1) + (y,)
def isqrt(n):
"""":Return: the largest integer x for which x * x does not exceed n."""
# Newton's method, via http://stackoverflow.com/a/15391420
x = n
y = (x + 1) // 2
while y < x:
x = y
y = (x + n // x) // 2
return x