Question

我在使用 R，数据中有一行是如下所示的字符串，我需要把它拆分成多列

'id': 1050442590754103297, 'id_str': '1050442590754103297', 'name': 'ام رودينا <U+267F>', 'screen_name': 'uclkGkQ5', 'location': None, 'url': None, 'description': '\u200f\u200fمن زوي الاحتياجات الخاصه<U+267F>', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 1567, 'friends_count': 4019, 'listed_count': 0, 'favourites_count': 6669, 'statuses_count': 9279, 'created_at': 'Thu Oct 11 17:46:44 +0000 2018', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 'ar', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1059769079790268416/sJpep_V8_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1059769079790268416/sJpep_V8_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1050442590754103297/1539390015', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None

我尝试了此代码，但我需要指定所需的列数，并且还需要在最后对列进行重命名，因此这很困难且耗时

# Split try$user on "," into a fixed number of pieces (4 here); everything
# after the third comma stays together in the last piece, and the result has
# no column names. str_split_fixed is presumably stringr's -- confirm that
# the package is loaded.
d<-str_split_fixed(try$user, ",", 4)

我得到的结果是，它没有列名：

'id': 1050442590754103297    'id_str': '1050442590754103297'   'name': 'ام رودينا <U+267F>'

第四列包含字符串的其余部分

'screen_name': 'uclkGkQ5', 'location': None, 'url': None, 'description': '\u200f\u200fمن زوي الاحتياجات الخاصه<U+267F>', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 1567, 'friends_count': 4019, 'listed_count': 0, 'favourites_count': 6669, 'statuses_count': 9279, 'created_at': 'Thu Oct 11 17:46:44 +0000 2018', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 'ar', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1059769079790268416/sJpep_V8_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1059769079790268416/sJpep_V8_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1050442590754103297/1539390015', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None

我需要一段代码，按逗号拆分这一行，并把冒号（:）之前的单词作为列名，如下所示：

 id                         id_str                    name        screen_name     
 1050442590754103297      1050442590754103297       ام رودينا \u267f           uclkGkQ5

字符串的其余部分也按同样的方式处理。希望您能理解我的意思，谢谢。

Answer 1

这个字符串看起来像 JSON（但并不是合法的 JSON）。一种方法（如果下面的假设成立）是先把它“转换为 JSON”，然后再进行解析。

先说明一点：我的 R 会话存在一个问题，阿拉伯字母无法正确地存储在字符串中。这个问题发生在调用 gsub 等函数之前，因此我认为这段代码在您的计算机上应该可以正常工作。也正因如此，您会在下面的输出中看到空白字符串。（最好在您本地运行一下；我只是想先把代码贴出来。）

假设：

没有嵌入的双引号
True / False / None文字字符串从不嵌入到文本中，除了逻辑值之外（例如，'screen_name':'Is None'不会发生）
您可以接受数据中原先为 :None 的地方变成 NULL

重要的转化

将 True 和 False 转为小写
将 None 转为 null
将整个内容封装在字典中，并带有{和}
将所有单引号转换为双引号

可以在这里使用 magrittr 管道以提高可读性，或者您也可以直接嵌套所有函数（仅略快一点）：

# Turn the Python-dict-like string `s` into JSON text and parse it.
# `s` is not defined in this snippet; it is assumed to hold one record like
# the sample string in the question.
#
# Nested form -- the gsub() calls run innermost first:
#   1. every single quote becomes a double quote
#   2. ": None"  -> ":null"
#   3. ": False" -> ":false"
#   4. ": True"  -> ":true"
# Each literal pattern requires a preceding colon, allows surrounding
# whitespace, and re-emits an optional trailing comma through \\1.
# paste0() then wraps the text in { } so that it forms a single JSON object.
# NOTE: large integers such as `id` come back as doubles (num 1.05e+18).
out <- jsonlite::fromJSON(
  paste0("{", gsub(":\\s*True\\s*(,?)", ":true\\1",
                   gsub(":\\s*False\\s*(,?)", ":false\\1",
                        gsub(":\\s*None\\s*(,?)", ":null\\1",
                             gsub("'", '"', s)))),
         "}"))
# or
# Piped form: the same four replacements written top to bottom. Here the
# quote conversion runs last instead of first; the True/False/None patterns
# contain no quote characters, so the order does not change the result.
library(magrittr)
out <- s %>%
  gsub(":\\s*True\\s*(,?)", ":true\\1", .) %>%
  gsub(":\\s*False\\s*(,?)", ":false\\1", .) %>%
  gsub(":\\s*None\\s*(,?)", ":null\\1", .) %>%
  gsub("'", '"', .) %>%
  paste0("{", ., "}") %>%
  jsonlite::fromJSON(.)

结果（用 str 以紧凑形式显示）：

str(out)
# List of 39
#  $ id                                : num 1.05e+18
#  $ id_str                            : chr "1050442590754103297"
#  $ name                              : chr "          "
#  $ screen_name                       : chr "uclkGkQ5"
#  $ location                          : NULL
#  $ url                               : NULL
#  $ description                       : chr "<U+200F><U+200F>                        "
#  $ translator_type                   : chr "none"
#  $ protected                         : logi FALSE
#  $ verified                          : logi FALSE
#  $ followers_count                   : int 1567
#  $ friends_count                     : int 4019
#  $ listed_count                      : int 0
#  $ favourites_count                  : int 6669
#  $ statuses_count                    : int 9279
#  $ created_at                        : chr "Thu Oct 11 17:46:44 +0000 2018"
#  $ utc_offset                        : NULL
#  $ time_zone                         : NULL
#  $ geo_enabled                       : logi FALSE
#  $ lang                              : chr "ar"
#  $ contributors_enabled              : logi FALSE
#  $ is_translator                     : logi FALSE
#  $ profile_background_color          : chr "F5F8FA"
#  $ profile_background_image_url      : chr ""
#  $ profile_background_image_url_https: chr ""
#  $ profile_background_tile           : logi FALSE
#  $ profile_link_color                : chr "1DA1F2"
#  $ profile_sidebar_border_color      : chr "C0DEED"
#  $ profile_sidebar_fill_color        : chr "DDEEF6"
#  $ profile_text_color                : chr "333333"
#  $ profile_use_background_image      : logi TRUE
#  $ profile_image_url                 : chr "http://pbs.twimg.com/profile_images/1059769079790268416/sJpep_V8_normal.jpg"
#  $ profile_image_url_https           : chr "https://pbs.twimg.com/profile_images/1059769079790268416/sJpep_V8_normal.jpg"
#  $ profile_banner_url                : chr "https://pbs.twimg.com/profile_banners/1050442590754103297/1539390015"
#  $ default_profile                   : logi TRUE
#  $ default_profile_image             : logi FALSE
#  $ following                         : NULL
#  $ follow_request_sent               : NULL
#  $ notifications                     : NULL

注意：

提醒：输出中的空白字符串是我本地“区域设置”（locale）造成的问题，不一定是这个处理过程本身的问题（抱歉，我不常在控制台上处理非 ASCII 字符）
大整数id字段将转换为numeric，并且在不使用bit64包的情况下不能为整数，我不知道这是否会对您造成问题
我在做替换时尽量“谨慎”，正则表达式模式写得非常具体：要求前面有冒号、文字本身不带引号，并允许后面跟一个可选的逗号（因为最后一个 None 后面没有任何内容）；模式越具体，越能降低误替换字符串内容中这些字面量的可能性
我再次强调只有单引号的假设...如果存在双引号，它们会弄乱整个事情，而且我还没有考虑过处理它们

在 R 中将一行字符串拆分为多列

1 个答案: