T-SQL employee hierarchy recursive query

Posted: 2020-05-23 00:32:43

Tags: sql sql-server tsql

I would like a T-SQL query (it can be a recursive CTE or anything else) that produces the highlighted output below.

The SQL to create the sample table is as follows.
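A minimal sketch, reconstructed from the sample data used in the answers below (the exact column types are assumed):

create table #hierarchy (id int, value char(10)); -- char pads with trailing spaces, hence the trim() calls in the answers

insert into #hierarchy (id, value) values
    (1, 'a1'), (2, 'b2'), (3, 'c3'), (4, 'd4'),
    (5, 'e5'), (6, 'f6'), (7, 'g7'), (8, 'h8');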

[Screenshot: the desired output, each row showing the cumulative path of values, e.g. a1->b2->c3]

Any suggestions are welcome. I have tried a recursive CTE along these lines, but I would appreciate your input.

Thanks.

2 Answers:

Answer 0 (score: 1)

You seem to be looking for cumulative string concatenation:

select
    id,
    value,
    (
        select string_agg(trim(value), '->') within group(order by id)
        from #hierarchy h1
        where h1.id <= h.id
    ) path
from #hierarchy h

Result:

id | value | path
-: | :---- | :-----------------------------
 1 | a1    | a1
 2 | b2    | a1->b2
 3 | c3    | a1->b2->c3
 4 | d4    | a1->b2->c3->d4
 5 | e5    | a1->b2->c3->d4->e5
 6 | f6    | a1->b2->c3->d4->e5->f6
 7 | g7    | a1->b2->c3->d4->e5->f6->g7
 8 | h8    | a1->b2->c3->d4->e5->f6->g7->h8

In versions of SQL Server that do not support `string_agg()` (before SQL Server 2017), you can do the following:

select
    id,
    value,
    stuff(
        (
            select '->' + ltrim(rtrim(h1.value))
            from #hierarchy h1
            where h1.id <= h.id
            order by h1.id
            for xml path(''), type
        ).value('.', 'nvarchar(max)')
        , 1, 2, ''
    ) path
from #hierarchy h

Demo on DB Fiddle

Answer 1 (score: 0)

You seem to want cumulative string aggregation:

select string_agg(value, '->') over (order by id)
from hierarchy

But SQL Server does not support this. Instead, you can use cross apply:

select h.*, h2.str
from hierarchy h cross apply
     (select string_agg(trim(value), '->') within group (order by id) as str
      from hierarchy h2
      where h2.id <= h.id
     ) h2

Here is a db<>fiddle.

If your values do not overlap, it may be more convenient to do the string concatenation once and then take the portion you need. For your sample data, this works:

select h.*, s.str, 
       left(s.str, patindex('%' + trim(h.value) + '%', s.str) + len(trim(h.value)) - 1)
from hierarchy h cross join
     (select string_agg(trim(value), '->') within group (order by id) as str
      from hierarchy
     ) s;

(This can be made to work with overlapping strings, but the logic is a bit trickier, as sketched below.)
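For instance, one sketch of that trickier logic (assuming the values themselves never contain '->' and each value appears only once) anchors the search on the delimiters, so that a value such as 'b2' cannot match inside a longer value:

select h.*, s.str,
       -- pad both sides with '->' so only whole elements match;
       -- the match position in the padded string equals the value's start position in str
       left(s.str,
            charindex('->' + trim(h.value) + '->', '->' + s.str + '->')
            + len(trim(h.value)) - 1) as path
from hierarchy h cross join
     (select string_agg(trim(value), '->') within group (order by id) as str
      from hierarchy
     ) s;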

You can also use a recursive CTE:

with cte as (
      select h.id, trim(h.value) as value, convert(varchar(max), trim(h.value)) as str
      from hierarchy h
      where id = 1
      union all
      select h.id, trim(h.value) as value, cte.str + '->' + trim(h.value) as str
      from cte join
           hierarchy h
           on h.id = cte.id + 1
     )
select *
from cte;

This takes advantage of the fact that the ids are sequential with no gaps. If that is not guaranteed, you can use row_number() to generate such a sequence, as sketched below.
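A sketch of that variant, assuming the same hierarchy table: number the rows first, then recurse over the gap-free sequence.

with numbered as (
      -- row_number() produces a dense 1..n sequence even if id has gaps
      select id, trim(value) as value,
             row_number() over (order by id) as seqnum
      from hierarchy
     ),
     cte as (
      select id, value, seqnum, convert(varchar(max), value) as str
      from numbered
      where seqnum = 1
      union all
      select n.id, n.value, n.seqnum, cte.str + '->' + n.value
      from cte join
           numbered n
           on n.seqnum = cte.seqnum + 1
     )
select id, value, str
from cte;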