有効期限の異なる類似のレコードをマージするにはどうすればよいですか?


10

私が作業しているテーブルには、3つのコンポーネントがあります。

  1. IDカラム(別のテーブルの主キー)
  2. 一部のデータ列
  3. 有効な日付from/ to列。

値:

ID   Data From        To  
1    a    2015-01-01  2015-01-05
1    a    2015-01-06  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-05
2    c    2015-01-06  2015-01-10

テーブルは、一定の間隔で別のデータソースの「スナップショット」を取得し、有効期限をレコードに割り当てることによって更新されます。問題は、これらのスナップショットにより、その間隔中にまったく変更されなかった(有効期間が異なる)レコードの重複エントリが作成されることです。

連続した日付の行を探し、それらをマージして単一の有効期間を割り当てることにより、テーブルのサイズを小さくしたいと考えています。例えば:

ID   Data From        To  
1    a    2015-01-01  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-10

私が現在持っているロジックは:

  1. すべての行を選択し、ID、データフィールド、および[有効開始日]フィールドで並べ替えます(連続した行のグループに含まれるため)。
  2. カーソルを使用して、隣接する行の類似性を比較します。
  3. それらが同じである場合は、行をマージし、有効期間を変更して両方の行を含めます。

カーソルは非常に効率が悪い(データセットが大きい)ことを理解しているので、他の方法を探しています。


2
またCREATE TABLE、質問にステートメントを追加します。
ypercubeᵀᴹ

2
「大きな」データセットはどのくらいの大きさですか?スナップショットのインポートを修正して、そもそも問題を引き起こさないようにしてください。
ポールホワイト9

数百万件のレコード。テーブルの作成方法を変更する権限がありません。さらに、それは過去の記録の問題を解決しません。
hazrmard 2015

回答:


8

これが連続する範囲のみの表である場合、ケースは古典的な「ギャップとアイランド」の問題として扱うことができます。この場合、連続する範囲のアイランドを分離し、最小値[from][to]島ごとの最大。

2つのROW_NUMBER呼び出しを使用してこれを解決する確立された方法があります。

WITH islands AS
(
  SELECT
    id,
    data,
    [from],
    [to],
    island = ROW_NUMBER() OVER (PARTITION BY id       ORDER BY [from])
           - ROW_NUMBER() OVER (PARTITION BY id, data ORDER BY [from])
  FROM
    #mergeTest
)
SELECT
  id,
  data,
  [from] = MIN([from]),
  [to]   = MAX([to])
FROM
  islands
GROUP BY
  id,
  data,
  island
;

このクエリは、SQL Server 2005と同じ低バージョンで機能します。


1

この問題を解決するクエリを記述できました。複数の結合とwhileループを使用してレコードをマージします。このコードはSQL Server 2008 R2と互換性があります。

CREATE TABLE #mergeTest
(
    [id] int NOT NULL,
    [data] date,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO #mergeTest ([id],[data],[from],[to]) VALUES     --testing null data value handling
    (1,NULL,'2015-01-01','2015-01-05'), --1
    (1,NULL,'2015-01-05','2015-01-10'), --2
    (1,'2000-01-01','2015-01-10','2015-01-14'), --3
    (1,'2000-01-03','2015-01-14','2015-01-15'), --4
    (1,'2000-01-01','2015-01-15','2015-01-20'), --5
    (1,'2000-01-01','2015-01-20','2015-01-22'), --5
    (1,'2000-01-01','2015-01-22','2015-01-25'), --6
    (1,'2000-01-01','2015-01-25','2015-01-30'), --7
    (1,NULL,'2015-01-30','2015-02-04'), --8
    (2,'2000-01-05','2015-01-01','2015-01-05'), --9
    (2,'2000-01-05','2015-01-05','2015-01-10')  --10

SELECT * FROM #mergeTest 
GO
;

SELECT * INTO #tempSingle                               --isolate single records. Single records need no processing.
    FROM (
        SELECT  [id], [data], MIN([from]) as [from], MIN([to]) as [to],
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]=1;
ALTER TABLE #tempSingle
    DROP COLUMN [grpsz];
GO
;

SELECT * INTO #tempRemainingtemp                        --isolate records w/ more than 2 entries. They need to be reduced to single records
    FROM (
        SELECT  [id], [data],                           --get [id] and [data] of duplicate records
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]>=2;
ALTER TABLE #tempRemainingTemp
    DROP COLUMN [grpsz]
SELECT * FROM #tempRemainingtemp
SELECT * INTO #temp                                     --get all duplicate records into #temp
    FROM (
        SELECT [b].*
        FROM #tempRemainingtemp AS [a]
        JOIN #mergeTest AS [b]
        ON      [a].[id]=[b].[id]
            AND ([a].[data]=[b].[data] OR [a].[data] IS NULL AND [b].[data] IS NULL)) AS [selection];

DROP TABLE #tempRemainingtemp;
Go
SELECT * INTO #tempRemaining
    FROM #temp;
DROP TABLE #temp;
GO
;
SELECT * FROM #tempRemaining
BEGIN
SELECT t1.*, t2.[from] as [prevfrom] INTO #temp0        --filter in records where previous 'to' date matched current 'from' date when grouped by id and data
    FROM #tempRemaining AS t1
    JOIN #tempRemaining AS t2
    ON      t2.[to] = t1.[from]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)

SELECT t1.*, t2.[prevfrom] INTO #temp1                  --add records that did not have a previous 'to' date b/c they were the extreme records in their group
    FROM #tempRemaining AS t1
    LEFT JOIN #temp0 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp0;

SELECT t1.*, t2.[to] as [nextto] INTO #temp2            --filter in records where current 'to' date matched next 'from' date when grouped by id and data
    FROM #temp1 AS t1
    JOIN #temp1 AS t2
    ON      t2.[from] = t1.[to]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL);

SELECT t1.*, t2.[nextto] INTO #temp                     --add records that did not have a next 'from' date b/c they were the extreme records in their group
    FROM #temp1 AS t1
    LEFT JOIN #temp2 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp2;
DROP TABLE #temp1;

DELETE FROM #temp                                       --delete redundant records
    WHERE   [prevfrom] IS NOT NULL
        AND [nextto] IS NOT NULL;

WITH cte AS (                                           --select records that got reduced to singles and insert them into singles account
    SELECT [id], [data], [from], [to]
        FROM [#temp]
        WHERE   [prevfrom] IS NULL
            AND [nextto] IS NULL)
DELETE FROM cte
OUTPUT deleted.* INTO #tempSingle

/* ALL DUPLICATE RECORDS ARE NOW REDUCED TO PAIRS*/

SELECT * FROM #temp;
ALTER TABLE #temp
    DROP COLUMN [nextto],[prevfrom]                     --remove helper columns
END

SELECT TOP 1 * INTO #temptemp                           --create temporary tables for storage
    FROM #temp
SELECT TOP 1 * INTO #tempResult
    FROM #temp
TRUNCATE TABLE #temptemp
TRUNCATE TABLE #tempResult

WHILE EXISTS(SELECT [id] from #temp)
BEGIN
    WITH cte AS (
            SELECT TOP 2 *                              --select pair
                FROM #temp
                ORDER BY [id],[data],[from])
        DELETE FROM cte                                 --delete from original table
        OUTPUT deleted.* INTO #temptemp;
    INSERT INTO #tempResult                             --insert merged record into result table
        SELECT t1.[id], t1.[data], t1.[from], t2.[to]
        FROM #temptemp AS t1
        JOIN #temptemp AS t2
        ON t1.[from]<t2.[from];
    TRUNCATE TABLE #temptemp;                           --empty temporary storage table
END;

TRUNCATE TABLE #mergeTest;                              --insert single records and merged records into original table
INSERT INTO #mergeTest
    SELECT * FROM #tempResult;
INSERT INTO #mergeTest
    SELECT * FROM #tempSingle;

SELECT * FROM #mergeTest
    ORDER BY [id],[from];

0

連続していても別々にしておく必要がある、連続していない日付範囲がある場合のために、私はこの解決策を思いつきました:

SQL Fiddleを参照してください

WITH lag_info AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    lag([To], 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevTo,
    lag(Data, 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevData
  FROM dat
),
segmented AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    -- new interval if non-contigous or data changed
    -- if it's null, it means that it's the first entry for the ID, which means it's a new interval
    CASE
      WHEN [PrevTo] IS NULL
        OR PrevData IS NULL
        OR DATEDIFF(DAY, [PrevTo], [From]) > 1
        OR Data <> PrevData
      THEN 1
      ELSE 0
    END AS is_new_interval
  FROM lag_info
),
segmented_marked AS (
  SELECT
    ID,
    [From],
    [To],
    Data,
    -- increment only when new data is detected, using a running sum
    sum(s.is_new_interval)
      OVER (PARTITION BY ID ORDER BY [From] ROWS BETWEEN UNBOUNDED PRECEDING AND 0 FOLLOWING)
                                AS interval_id
  FROM segmented s
)
SELECT
  ID,
  min([From]) AS [From],
  max([To]) AS [To],
  Data
FROM segmented_marked
GROUP BY ID, Data, interval_id

-1

動作するように見えるクエリを書きました。共通テーブル式、MERGEステートメント、および分析関数を使用します。ただし、SQLサーバー2012以降とのみ互換性があります。ここで要点を見つけることができます:MergeRecordsByValidityDate.sql

/*  NOTE: Only works w/ SQL Server 2012+
    Merging identical records with different validity dates.
*/
USE [master]


IF OBJECT_ID('mergeTest') IS NOT NULL
    DROP TABLE mergeTest

CREATE TABLE mergeTest          -- Create table with test data
(
    [id] int NOT NULL,
    [data] char(1) NOT NULL,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO mergeTest ([id],[data],[from],[to]) VALUES      -- Insert records w/ different validity dates
    (1,'a','2015-01-01','2015-01-05'),  --1
    (1,'a','2015-01-05','2015-01-10'),  --2
    (1,'a','2015-01-10','2015-01-14'),  --3
    (1,'b','2015-01-14','2015-01-15'),  --4
    (1,'a','2015-01-15','2015-01-20'),  --5
    (1,'a','2015-01-20','2015-01-25'),  --6
    (1,'a','2015-01-25','2015-01-30'),  --7
    (1,'a','2015-01-30','2015-02-04'),  --8
    (2,'c','2015-01-01','2015-01-05'),  --9
    (2,'c','2015-01-05','2015-01-10')   --10

SELECT * FROM mergeTest

/*  This SELECT function uses a Common Table Expression along with Analytic functions over a partition.
    The data set is partitioned on similar primary key and data columns and ordered by 'from' dates.
    A 'last' and 'next' column is added with 'to' date of prev row and 'from' date of next row.
    For each partition, rows are selected (for each partition) that represent the first and last records 
    of identical data. For e.g. rows 5,6,7,8 are reduced to 5,8.
*/

;WITH partitionedData AS (
    SELECT *,   LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM mergeTest)
SELECT [id],[data],[from],[to],[last],[next] INTO #temp
    FROM partitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temp

/*  Now all redundant 'sandwiched' records have been filtered out, only the extreme records are left.
    This MERGE function matches rows on primary key and data, and If the 'to' date of said record matches
    'from' date of another similar record, then the said record is extended to encapsulate the other record's
    'to' date. For example row 5's 'to' date is extended to equal row 8's 'to' date.
*/

MERGE INTO #temp as m1
    USING #temp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[to]=m2.[from])
    THEN
    UPDATE SET  m1.[to]=m2.[to]
;

SELECT * FROM #temp

/*  The MERGE function has done its job of extending records. However there are still 2 records with
    identical data. For e.g. rows 9,10 exist even though row 9 now has all the required information. This 
    block modifies such redundant rows so their 'last' and 'from' columns become asynchronous.
*/

;WITH repartitionedData AS (
    SELECT [id],[data],[from],[to], LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM #temp)
SELECT [id],[data],[from],[to],[last],[next] INTO #temptemp
    FROM repartitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temptemp

/* Asynchronous rows are deleted
*/

DELETE FROM #temptemp
    WHERE [from]<[last]

SELECT * FROM #temptemp

/*  However, blocks of data with >2 rows (like rows 5 through 8) could not be merged because of the filtered out
    rows (i.e. rows 6,7). Applying MERGE again on the updated data set.
*/

MERGE INTO #temptemp as m1
    USING #temptemp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[from]=m2.[next])
    THEN
    UPDATE SET  m1.[from]=m2.[from],
                m1.[last]=CASE WHEN ((m2.[last] IS NULL) OR (m2.[next] IS NULL)) THEN NULL ELSE m1.[last] END   --if row absorbing from is extreme, then current row is also extreme
;

SELECT * FROM #temptemp

TRUNCATE TABLE mergeTest        -- resetting original table

/* The MERGE corrected all rows with the correct 'from' and 'to' dates. And the only rows we are interested in are
    the extreme rows i.e. with 'last' or 'next' == NULL. SELECTing on that criterion and INSERTing into original table.
*/

INSERT INTO mergeTest           -- inserting processed records into table + some last minute filtering
    SELECT [id],[data],[from],MAX([to])
    FROM #temptemp
        WHERE [next] IS NULL OR [last] IS NULL
    GROUP BY [id],[data],[from]

SELECT * FROM mergeTest

DROP TABLE #temp
DROP TABLE #temptemp
弊社のサイトを使用することにより、あなたは弊社のクッキーポリシーおよびプライバシーポリシーを読み、理解したものとみなされます。
Licensed under cc by-sa 3.0 with attribution required.