Operatoren in Reframe (Lösungen)#

Aufgabe 7*: Operatoren der Relationalen Algebra implementieren#

Im folgenden sollen die Operatoren der relationalen Algebra für Pandas Dataframes implementiert werden.

Gehe hierbei davon aus, dass die Spalten der Dataframes benannt sind und ignoriert den Index der DataFrames.

In diesem ersten Snippet wird eine Funktion definiert, mit der du deine Lösungen später testen kannst. Du musst es dir nicht im Detail ansehen, sondern nur einmal ausführen.

import pandas as pd
from pandas import DataFrame
from typing import *
from io import StringIO
import numpy as np

df = DataFrame(
    [
        (11, 22, 34, 23),
        (11, 22, 31, 11),
        (13, 42, 16, 21),
        (55, 32, 71, 22),
        (66, 33, 19, 27),
        (13, 42, 31, 11),
    ],
    columns=list("abyz"),
)

df2 = DataFrame([(11, 22), (13, 42)], columns=list("ab"))

df3 = DataFrame(
    [
        (11, 22, 34, 23),
        (15, 22, 41, 11),
        (13, 42, 16, 21),
        (55, 32, 71, 22),
        (66, 34, 19, 27),
    ],
    columns=list("abyz"),
)

test = [
    ('select("y > 30 and z <= 20", df)', "a,b,y,z\r\n11,22,31,11\r\n13,42,31,11\r\n"),
    (
        'project(["z", "y"], df)',
        "z,y\r\n23,34\r\n11,31\r\n21,16\r\n22,71\r\n27,19\r\n11,31\r\n",
    ),
    (
        'rename(["a", "b", "c", "d"], df)',
        "a,b,c,d\r\n"
        "11,22,34,23\r\n"
        "11,22,31,11\r\n"
        "13,42,16,21\r\n"
        "55,32,71,22\r\n"
        "66,33,19,27\r\n"
        "13,42,31,11\r\n",
    ),
    (
        "cross(df, df2)",
        "L_a,L_b,y,z,R_a,R_b\r\n"
        "11,22,34,23,11,22\r\n"
        "11,22,34,23,13,42\r\n"
        "11,22,31,11,11,22\r\n"
        "11,22,31,11,13,42\r\n"
        "13,42,16,21,11,22\r\n"
        "13,42,16,21,13,42\r\n"
        "55,32,71,22,11,22\r\n"
        "55,32,71,22,13,42\r\n"
        "66,33,19,27,11,22\r\n"
        "66,33,19,27,13,42\r\n"
        "13,42,31,11,11,22\r\n"
        "13,42,31,11,13,42\r\n",
    ),
    (
        "union(df, df3)",
        "a,b,y,z\r\n"
        "11,22,34,23\r\n"
        "11,22,31,11\r\n"
        "13,42,16,21\r\n"
        "55,32,71,22\r\n"
        "66,33,19,27\r\n"
        "13,42,31,11\r\n"
        "11,22,34,23\r\n"
        "15,22,41,11\r\n"
        "13,42,16,21\r\n"
        "55,32,71,22\r\n"
        "66,34,19,27\r\n",
    ),
    ("minus(df, df3)", "a,b,y,z\r\n11,22,31,11\r\n66,33,19,27\r\n13,42,31,11\r\n"),
    (
        "intersect(df, df3)",
        "a,b,y,z\r\n"
        "11,22,34,23\r\n"
        "11,22,31,11\r\n"
        "13,42,16,21\r\n"
        "55,32,71,22\r\n"
        "66,33,19,27\r\n"
        "13,42,31,11\r\n",
    ),
    (
        "join(df, df2)",
        "a,b,y,z\r\n11,22,34,23\r\n11,22,31,11\r\n13,42,16,21\r\n13,42,31,11\r\n",
    ),
    ('join(df, df2, theta="z == R_b")', "L_a,L_b,y,z,R_a,R_b\r\n55,32,71,22,11,22\r\n"),
    ("divide(df, df2)", "y,z\r\n31,11\r\n"),
]


def validate(i):
    query, result = test[i]
    print(f"test {i}: {query}")
    expected = pd.read_csv(StringIO(result)).reset_index(drop=True)
    actual = eval(query).reset_index(drop=True)
    try:
        pd.testing.assert_frame_equal(expected, actual)
        print("OK")
    except:
        print("expected")
        print(expected)
        print("actual")
        print(actual)
        raise

Aufgabe 7.1*: Basisoperatoren#

In den folgenden Teilaufgaben implementieren Sie die Basisoperatoren der relationalen Algebra.

Aufgabe 7.1.1*: Selektion (\(\sigma\))#

Hinweis: Der Prädikatstring ist ein beliebiger boolescher Ausdruck über die Attribute einer Entity. Du kannst die Funktion eval benutzen um diesen String auszuwerten. Benutze das optionale Argument locals um die Werte der Variablen des Ausdrucks zu übergeben. Zum Beispiel:

eval("a < 10", {"a": 3})
True
def select(predicate: str, relation: DataFrame) -> DataFrame:
    # TODO: Hier code hinzufügen.
    return relation

Hide code cell content

#### Musterlösung
def select(predicate: str, relation: DataFrame) -> DataFrame:
    def check_row(row):
        attributes = dict(zip(relation.columns, row))
        return eval(predicate, attributes)

    return relation[relation.apply(check_row, axis=1)]
validate(0)
test 0: select("y > 30 and z <= 20", df)
OK

Aufgabe 7.1.2*: Projektion (\(\pi\))#

def project(attributes: List[str], relation: DataFrame) -> DataFrame:
    # TODO: Hier code hinzufügen.
    return relation

Hide code cell content

#### Musterlösung
def project(attributes: List[str], relation: DataFrame) -> DataFrame:
    return relation[attributes]
validate(1)
test 1: project(["z", "y"], df)
OK

Aufgabe 7.1.3*: rename (umbenennen) (\(\rho\))#

def rename(new_attribute_names: List[str], relation: DataFrame) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return relation

Hide code cell content

#### Musterlösung
def rename(new_attribute_names: List[str], relation: DataFrame) -> DataFrame:
    return relation.rename(columns=dict(zip(relation.columns, new_attribute_names)))
validate(2)
test 2: rename(["a", "b", "c", "d"], df)
OK

Aufgabe 7.1.4*: Kreuzprodukt (\(\times\))#

def cross(left: DataFrame, right: DataFrame) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return relation

Hide code cell content

#### Musterlösung
def cross(left: DataFrame, right: DataFrame) -> DataFrame:
    records = [
        (*l, *r) for l in left.itertuples(index=False) for r in right.itertuples(index=False)
    ]
    conflicts = set(left.columns) & set(right.columns)
    l = [f"L_{x}" if x in conflicts else x for x in left.columns]
    r = [f"R_{x}" if x in conflicts else x for x in right.columns]
    return DataFrame.from_records(records, columns=l + r)
validate(3)
test 3: cross(df, df2)
OK

Aufgabe 7.1.5*: Vereinigung ( \(\cup\))#

def union(left: DataFrame, right: DataFrame) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return relation

Hide code cell content

#### Musterlösung
def union(left: DataFrame, right: DataFrame) -> DataFrame:
    return pd.concat([left, right])
validate(4)
test 4: union(df, df3)
OK

Aufgabe 7.1.6*: Differenz (\(-\))#

def minus(left: DataFrame, right: DataFrame) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return relation

Hide code cell content

#### Musterlösung
def minus(left: DataFrame, right: DataFrame) -> DataFrame:
    r = set(map(tuple, right.values))
    return DataFrame.from_records(
        [l for l in left.values if tuple(l) not in r], columns=left.columns
    )
validate(5)
test 5: minus(df, df3)
OK

Aufgabe 7.2*: Abgeleitete Operatoren#

Hinweis: Die abgeleiteten Operatoren heißen so, weil man sie von den Basisoperatoren ableiten kann.

Aufgabe 7.2.1*: Schnitt (\(\cap\))#

def intersect(left: DataFrame, right: DataFrame) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return left

Hide code cell content

#### Musterlösung
def intersect(left: DataFrame, right: DataFrame) -> DataFrame:
    return minus(left, minus(right, left))
validate(6)
test 6: intersect(df, df3)
OK

Aufgabe 7.2.2*: Natural Join (\(\bowtie\)), Theta Join ( \(\bowtie_{\theta}\))#

Wenn das optionale Argument theta leer ist, soll ein natural join durchgeführt werden.

def join(left: DataFrame, right: DataFrame, theta: str = "", outer: bool = False) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return left

Hide code cell content

#### Musterlösung
def join(left: DataFrame, right: DataFrame, theta: str = "", outer: bool = False) -> DataFrame:
    import itertools as it

    natural = False
    if not theta:
        natural = True
        theta = " and ".join(f"L_{key} == R_{key}" for key in left.columns if key in right.columns)
    x = select(theta, cross(left, right))
    if natural:
        to_project = []
        to_rename = []
        for i, r, o in zip(it.count(), x.columns, it.chain(left.columns, right.columns)):
            from_left = i < len(left.columns)
            if from_left or o not in left.columns:
                to_project.append(r)
                to_rename.append(o)
        x = rename(to_rename, project(to_project, x))
    return x
validate(7)
validate(8)
test 7: join(df, df2)
OK
test 8: join(df, df2, theta="z == R_b")
OK

Aufgabe 7.2.3*: Division (\(/\))#

def divide(left: DataFrame, right: DataFrame) -> DataFrame:
    # TODO: Hier Code hinzufügen.
    return left

Hide code cell content

#### Musterlösung
def divide(left: DataFrame, right: DataFrame) -> DataFrame:
    r = set(right.columns)
    X = [x for x in left.columns if x not in r]
    potential = project(X, left)
    complete = project(left.columns, cross(potential, right))
    missing = minus(complete, left)
    faulty = project(X, missing)
    return minus(potential, faulty).drop_duplicates()
validate(9)
test 9: divide(df, df2)
OK

Bei Bedarf: Alle Tests ausführen#

# Alle Lösungen mittels implementierter Funktionen generieren:
x = []
divide(df, df2)
for q, _ in test:
    print(q)
    x.append((q, eval(q).to_csv(index=False)))

from pprint import pprint

pprint(x)

# Alle Tests ausführen:
for i in range(10):
    validate(i)
select("y > 30 and z <= 20", df)
project(["z", "y"], df)
rename(["a", "b", "c", "d"], df)
cross(df, df2)
union(df, df3)
minus(df, df3)
intersect(df, df3)
join(df, df2)
join(df, df2, theta="z == R_b")
divide(df, df2)
[('select("y > 30 and z <= 20", df)', 'a,b,y,z\n11,22,31,11\n13,42,31,11\n'),
 ('project(["z", "y"], df)', 'z,y\n23,34\n11,31\n21,16\n22,71\n27,19\n11,31\n'),
 ('rename(["a", "b", "c", "d"], df)',
  'a,b,c,d\n'
  '11,22,34,23\n'
  '11,22,31,11\n'
  '13,42,16,21\n'
  '55,32,71,22\n'
  '66,33,19,27\n'
  '13,42,31,11\n'),
 ('cross(df, df2)',
  'L_a,L_b,y,z,R_a,R_b\n'
  '11,22,34,23,11,22\n'
  '11,22,34,23,13,42\n'
  '11,22,31,11,11,22\n'
  '11,22,31,11,13,42\n'
  '13,42,16,21,11,22\n'
  '13,42,16,21,13,42\n'
  '55,32,71,22,11,22\n'
  '55,32,71,22,13,42\n'
  '66,33,19,27,11,22\n'
  '66,33,19,27,13,42\n'
  '13,42,31,11,11,22\n'
  '13,42,31,11,13,42\n'),
 ('union(df, df3)',
  'a,b,y,z\n'
  '11,22,34,23\n'
  '11,22,31,11\n'
  '13,42,16,21\n'
  '55,32,71,22\n'
  '66,33,19,27\n'
  '13,42,31,11\n'
  '11,22,34,23\n'
  '15,22,41,11\n'
  '13,42,16,21\n'
  '55,32,71,22\n'
  '66,34,19,27\n'),
 ('minus(df, df3)', 'a,b,y,z\n11,22,31,11\n66,33,19,27\n13,42,31,11\n'),
 ('intersect(df, df3)',
  'a,b,y,z\n'
  '11,22,34,23\n'
  '11,22,31,11\n'
  '13,42,16,21\n'
  '55,32,71,22\n'
  '66,33,19,27\n'
  '13,42,31,11\n'),
 ('join(df, df2)',
  'a,b,y,z\n11,22,34,23\n11,22,31,11\n13,42,16,21\n13,42,31,11\n'),
 ('join(df, df2, theta="z == R_b")',
  'L_a,L_b,y,z,R_a,R_b\n55,32,71,22,11,22\n'),
 ('divide(df, df2)', 'y,z\n31,11\n')]
test 0: select("y > 30 and z <= 20", df)
OK
test 1: project(["z", "y"], df)
OK
test 2: rename(["a", "b", "c", "d"], df)
OK
test 3: cross(df, df2)
OK
test 4: union(df, df3)
OK
test 5: minus(df, df3)
OK
test 6: intersect(df, df3)
OK
test 7: join(df, df2)
OK
test 8: join(df, df2, theta="z == R_b")
OK
test 9: divide(df, df2)
OK